diff --git a/ISSUE_TEMPLATE.md b/ISSUE_TEMPLATE.md
index 1a401997c649518766acb2ebb0dea1c128bd0ba4..2f3df7cda9cec29ed0c2266629022f0a22b37df9 100644
--- a/ISSUE_TEMPLATE.md
+++ b/ISSUE_TEMPLATE.md
@@ -4,7 +4,7 @@ https://stackoverflow.com/questions/tagged/tensorflow
 
 If you open a GitHub issue, here is our policy:
 
-1. It must be a bug or a feature request.
+1. It must be a bug, a feature request, or a significant problem with documentation (for small docs fixes please send a PR instead).
 2. The form below must be filled out.
 3. It shouldn't be a TensorBoard issue. Those go [here](https://github.com/tensorflow/tensorboard/issues).
 
diff --git a/README.md b/README.md
index 0c93813e584d4e41fe80d50e047069b2dad8311a..c754c3f0db088be8b638c8bc508e1dd765d960d8 100644
--- a/README.md
+++ b/README.md
@@ -6,7 +6,7 @@
 
 | **`Linux CPU`** | **`Linux GPU`** | **`Mac OS CPU`** | **`Windows CPU`** | **`Android`** |
 |-----------------|---------------------|------------------|-------------------|---------------|
-| [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-cpu)](https://ci.tensorflow.org/job/tensorflow-master-cpu) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-linux-gpu)](https://ci.tensorflow.org/job/tensorflow-master-linux-gpu) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-mac)](https://ci.tensorflow.org/job/tensorflow-master-mac) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-win-cmake-py)](https://ci.tensorflow.org/job/tensorflow-master-win-cmake-py) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-android)](https://ci.tensorflow.org/job/tensorflow-master-android) |
+| [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-cpu)](https://ci.tensorflow.org/job/tensorflow-master-cpu) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-linux-gpu)](https://ci.tensorflow.org/job/tensorflow-master-linux-gpu) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-mac)](https://ci.tensorflow.org/job/tensorflow-master-mac) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-win-cmake-py)](https://ci.tensorflow.org/job/tensorflow-master-win-cmake-py) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-android)](https://ci.tensorflow.org/job/tensorflow-master-android) [ ![Download](https://api.bintray.com/packages/google/tensorflow/tensorflow/images/download.svg) ](https://bintray.com/google/tensorflow/tensorflow/_latestVersion) |
 
 **TensorFlow** is an open source software library for numerical computation using
 data flow graphs.  The graph nodes represent mathematical operations, while
diff --git a/RELEASE.md b/RELEASE.md
index fdf10407fda21444f1d0ee6cf20650d2659b146f..af6440acef51307dc98cac5b908221411307dda6 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -1,18 +1,39 @@
 # Release 1.5.0
 
 ## Breaking Changes
-* Prebuilt binaries are now built against CUDA 9 and cuDNN 7.
+* Prebuilt binaries are now built against CUDA 9.0 and cuDNN 7.
 * Our Linux binaries are built using ubuntu 16 containers, potentially
   introducing glibc incompatibility issues with ubuntu 14.
 * Starting from 1.6 release, our prebuilt binaries will use AVX instructions.
   This may break TF on older CPUs.
 
+## Known Bugs
+* Using XLA:GPU with CUDA 9 and CUDA 9.1 results in garbage results and/or
+  `CUDA_ILLEGAL_ADDRESS` failures.
+
+  Google discovered in mid-December 2017 that the PTX-to-SASS compiler in CUDA 9
+  and CUDA 9.1 sometimes does not properly compute the carry bit when
+  decomposing 64-bit address calculations with large offsets (e.g. `load [x +
+  large_constant]`) into 32-bit arithmetic in SASS.
+
+  As a result, these versions of `ptxas` miscompile most XLA programs which use
+  more than 4GB of temp memory.  This results in garbage results and/or
+  `CUDA_ERROR_ILLEGAL_ADDRESS` failures.
+
+  A fix in CUDA 9.1.121 is expected in late February 2018.  We do not expect a
+  fix for CUDA 9.0.x.  Until the fix is available, the only workaround is to
+  [downgrade](https://developer.nvidia.com/cuda-toolkit-archive) to CUDA 8.0.x
+  or disable XLA:GPU.
+
+  TensorFlow will print a warning if you use XLA:GPU with a known-bad version of
+  CUDA; see e00ba24c4038e7644da417ddc639169b6ea59122.
+
 ## Major Features And Improvements
 * [Eager execution](https://github.com/tensorflow/tensorflow/tree/r1.5/tensorflow/contrib/eager)
   preview version is now available.
 * [TensorFlow Lite](https://github.com/tensorflow/tensorflow/tree/r1.5/tensorflow/contrib/lite)
   dev preview is now available.
-* CUDA 9 and cuDNN 7 support.
+* CUDA 9.0 and cuDNN 7 support.
 * Accelerated Linear Algebra (XLA):
   * Add `complex64` support to XLA compiler.
   * `bfloat` support is now added to XLA infrastructure.
diff --git a/configure.py b/configure.py
index cf16ef483763733cc12c838ea92b144c6493f0b1..27519b4aba013e53f688513b798de4238e545022 100644
--- a/configure.py
+++ b/configure.py
@@ -43,6 +43,7 @@ _DEFAULT_CUDA_PATH = '/usr/local/cuda'
 _DEFAULT_CUDA_PATH_LINUX = '/opt/cuda'
 _DEFAULT_CUDA_PATH_WIN = ('C:/Program Files/NVIDIA GPU Computing '
                           'Toolkit/CUDA/v%s' % _DEFAULT_CUDA_VERSION)
+_DEFAULT_TENSORRT_PATH_LINUX = '/usr/lib/x86_64-linux-gnu'
 _TF_OPENCL_VERSION = '1.2'
 _DEFAULT_COMPUTECPP_TOOLKIT_PATH = '/usr/local/computecpp'
 _DEFAULT_TRISYCL_INCLUDE_DIR = '/usr/local/triSYCL/include'
@@ -297,7 +298,7 @@ def get_var(environ_cp,
       System".
     enabled_by_default: boolean for default behavior.
     question: optional string for how to ask for user input.
-    yes_reply: optionanl string for reply when feature is enabled.
+    yes_reply: optional string for reply when feature is enabled.
     no_reply: optional string for reply when feature is disabled.
 
   Returns:
@@ -410,7 +411,7 @@ def set_action_env_var(environ_cp,
       System".
     enabled_by_default: boolean for default behavior.
     question: optional string for how to ask for user input.
-    yes_reply: optionanl string for reply when feature is enabled.
+    yes_reply: optional string for reply when feature is enabled.
     no_reply: optional string for reply when feature is disabled.
   """
   var = int(
@@ -959,6 +960,119 @@ def set_tf_cudnn_version(environ_cp):
   write_action_env_to_bazelrc('TF_CUDNN_VERSION', tf_cudnn_version)
 
 
+def set_tf_tensorrt_install_path(environ_cp):
+  """Set TENSORRT_INSTALL_PATH and TF_TENSORRT_VERSION.
+
+  Adapted from code contributed by Sami Kama (https://github.com/samikama).
+
+  Args:
+    environ_cp: copy of the os.environ.
+
+  Raises:
+    ValueError: if this method was called under non-Linux platform.
+    UserInputError: if user has provided invalid input multiple times.
+  """
+  if not is_linux():
+    raise ValueError('Currently TensorRT is only supported on Linux platform.')
+
+  # Ask user whether to add TensorRT support.
+  if str(int(get_var(
+      environ_cp, 'TF_NEED_TENSORRT', 'TensorRT', False))) != '1':
+    return
+
+  for _ in range(_DEFAULT_PROMPT_ASK_ATTEMPTS):
+    ask_tensorrt_path = (r'Please specify the location where TensorRT is '
+                         'installed. [Default is %s]:') % (
+                             _DEFAULT_TENSORRT_PATH_LINUX)
+    trt_install_path = get_from_env_or_user_or_default(
+        environ_cp, 'TENSORRT_INSTALL_PATH', ask_tensorrt_path,
+        _DEFAULT_TENSORRT_PATH_LINUX)
+
+    # Result returned from "read" will be used unexpanded. That make "~"
+    # unusable. Going through one more level of expansion to handle that.
+    trt_install_path = os.path.realpath(
+        os.path.expanduser(trt_install_path))
+
+    def find_libs(search_path):
+      """Search for libnvinfer.so in "search_path"."""
+      fl = set()
+      if os.path.exists(search_path) and os.path.isdir(search_path):
+        fl.update([os.path.realpath(os.path.join(search_path, x))
+                   for x in os.listdir(search_path) if 'libnvinfer.so' in x])
+      return fl
+
+    possible_files = find_libs(trt_install_path)
+    possible_files.update(find_libs(os.path.join(trt_install_path, 'lib')))
+    possible_files.update(find_libs(os.path.join(trt_install_path, 'lib64')))
+
+    def is_compatible(tensorrt_lib, cuda_ver, cudnn_ver):
+      """Check the compatibility between tensorrt and cudnn/cudart libraries."""
+      ldd_bin = which('ldd') or '/usr/bin/ldd'
+      ldd_out = run_shell([ldd_bin, tensorrt_lib]).split(os.linesep)
+      cudnn_pattern = re.compile('.*libcudnn.so\\.?(.*) =>.*$')
+      cuda_pattern = re.compile('.*libcudart.so\\.?(.*) =>.*$')
+      cudnn = None
+      cudart = None
+      for line in ldd_out:
+        if 'libcudnn.so' in line:
+          cudnn = cudnn_pattern.search(line)
+        elif 'libcudart.so' in line:
+          cudart = cuda_pattern.search(line)
+      if cudnn and len(cudnn.group(1)):
+        cudnn = convert_version_to_int(cudnn.group(1))
+      if cudart and len(cudart.group(1)):
+        cudart = convert_version_to_int(cudart.group(1))
+      return (cudnn == cudnn_ver) and (cudart == cuda_ver)
+
+    cuda_ver = convert_version_to_int(environ_cp['TF_CUDA_VERSION'])
+    cudnn_ver = convert_version_to_int(environ_cp['TF_CUDNN_VERSION'])
+    nvinfer_pattern = re.compile('.*libnvinfer.so.?(.*)$')
+    highest_ver = [0, None, None]
+
+    for lib_file in possible_files:
+      if is_compatible(lib_file, cuda_ver, cudnn_ver):
+        ver_str = nvinfer_pattern.search(lib_file).group(1)
+        ver = convert_version_to_int(ver_str) if len(ver_str) else 0
+        if ver > highest_ver[0]:
+          highest_ver = [ver, ver_str, lib_file]
+    if highest_ver[1] is not None:
+      trt_install_path = os.path.dirname(highest_ver[2])
+      tf_tensorrt_version = highest_ver[1]
+      break
+
+    # Try another alternative from ldconfig.
+    ldconfig_bin = which('ldconfig') or '/sbin/ldconfig'
+    ldconfig_output = run_shell([ldconfig_bin, '-p'])
+    search_result = re.search(
+        '.*libnvinfer.so\\.?([0-9.]*).* => (.*)', ldconfig_output)
+    if search_result:
+      libnvinfer_path_from_ldconfig = search_result.group(2)
+      if os.path.exists(libnvinfer_path_from_ldconfig):
+        if is_compatible(libnvinfer_path_from_ldconfig, cuda_ver, cudnn_ver):
+          trt_install_path = os.path.dirname(libnvinfer_path_from_ldconfig)
+          tf_tensorrt_version = search_result.group(1)
+          break
+
+    # Reset and Retry
+    print('Invalid path to TensorRT. None of the following files can be found:')
+    print(trt_install_path)
+    print(os.path.join(trt_install_path, 'lib'))
+    print(os.path.join(trt_install_path, 'lib64'))
+    if search_result:
+      print(libnvinfer_path_from_ldconfig)
+
+  else:
+    raise UserInputError('Invalid TF_TENSORRT setting was provided %d '
+                         'times in a row. Assuming to be a scripting mistake.' %
+                         _DEFAULT_PROMPT_ASK_ATTEMPTS)
+
+  # Set TENSORRT_INSTALL_PATH and TF_TENSORRT_VERSION
+  environ_cp['TENSORRT_INSTALL_PATH'] = trt_install_path
+  write_action_env_to_bazelrc('TENSORRT_INSTALL_PATH', trt_install_path)
+  environ_cp['TF_TENSORRT_VERSION'] = tf_tensorrt_version
+  write_action_env_to_bazelrc('TF_TENSORRT_VERSION', tf_tensorrt_version)
+
+
 def get_native_cuda_compute_capabilities(environ_cp):
   """Get native cuda compute capabilities.
 
@@ -1240,13 +1354,16 @@ def main():
     environ_cp['TF_NEED_GCP'] = '0'
     environ_cp['TF_NEED_HDFS'] = '0'
     environ_cp['TF_NEED_JEMALLOC'] = '0'
+    environ_cp['TF_NEED_KAFKA'] = '0'
     environ_cp['TF_NEED_OPENCL_SYCL'] = '0'
     environ_cp['TF_NEED_COMPUTECPP'] = '0'
     environ_cp['TF_NEED_OPENCL'] = '0'
     environ_cp['TF_CUDA_CLANG'] = '0'
+    environ_cp['TF_NEED_TENSORRT'] = '0'
 
   if is_macos():
     environ_cp['TF_NEED_JEMALLOC'] = '0'
+    environ_cp['TF_NEED_TENSORRT'] = '0'
 
   set_build_var(environ_cp, 'TF_NEED_JEMALLOC', 'jemalloc as malloc',
                 'with_jemalloc', True)
@@ -1256,6 +1373,8 @@ def main():
                 'with_hdfs_support', True, 'hdfs')
   set_build_var(environ_cp, 'TF_NEED_S3', 'Amazon S3 File System',
                 'with_s3_support', True, 's3')
+  set_build_var(environ_cp, 'TF_NEED_KAFKA', 'Apache Kafka Platform',
+                'with_kafka_support', False, 'kafka')
   set_build_var(environ_cp, 'TF_ENABLE_XLA', 'XLA JIT', 'with_xla_support',
                 False, 'xla')
   set_build_var(environ_cp, 'TF_NEED_GDR', 'GDR', 'with_gdr_support',
@@ -1278,6 +1397,8 @@ def main():
       'TF_CUDA_CONFIG_REPO' not in environ_cp):
     set_tf_cuda_version(environ_cp)
     set_tf_cudnn_version(environ_cp)
+    if is_linux():
+      set_tf_tensorrt_install_path(environ_cp)
     set_tf_cuda_compute_capabilities(environ_cp)
 
     set_tf_cuda_clang(environ_cp)
@@ -1332,6 +1453,7 @@ def main():
         'more details.')
   config_info_line('mkl', 'Build with MKL support.')
   config_info_line('monolithic', 'Config for mostly static monolithic build.')
+  config_info_line('tensorrt', 'Build with TensorRT support.')
 
 if __name__ == '__main__':
   main()
diff --git a/tensorflow/BUILD b/tensorflow/BUILD
index 660499858e7a2b7dbf57d88ddec263e63937df75..c225cc1a74ca34b818a9e9bed878c9a0d5b22cc0 100644
--- a/tensorflow/BUILD
+++ b/tensorflow/BUILD
@@ -211,6 +211,12 @@ config_setting(
     visibility = ["//visibility:public"],
 )
 
+config_setting(
+    name = "with_kafka_support",
+    define_values = {"with_kafka_support": "true"},
+    visibility = ["//visibility:public"],
+)
+
 # Crosses between platforms and file system libraries not supported on those
 # platforms due to limitations in nested select() statements.
 config_setting(
@@ -370,6 +376,14 @@ config_setting(
     visibility = ["//visibility:public"],
 )
 
+# TODO(laigd): consider removing this option and make TensorRT enabled
+# automatically when CUDA is enabled.
+config_setting(
+    name = "with_tensorrt_support",
+    values = {"define": "with_tensorrt_support=true"},
+    visibility = ["//visibility:public"],
+)
+
 package_group(
     name = "internal",
     packages = [
@@ -535,9 +549,11 @@ filegroup(
         "//tensorflow/contrib/periodic_resample:all_files",
         "//tensorflow/contrib/predictor:all_files",
         "//tensorflow/contrib/py2tf:all_files",
-        "//tensorflow/contrib/py2tf/convert:all_files",
+        "//tensorflow/contrib/py2tf/converters:all_files",
+        "//tensorflow/contrib/py2tf/impl:all_files",
         "//tensorflow/contrib/py2tf/pyct:all_files",
         "//tensorflow/contrib/py2tf/pyct/static_analysis:all_files",
+        "//tensorflow/contrib/py2tf/utils:all_files",
         "//tensorflow/contrib/quantize:all_files",
         "//tensorflow/contrib/receptive_field:all_files",
         "//tensorflow/contrib/reduce_slice_ops:all_files",
@@ -566,6 +582,7 @@ filegroup(
         "//tensorflow/contrib/tensor_forest/proto:all_files",
         "//tensorflow/contrib/tensorboard:all_files",
         "//tensorflow/contrib/tensorboard/db:all_files",
+        "//tensorflow/contrib/tensorrt:all_files",
         "//tensorflow/contrib/testing:all_files",
         "//tensorflow/contrib/text:all_files",
         "//tensorflow/contrib/tfprof:all_files",
diff --git a/tensorflow/SECURITY.md b/tensorflow/SECURITY.md
new file mode 100644
index 0000000000000000000000000000000000000000..074eed2951526d53ab62515b7b869569a9708299
--- /dev/null
+++ b/tensorflow/SECURITY.md
@@ -0,0 +1,239 @@
+# Using TensorFlow Securely
+
+This document discusses how to safely deal with untrusted programs (models or
+model parameters), and input data. Below, we also provide guidelines on how to
+report vulnerabilities in TensorFlow.
+
+## TensorFlow models are programs
+
+TensorFlow's runtime system interprets and executes programs. What machine 
+learning practitioners term
+[**models**](https://developers.google.com/machine-learning/glossary/#model) are
+expressed as programs that TensorFlow executes.  TensorFlow programs are encoded
+as computation
+[**graphs**](https://developers.google.com/machine-learning/glossary/#graph).
+The model's parameters are often stored separately in **checkpoints**.
+
+At runtime, TensorFlow executes the computation graph using the parameters
+provided. Note that the behavior of the computation graph may change
+depending on the parameters provided. TensorFlow itself is not a sandbox. When
+executing the computation graph, TensorFlow may read and write files, send and
+receive data over the network, and even spawn additional processes. All these
+tasks are performed with the permissions of the TensorFlow process. Allowing
+for this flexibility makes for a powerful machine learning platform,
+but it has implications for security.
+
+The computation graph may also accept **inputs**. Those inputs are the
+data you supply to TensorFlow to train a model, or to use a model to run
+inference on the data.
+
+**TensorFlow models are programs, and need to be treated as such from a security
+perspective.** 
+
+## Running untrusted models
+
+As a general rule: **Always** execute untrusted models inside a sandbox (e.g.,
+[nsjail](https://github.com/google/nsjail)). 
+
+There are several ways in which a model could become untrusted. Obviously, if an
+untrusted party supplies TensorFlow kernels, arbitrary code may be executed.
+The same is true if the untrusted party provides Python code, such as the
+Python code that generates TensorFlow graphs.
+
+Even if the untrusted party only supplies the serialized computation
+graph (in form of a `GraphDef`, `SavedModel`, or equivalent on-disk format), the
+set of computation primitives available to TensorFlow is powerful enough that
+you should assume that the TensorFlow process effectively executes arbitrary
+code. One common solution is to whitelist only a few safe Ops. While this is
+possible in theory, we still recommend you sandbox the execution.
+
+It depends on the computation graph whether a user provided checkpoint is safe.
+It is easily possible to create computation graphs in which malicious
+checkpoints can trigger unsafe behavior. For example, consider a graph that
+contains a `tf.cond` depending on the value of a `tf.Variable`. One branch of
+the `tf.cond` is harmless, but the other is unsafe. Since the `tf.Variable` is
+stored in the checkpoint, whoever provides the checkpoint now has the ability to
+trigger unsafe behavior, even though the graph is not under their control.
+
+In other words, graphs can contain vulnerabilities of their own. To allow users
+to provide checkpoints to a model you run on their behalf (e.g., in order to
+compare model quality for a fixed model architecture), you must carefully audit
+your model, and we recommend you run the TensorFlow process in a sandbox.
+
+## Accepting untrusted Inputs
+
+It is possible to write models that are secure in a sense that they can safely
+process untrusted inputs assuming there are no bugs. There are two main reasons
+to not rely on this: first, it is easy to write models which must not be exposed
+to untrusted inputs, and second, there are bugs in any software system of
+sufficient complexity. Letting users control inputs could allow them to trigger
+bugs either in TensorFlow or in dependent libraries.
+
+In general, it is good practice to isolate parts of any system which is exposed
+to untrusted (e.g., user-provided) inputs in a sandbox.
+
+A useful analogy to how any TensorFlow graph is executed is any interpreted
+programming language, such as Python. While it is possible to write secure
+Python code which can be exposed to user supplied inputs (by, e.g., carefully
+quoting and sanitizing input strings, size-checking input blobs, etc.), it is
+very easy to write Python programs which are insecure. Even secure Python code
+could be rendered insecure by a bug in the Python interpreter, or in a bug in a
+Python library used (e.g.,
+[this one](https://www.cvedetails.com/cve/CVE-2017-12852/)).
+
+## Running a TensorFlow server
+
+TensorFlow is a platform for distributed computing, and as such there is a
+TensorFlow server (`tf.train.Server`). **The TensorFlow server is meant for
+internal communication only. It is not built for use in an untrusted network.**
+
+For performance reasons, the default TensorFlow server does not include any
+authorization protocol and sends messages unencrypted. It accepts connections
+from anywhere, and executes the graphs it is sent without performing any checks.
+Therefore, if you run a `tf.train.Server` in your network, anybody with
+access to the network can execute what you should consider arbitrary code with
+the privileges of the process running the `tf.train.Server`.
+
+When running distributed TensorFlow, you must isolate the network in which the
+cluster lives. Cloud providers provide instructions for setting up isolated
+networks, which are sometimes branded as "virtual private cloud." Refer to the
+instructions for
+[GCP](https://cloud.google.com/compute/docs/networks-and-firewalls) and
+[AWS](https://aws.amazon.com/vpc/)) for details.
+
+Note that `tf.train.Server` is different from the server created by
+`tensorflow/serving` (the default binary for which is called `ModelServer`).
+By default, `ModelServer` also has no built-in mechanism for authentication.
+Connecting it to an untrusted network allows anyone on this network to run the
+graphs known to the `ModelServer`. This means that an attacker may run
+graphs using untrusted inputs as described above, but they would not be able to
+execute arbitrary graphs. It is possible to safely expose a `ModelServer`
+directly to an untrusted network, **but only if the graphs it is configured to
+use have been carefully audited to be safe**. 
+
+Similar to best practices for other servers, we recommend running any
+`ModelServer` with appropriate privileges (i.e., using a separate user with
+reduced permisisons). In the spirit of defense in depth, we recommend
+authenticating requests to any TensorFlow server connected to an untrusted
+network, as well as sandboxing the server to minimize the adverse effects of
+any breach.
+
+## Vulnerabilities in TensorFlow
+
+TensorFlow is a large and complex system. It also depends on a large set of
+third party libraries (e.g., `numpy`, `libjpeg-turbo`, PNG parsers, `protobuf`).
+It is possible that TensorFlow or its dependent libraries contain
+vulnerabilities that would allow triggering unexpected or dangerous behavior
+with specially crafted inputs.
+
+### What is a vulnerability?
+
+Given TensorFlow's flexibility, it is possible to specify computation graphs
+which exhibit unexpected or unwanted behaviors. The fact that TensorFlow models
+can perform arbitrary computations means that they may read and write files,
+communicate via the network, produce deadlocks and infinite loops, or run out
+of memory. It is only when these behaviors are outside the specifications of the
+operations involved that such behavior is a vulnerability. 
+
+A `FileWriter` writing a file is not unexpected behavior and therefore is not a
+vulnerability in TensorFlow. A `MatMul` allowing arbitrary binary code execution
+**is** a vulnerability.
+
+This is more subtle from a system perspective. For example, it is easy to cause
+a TensorFlow process to try to allocate more memory than available by specifying
+a computation graph containing an ill-considered `tf.tile` operation. TensorFlow
+should exit cleanly in this case (it would raise an exception in Python, or
+return an error `Status` in C++). However, if the surrounding system is not
+expecting the possibility, such behavior could be used in a denial of service
+attack (or worse). Because TensorFlow behaves correctly, this is not a
+vulnerability in TensorFlow (although it would be a vulnerability of this
+hypothetical system).
+
+As a general rule, it is incorrect behavior for Tensorflow to access memory it
+does not own, or to terminate in an unclean way. Bugs in TensorFlow that lead to
+such behaviors constitute a vulnerability.
+
+One of the most critical parts of any system is input handling. If malicious
+input can trigger side effects or incorrect behavior, this is a bug, and likely
+a vulnerability.
+
+### Reporting vulnerabilities
+
+Please email reports about any security related issues you find to
+`security@tensorflow.org`. This mail is delivered to a small security team. Your
+email will be acknowledged within one business day, and you'll receive a more
+detailed response to your email within 7 days indicating the next steps in
+handling your report. For critical problems, you may encrypt your report (see
+below).
+
+Please use a descriptive subject line for your report email. After the initial
+reply to your report, the security team will endeavor to keep you informed of
+the progress being made towards a fix and announcement. 
+
+If you believe that an existing (public) issue is security-related, please send
+an email to `security@tensorflow.org`. The email should include the issue ID and
+a short description of why it should be handled according to this security
+policy.
+
+Once an issue is reported, TensorFlow uses the following disclosure process:
+
+* When a report is received, we confirm the issue and determine its severity.
+* If we know of specific third-party services or software based on TensorFlow
+  that require mitigation before publication, those projects will be notified.
+* An advisory is prepared (but not published) which details the problem and
+  steps for mitigation.
+* Wherever possible, fixes are prepared for the last minor release of the two
+  latest major releases, as well as the master branch. We will attempt to
+  commit these fixes as soon as possible, and as close together as
+  possible.
+* Patch releases are published for all fixed released versions, a
+  notification is sent to discuss@tensorflow.org, and the advisory is published.
+
+Past security advisories are listed below. We credit reporters for identifying
+security issues, although we keep your name confidential if you request it.
+
+#### Encryption key for `security@tensorflow.org`
+
+If your disclosure is extremely sensitive, you may choose to encrypt your
+report using the key below. Please only use this for critical security
+reports.
+
+```
+-----BEGIN PGP PUBLIC KEY BLOCK-----
+
+mQENBFpqdzwBCADTeAHLNEe9Vm77AxhmGP+CdjlY84O6DouOCDSq00zFYdIU/7aI
+LjYwhEmDEvLnRCYeFGdIHVtW9YrVktqYE9HXVQC7nULU6U6cvkQbwHCdrjaDaylP
+aJUXkNrrxibhx9YYdy465CfusAaZ0aM+T9DpcZg98SmsSml/HAiiY4mbg/yNVdPs
+SEp/Ui4zdIBNNs6at2gGZrd4qWhdM0MqGJlehqdeUKRICE/mdedXwsWLM8AfEA0e
+OeTVhZ+EtYCypiF4fVl/NsqJ/zhBJpCx/1FBI1Uf/lu2TE4eOS1FgmIqb2j4T+jY
+e+4C8kGB405PAC0n50YpOrOs6k7fiQDjYmbNABEBAAG0LVRlbnNvckZsb3cgU2Vj
+dXJpdHkgPHNlY3VyaXR5QHRlbnNvcmZsb3cub3JnPokBTgQTAQgAOBYhBEkvXzHm
+gOJBnwP4Wxnef3wVoM2yBQJaanc8AhsDBQsJCAcCBhUKCQgLAgQWAgMBAh4BAheA
+AAoJEBnef3wVoM2yNlkIAICqetv33MD9W6mPAXH3eon+KJoeHQHYOuwWfYkUF6CC
+o+X2dlPqBSqMG3bFuTrrcwjr9w1V8HkNuzzOJvCm1CJVKaxMzPuXhBq5+DeT67+a
+T/wK1L2R1bF0gs7Pp40W3np8iAFEh8sgqtxXvLGJLGDZ1Lnfdprg3HciqaVAiTum
+HBFwszszZZ1wAnKJs5KVteFN7GSSng3qBcj0E0ql2nPGEqCVh+6RG/TU5C8gEsEf
+3DX768M4okmFDKTzLNBm+l08kkBFt+P43rNK8dyC4PXk7yJa93SmS/dlK6DZ16Yw
+2FS1StiZSVqygTW59rM5XNwdhKVXy2mf/RtNSr84gSi5AQ0EWmp3PAEIALInfBLR
+N6fAUGPFj+K3za3PeD0fWDijlC9f4Ety/icwWPkOBdYVBn0atzI21thPRbfuUxfe
+zr76xNNrtRRlbDSAChA1J5T86EflowcQor8dNC6fS+oHFCGeUjfEAm16P6mGTo0p
+osdG2XnnTHOOEFbEUeWOwR/zT0QRaGGknoy2pc4doWcJptqJIdTl1K8xyBieik/b
+nSoClqQdZJa4XA3H9G+F4NmoZGEguC5GGb2P9NHYAJ3MLHBHywZip8g9oojIwda+
+OCLL4UPEZ89cl0EyhXM0nIAmGn3Chdjfu3ebF0SeuToGN8E1goUs3qSE77ZdzIsR
+BzZSDFrgmZH+uP0AEQEAAYkBNgQYAQgAIBYhBEkvXzHmgOJBnwP4Wxnef3wVoM2y
+BQJaanc8AhsMAAoJEBnef3wVoM2yX4wIALcYZbQhSEzCsTl56UHofze6C3QuFQIH
+J4MIKrkTfwiHlCujv7GASGU2Vtis5YEyOoMidUVLlwnebE388MmaJYRm0fhYq6lP
+A3vnOCcczy1tbo846bRdv012zdUA+wY+mOITdOoUjAhYulUR0kiA2UdLSfYzbWwy
+7Obq96Jb/cPRxk8jKUu2rqC/KDrkFDtAtjdIHh6nbbQhFuaRuWntISZgpIJxd8Bt
+Gwi0imUVd9m9wZGuTbDGi6YTNk0GPpX5OMF5hjtM/objzTihSw9UN+65Y/oSQM81
+v//Fw6ZeY+HmRDFdirjD7wXtIuER4vqCryIqR6Xe9X8oJXz9L/Jhslc=
+=CDME
+-----END PGP PUBLIC KEY BLOCK-----
+```
+
+### Known vulnerabilities
+
+| Type | Versions affected | Reported by | Additional Information |
+|------|:-----------------:|---------------------------------------|
+| out of bounds read| <=1.4 | @zhangbo5891001 | [issue report](https://github.com/tensorflow/tensorflow/issues/14959) |
+
diff --git a/tensorflow/c/eager/BUILD b/tensorflow/c/eager/BUILD
index 74190cb135ac6c17bfcc9d8bd2f7c75ac5e8c076..e62310d811462f88af93505393b622d9a87c72d3 100644
--- a/tensorflow/c/eager/BUILD
+++ b/tensorflow/c/eager/BUILD
@@ -46,6 +46,7 @@ tf_cuda_library(
         "//tensorflow/c:c_api",
         "//tensorflow/c:c_api_internal",
         "//tensorflow/core:core_cpu_lib",
+        "//tensorflow/core:framework",
         "//tensorflow/core:framework_internal",
         "//tensorflow/core:framework_lite",
         "//tensorflow/core:lib_internal",
diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc
index a76c8f5ec05fc3199addc67857d7bb2ea0e263c2..fd6cecd77ba59011e252171e8f079d41ea0bd345 100644
--- a/tensorflow/c/eager/c_api.cc
+++ b/tensorflow/c/eager/c_api.cc
@@ -85,15 +85,7 @@ TFE_Context* TFE_NewContext(const TFE_ContextOptions* opts, TF_Status* status) {
     return nullptr;
   }
 
-  TFE_Context* ret = new TFE_Context(session);
-  ret->policy = opts->policy;
-  ret->pflr.reset(new tensorflow::ProcessFunctionLibraryRuntime(
-      ret->session->device_mgr, opts->session_options.options.env,
-      TF_GRAPH_DEF_VERSION, &ret->func_lib_def, {}));
-  ret->rendezvous =
-      new tensorflow::IntraProcessRendezvous(ret->session->device_mgr);
-
-  return ret;
+  return new TFE_Context(*opts, session);
 }
 
 void TFE_DeleteContext(TFE_Context* ctx, TF_Status* status) {
diff --git a/tensorflow/c/eager/c_api_internal.h b/tensorflow/c/eager/c_api_internal.h
index a6f76c732f2a4c2402a27cd69c101d028dbb8fcc..dda68471a82f444cde5e74a38423ae5c97b1d7d0 100644
--- a/tensorflow/c/eager/c_api_internal.h
+++ b/tensorflow/c/eager/c_api_internal.h
@@ -35,6 +35,7 @@ limitations under the License.
 #include "tensorflow/core/lib/gtl/stl_util.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/thread_annotations.h"
+#include "tensorflow/core/public/version.h"
 
 struct TFE_ContextOptions {
   TF_SessionOptions session_options;
@@ -43,9 +44,15 @@ struct TFE_ContextOptions {
 };
 
 struct TFE_Context {
-  explicit TFE_Context(TF_Session* s) : session(s) {}
+  explicit TFE_Context(const TFE_ContextOptions& opts, TF_Session* s)
+      : policy(opts.policy),
+        session(s),
+        rendezvous(new tensorflow::IntraProcessRendezvous(s->device_mgr)),
+        pflr(new tensorflow::ProcessFunctionLibraryRuntime(
+            session->device_mgr, opts.session_options.options.env,
+            TF_GRAPH_DEF_VERSION, &func_lib_def, {})) {}
 
-  TFE_ContextDevicePlacementPolicy policy;
+  const TFE_ContextDevicePlacementPolicy policy;
 
   // Note: we cannot use C++11 thread_local here as there is no concept of a
   // thread-local-object-local variable in C++11.
@@ -54,8 +61,8 @@ struct TFE_Context {
       thread_local_policies GUARDED_BY(policy_map_mu);
 
   // TFE_Context is an extension of TF_Session. And TF_Session needs a TF_Graph.
-  TF_Session* session;
-  tensorflow::Rendezvous* rendezvous;
+  TF_Session* const session;
+  tensorflow::Rendezvous* const rendezvous;
 
   tensorflow::mutex functions_mu;
   tensorflow::FunctionLibraryDefinition func_lib_def GUARDED_BY(functions_mu){
@@ -64,14 +71,14 @@ struct TFE_Context {
   // One FunctionLibraryRuntime per device.
   // func_libs[i] is the FunctionLibraryRuntime corresponding to
   // session->devices[i].
-  std::unique_ptr<tensorflow::ProcessFunctionLibraryRuntime> pflr;
+  const std::unique_ptr<tensorflow::ProcessFunctionLibraryRuntime> pflr;
 
   tensorflow::mutex cache_mu;
   std::unordered_map<tensorflow::Fprint128, tensorflow::KernelAndDevice*,
                      tensorflow::Fprint128Hasher>
       kernel_cache GUARDED_BY(cache_mu);
 
-  tensorflow::FunctionLibraryRuntime* func_lib(tensorflow::Device* d) {
+  tensorflow::FunctionLibraryRuntime* func_lib(tensorflow::Device* d) const {
     return pflr->GetFLR(d->name());
   }
 
diff --git a/tensorflow/c/eager/tape.h b/tensorflow/c/eager/tape.h
index 2b65e38f54090af6731685f78d5f7f914a875e3c..bdb0815d6b68444ec1c89b835d563db20ce4d8a1 100644
--- a/tensorflow/c/eager/tape.h
+++ b/tensorflow/c/eager/tape.h
@@ -18,12 +18,12 @@ limitations under the License.
 // Language-agnostic gradient tape. Does not perform backpropagation, just
 // maintains the data structures required to do so.
 
-#include <unordered_map>
-#include <unordered_set>
 #include <vector>
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/lib/gtl/flatmap.h"
+#include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
@@ -54,11 +54,11 @@ struct OpTapeEntry {
 // Map from tensor_id to internally-defined operation-id of the operation which
 // produced this tensor. A value of -1 means that the tensor was directly
 // watched and not the result of any operation in the tape.
-using TensorTape = std::unordered_map<int64, int64>;
+using TensorTape = gtl::FlatMap<int64, int64>;
 
 // Map from operation-id to tape entry.
 template <typename BackwardFunction>
-using OpTape = std::unordered_map<int64, OpTapeEntry<BackwardFunction>>;
+using OpTape = gtl::FlatMap<int64, OpTapeEntry<BackwardFunction>>;
 
 // Operations the tape needs to perform on tensors to do backpropagation. Named
 // "vspace" because a subset of these are related to a vector space, such as
@@ -159,7 +159,7 @@ class GradientTape {
 
   // Map from tensor id to number of remaining usages (i.e. how many entries in
   // the tape refer to it); to aid in tape garbage collection.
-  std::unordered_map<int64, int64> tensor_usage_;
+  gtl::FlatMap<int64, int64> tensor_usage_;
 
   // If false, all activations are deleted in the first call to ComputeGradient.
   // Else, only when this is destructed.
@@ -286,11 +286,11 @@ struct BackpropInitialState {
 
   // Map from tensor ID to how many references still exist for this tensor in
   // the tape.
-  std::unordered_map<int64, int64> tensor_usage_counts;
+  gtl::FlatMap<int64, int64> tensor_usage_counts;
 
   // Maps from op ID to how many output tensors of this op still need to have
   // their gradients computed.
-  std::unordered_map<int64, int64> op_missing_tensor;
+  gtl::FlatMap<int64, int64> op_missing_tensor;
 };
 
 // If `persistent_tape` is true, op_tape is not changed and none of the
@@ -301,8 +301,8 @@ struct BackpropInitialState {
 template <typename BackwardFunction>
 BackpropInitialState<BackwardFunction> PrepareBackprop(
     gtl::ArraySlice<int64> target, const TensorTape& tensor_tape,
-    OpTape<BackwardFunction>* op_tape,
-    const std::unordered_set<int64>& sources_set, bool persistent_tape) {
+    OpTape<BackwardFunction>* op_tape, const gtl::FlatSet<int64>& sources_set,
+    bool persistent_tape) {
   std::vector<int64> tensor_stack;
   tensor_stack.reserve(target.size());
   for (auto t : target) {
@@ -362,7 +362,7 @@ BackpropInitialState<BackwardFunction> PrepareBackprop(
 template <typename BackwardFunction>
 std::vector<int64> InitialStack(
     const OpTape<BackwardFunction>& op_tape,
-    const std::unordered_map<int64, int64>& op_missing_tensor) {
+    const gtl::FlatMap<int64, int64>& op_missing_tensor) {
   std::vector<int64> result;
   for (auto& op_entry : op_tape) {
     if (op_missing_tensor.find(op_entry.first) == op_missing_tensor.end()) {
@@ -373,13 +373,13 @@ std::vector<int64> InitialStack(
 }
 
 template <typename Gradient, typename BackwardFunction>
-Status InitialGradients(
-    const VSpace<Gradient, BackwardFunction>& vspace,
-    gtl::ArraySlice<int64> target_tensor_ids,
-    gtl::ArraySlice<Gradient*> output_gradients, const TensorTape& tensor_tape,
-    const OpTape<BackwardFunction>& op_tape,
-    const std::unordered_map<int64, int64>& tensor_usage_counts,
-    std::unordered_map<int64, std::vector<Gradient*>>* result) {
+Status InitialGradients(const VSpace<Gradient, BackwardFunction>& vspace,
+                        gtl::ArraySlice<int64> target_tensor_ids,
+                        gtl::ArraySlice<Gradient*> output_gradients,
+                        const TensorTape& tensor_tape,
+                        const OpTape<BackwardFunction>& op_tape,
+                        const gtl::FlatMap<int64, int64>& tensor_usage_counts,
+                        gtl::FlatMap<int64, std::vector<Gradient*>>* result) {
   for (int i = 0; i < target_tensor_ids.size(); ++i) {
     const int64 id = target_tensor_ids[i];
     if (tensor_usage_counts.find(id) != tensor_usage_counts.end()) {
@@ -441,13 +441,13 @@ Status GradientTape<Gradient, BackwardFunction>::ComputeGradient(
     gtl::ArraySlice<int64> source_tensor_ids,
     gtl::ArraySlice<Gradient*> output_gradients,
     std::vector<Gradient*>* result) {
-  std::unordered_set<int64> sources_set(source_tensor_ids.begin(),
-                                        source_tensor_ids.end());
+  gtl::FlatSet<int64> sources_set(source_tensor_ids.begin(),
+                                  source_tensor_ids.end());
   BackpropInitialState<BackwardFunction> state = PrepareBackprop(
       target_tensor_ids, tensor_tape_, &op_tape_, sources_set, persistent_);
   std::vector<int64> op_stack =
       InitialStack(state.op_tape, state.op_missing_tensor);
-  std::unordered_map<int64, std::vector<Gradient*>> gradients;
+  gtl::FlatMap<int64, std::vector<Gradient*>> gradients;
   Status s = InitialGradients(vspace, target_tensor_ids, output_gradients,
                               tensor_tape_, state.op_tape,
                               state.tensor_usage_counts, &gradients);
@@ -463,7 +463,7 @@ Status GradientTape<Gradient, BackwardFunction>::ComputeGradient(
     cleanup();
     return s;
   }
-  std::unordered_map<int64, int64> gradients_size;
+  gtl::FlatMap<int64, int64> gradients_size;
   // TODO(apassos) multiple threads could be dequeuing from op_stack at the same
   // time, for better CPU backprop performance.
   VLOG(1) << "Initial stack:";
@@ -472,11 +472,10 @@ Status GradientTape<Gradient, BackwardFunction>::ComputeGradient(
       VLOG(1) << "  " << t;
     }
   }
-  std::unordered_map<string, std::unordered_set<int>>
-      functions_accept_none_for_indices({
-          {"SoftmaxCrossEntropyWithLogits", {1}},
-          {"FusedBatchNorm", {1, 2, 3, 4}},
-      });
+  gtl::FlatMap<string, gtl::FlatSet<int>> functions_accept_none_for_indices({
+      {"SoftmaxCrossEntropyWithLogits", {1}},
+      {"FusedBatchNorm", {1, 2, 3, 4}},
+  });
   while (!op_stack.empty()) {
     const int64 op = op_stack.back();
     VLOG(1) << "Popped " << op;
diff --git a/tensorflow/cc/BUILD b/tensorflow/cc/BUILD
index c9ade5fb83ff5b80a62bc960d1af1dc55f458c4e..9060c19e9d2cf965c2b9be07be07c42017da45a8 100644
--- a/tensorflow/cc/BUILD
+++ b/tensorflow/cc/BUILD
@@ -433,6 +433,7 @@ tf_gen_op_wrappers_cc(
         "linalg_ops",
         "logging_ops",
         "lookup_ops",
+        "manip_ops",
         "math_ops",
         "nn_ops",
         "no_op",
diff --git a/tensorflow/cc/tools/BUILD b/tensorflow/cc/tools/BUILD
index 0a7c37383f96ca65bf5ae05cf0827c01dc4d799b..97f66e79b8ad9f383b22f56e9385fc6d2080e1f8 100644
--- a/tensorflow/cc/tools/BUILD
+++ b/tensorflow/cc/tools/BUILD
@@ -23,7 +23,6 @@ cc_library(
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
-        "//tensorflow/core:tensorflow",
     ],
 )
 
diff --git a/tensorflow/compiler/aot/tfcompile.bzl b/tensorflow/compiler/aot/tfcompile.bzl
index 2b9c83ba149adf9e089786b91039e256216579c8..58572fea3db5599cc282944e15c866dcf5f25de0 100644
--- a/tensorflow/compiler/aot/tfcompile.bzl
+++ b/tensorflow/compiler/aot/tfcompile.bzl
@@ -4,7 +4,7 @@
 
 To use from your BUILD file, add the following line to load the macro:
 
-load("@org_tensorflow//tensorflow/compiler/aot:tfcompile.bzl", "tf_library")
+load("//tensorflow/compiler/aot:tfcompile.bzl", "tf_library")
 
 Then call the macro like this:
 
@@ -16,14 +16,15 @@ tf_library(
 )
 """
 
-load("@org_tensorflow//tensorflow:tensorflow.bzl", "if_android", "tf_copts")
+load("//tensorflow:tensorflow.bzl",
+     "if_android", "tf_cc_test", "tf_copts")
 
 def tf_library(name, graph, config,
                freeze_checkpoint=None, freeze_saver=None,
                cpp_class=None, gen_test=True, gen_benchmark=True,
                visibility=None, testonly=None,
                tfcompile_flags=None,
-               tfcompile_tool="@org_tensorflow//tensorflow/compiler/aot:tfcompile",
+               tfcompile_tool="//tensorflow/compiler/aot:tfcompile",
                include_standard_runtime_deps=True, deps=None, tags=None):
   """Runs tfcompile to compile a TensorFlow graph into executable code.
 
@@ -119,9 +120,9 @@ def tf_library(name, graph, config,
             out_nodes_file,
         ] + freeze_saver_srcs,
         outs=[freeze_file],
-        cmd=("$(location @org_tensorflow//tensorflow/python/tools:freeze_graph)" +
+        cmd=("$(location //tensorflow/python/tools:freeze_graph)" +
              freeze_args),
-        tools=["@org_tensorflow//tensorflow/python/tools:freeze_graph"],
+        tools=["//tensorflow/python/tools:freeze_graph"],
         tags=tags,
     )
     tfcompile_graph = freeze_file
@@ -213,22 +214,22 @@ def tf_library(name, graph, config,
           # These deps are required by all tf_library targets even if
           # include_standard_runtime_deps is False.  Without them, the
           # generated code will fail to compile.
-          "@org_tensorflow//tensorflow/compiler/tf2xla:xla_compiled_cpu_function",
-          "@org_tensorflow//tensorflow/core:framework_lite",
+          "//tensorflow/compiler/tf2xla:xla_compiled_cpu_function",
+          "//tensorflow/core:framework_lite",
       ] + (need_xla_data_proto and [
           # If we're generating the program shape, we must depend on the proto.
-          "@org_tensorflow//tensorflow/compiler/xla:xla_data_proto",
+          "//tensorflow/compiler/xla:xla_data_proto",
       ] or []) + (include_standard_runtime_deps and [
           # TODO(cwhipkey): only depend on kernel code that the model actually needed.
-          "@org_tensorflow//tensorflow/compiler/tf2xla/kernels:index_ops_kernel_argmax_float_1d",
-          "@org_tensorflow//tensorflow/compiler/tf2xla/kernels:index_ops_kernel_argmax_float_2d",
-          "@org_tensorflow//tensorflow/compiler/xla/service/cpu:cpu_runtime_avx",
-          "@org_tensorflow//tensorflow/compiler/xla/service/cpu:cpu_runtime_neon",
-          "@org_tensorflow//tensorflow/compiler/xla/service/cpu:cpu_runtime_sse4_1",
-          "@org_tensorflow//tensorflow/compiler/xla/service/cpu:runtime_conv2d",
-          "@org_tensorflow//tensorflow/compiler/xla/service/cpu:runtime_matmul",
-          "@org_tensorflow//tensorflow/compiler/xla/service/cpu:runtime_single_threaded_conv2d",
-          "@org_tensorflow//tensorflow/compiler/xla/service/cpu:runtime_single_threaded_matmul",
+          "//tensorflow/compiler/tf2xla/kernels:index_ops_kernel_argmax_float_1d",
+          "//tensorflow/compiler/tf2xla/kernels:index_ops_kernel_argmax_float_2d",
+          "//tensorflow/compiler/xla/service/cpu:cpu_runtime_avx",
+          "//tensorflow/compiler/xla/service/cpu:cpu_runtime_neon",
+          "//tensorflow/compiler/xla/service/cpu:cpu_runtime_sse4_1",
+          "//tensorflow/compiler/xla/service/cpu:runtime_conv2d",
+          "//tensorflow/compiler/xla/service/cpu:runtime_matmul",
+          "//tensorflow/compiler/xla/service/cpu:runtime_single_threaded_conv2d",
+          "//tensorflow/compiler/xla/service/cpu:runtime_single_threaded_matmul",
           "//third_party/eigen3",
       ] or []) + (deps or []),
       tags=tags,
@@ -254,28 +255,32 @@ def tf_library(name, graph, config,
         name=("gen_" + test_name),
         testonly=1,
         srcs=[
-            "@org_tensorflow//tensorflow/compiler/aot:test.cc",
+            "//tensorflow/compiler/aot:test.cc",
             header_file,
         ],
         outs=[test_file],
         cmd=("sed " + sed_replace +
-             " $(location @org_tensorflow//tensorflow/compiler/aot:test.cc) " +
+             " $(location //tensorflow/compiler/aot:test.cc) " +
              "> $(OUTS)"),
         tags=tags,
     )
 
-    # The cc_test rule for the generated code.
-    native.cc_test(
+    # The cc_test rule for the generated code.  To ensure that this works
+    # reliably across build configurations, we must use tf_cc_test instead of
+    # native.cc_test.  This is related to how we build
+    # //tensorflow/core:lib -- see the note in tensorflow/core/BUILD
+    # for more details.
+    tf_cc_test(
         name=test_name,
         srcs=[test_file],
         deps=[
             ":" + name,
-            "@org_tensorflow//tensorflow/compiler/aot:runtime",
-            "@org_tensorflow//tensorflow/compiler/aot:tf_library_test_main",
-            "@org_tensorflow//tensorflow/compiler/xla:executable_run_options",
+            "//tensorflow/compiler/aot:runtime",
+            "//tensorflow/compiler/aot:tf_library_test_main",
+            "//tensorflow/compiler/xla:executable_run_options",
             "//third_party/eigen3",
-            "@org_tensorflow//tensorflow/core:lib",
-            "@org_tensorflow//tensorflow/core:test",
+            "//tensorflow/core:lib",
+            "//tensorflow/core:test",
             ],
         tags=tags,
     )
@@ -283,7 +288,7 @@ def tf_library(name, graph, config,
   if gen_benchmark:
     benchmark_name = name + "_benchmark"
     benchmark_file = benchmark_name + ".cc"
-    benchmark_main = ("@org_tensorflow//tensorflow/compiler/aot:" +
+    benchmark_main = ("//tensorflow/compiler/aot:" +
                       "benchmark_main.template")
 
     # Rule to rewrite benchmark.cc to produce the benchmark_file.
@@ -301,7 +306,9 @@ def tf_library(name, graph, config,
         tags=tags,
     )
 
-    # The cc_benchmark rule for the generated code.
+    # The cc_benchmark rule for the generated code.  This does not need the
+    # tf_cc_binary since we (by deliberate design) do not depend on
+    # //tensorflow/core:lib.
     #
     # Note: to get smaller size on android for comparison, compile with:
     #    --copt=-fvisibility=hidden
@@ -315,12 +322,12 @@ def tf_library(name, graph, config,
         linkopts = if_android(["-pie", "-s"]),
         deps=[
             ":" + name,
-            "@org_tensorflow//tensorflow/compiler/aot:benchmark",
-            "@org_tensorflow//tensorflow/compiler/aot:runtime",
-            "@org_tensorflow//tensorflow/compiler/xla:executable_run_options",
+            "//tensorflow/compiler/aot:benchmark",
+            "//tensorflow/compiler/aot:runtime",
+            "//tensorflow/compiler/xla:executable_run_options",
             "//third_party/eigen3",
         ] + if_android([
-            "@org_tensorflow//tensorflow/compiler/aot:benchmark_extra_android",
+            "//tensorflow/compiler/aot:benchmark_extra_android",
         ]),
         tags=tags,
     )
@@ -330,11 +337,11 @@ def target_llvm_triple():
   # TODO(toddw): Add target_triple for other targets.  For details see:
   # http://llvm.org/docs/doxygen/html/Triple_8h_source.html
   return select({
-      "@org_tensorflow//tensorflow:android_armeabi": "armv5-none-android",
-      "@org_tensorflow//tensorflow:android_arm": "armv7-none-android",
-      "@org_tensorflow//tensorflow:android_arm64": "aarch64-none-android",
-      "@org_tensorflow//tensorflow:android_x86": "i686-none-android",
-      "@org_tensorflow//tensorflow:linux_ppc64le": "ppc64le-ibm-linux-gnu",
-      "@org_tensorflow//tensorflow:darwin": "x86_64-none-darwin",
+      "//tensorflow:android_armeabi": "armv5-none-android",
+      "//tensorflow:android_arm": "armv7-none-android",
+      "//tensorflow:android_arm64": "aarch64-none-android",
+      "//tensorflow:android_x86": "i686-none-android",
+      "//tensorflow:linux_ppc64le": "ppc64le-ibm-linux-gnu",
+      "//tensorflow:darwin": "x86_64-none-darwin",
       "//conditions:default": "x86_64-pc-linux",
   })
diff --git a/tensorflow/compiler/jit/kernels/xla_launch_op.cc b/tensorflow/compiler/jit/kernels/xla_launch_op.cc
index 4842877d9af332bdaa4a142867dde89ba66bd9a2..17ae2bb25cac94cee0a1f3df66edf1b3a404e3ec 100644
--- a/tensorflow/compiler/jit/kernels/xla_launch_op.cc
+++ b/tensorflow/compiler/jit/kernels/xla_launch_op.cc
@@ -45,7 +45,7 @@ namespace tensorflow {
 // see comment on `AllowsAsynchronousDeallocation()`.
 class XlaAllocator : public xla::DeviceMemoryAllocator {
  public:
-  XlaAllocator(gpu::Platform* platform, OpKernelContext* op_context);
+  XlaAllocator(const gpu::Platform* platform, OpKernelContext* op_context);
   ~XlaAllocator() override;
   xla::StatusOr<gpu::DeviceMemoryBase> Allocate(int device_ordinal, uint64 size,
                                                 bool retry_on_failure) override;
@@ -79,7 +79,8 @@ class XlaAllocator : public xla::DeviceMemoryAllocator {
   std::unordered_map<void*, Tensor> tensors_;
 };
 
-XlaAllocator::XlaAllocator(gpu::Platform* platform, OpKernelContext* op_context)
+XlaAllocator::XlaAllocator(const gpu::Platform* platform,
+                           OpKernelContext* op_context)
     : xla::DeviceMemoryAllocator(platform), op_context_(op_context) {}
 
 XlaAllocator::~XlaAllocator() = default;
@@ -248,12 +249,16 @@ void XlaLocalLaunchOp::Compute(OpKernelContext* ctx) {
 
   xla::LocalClient* client = static_cast<xla::LocalClient*>(cache->client());
 
+  // Builds an XLA allocator for the device.
+  XlaAllocator xla_allocator(client->platform(), ctx);
+
   XlaCompiler::Options options;
   options.client = client;
   options.device_type = &cache->device_type();
   options.flib_def = ctx->function_library()->GetFunctionLibraryDefinition();
   options.graph_def_version = ctx->function_library()->graph_def_version();
   options.allow_cpu_custom_calls = (platform_id_ == gpu::host::kHostPlatformId);
+  options.device_allocator = &xla_allocator;
 
   const XlaCompiler::CompilationResult* kernel;
   xla::LocalExecutable* executable;
@@ -264,9 +269,6 @@ void XlaLocalLaunchOp::Compute(OpKernelContext* ctx) {
 
   VLOG(1) << "Executing XLA Computation...";
 
-  // Builds an XLA allocator for the device.
-  XlaAllocator xla_allocator(client->platform(), ctx);
-
   std::unique_ptr<xla::ShapedBuffer> output;
   // Build xla::ShapedBuffers that point directly to the Tensor buffers.
   std::vector<std::unique_ptr<xla::ShapedBuffer>> arg_buffers;
diff --git a/tensorflow/compiler/jit/xla_compilation_cache.cc b/tensorflow/compiler/jit/xla_compilation_cache.cc
index bfff52c55a7d5a4490224347019db9b3333f7e2e..21d3a54f1b8ea59c3da09d8c5d626a9f5bcebbc9 100644
--- a/tensorflow/compiler/jit/xla_compilation_cache.cc
+++ b/tensorflow/compiler/jit/xla_compilation_cache.cc
@@ -223,6 +223,7 @@ Status XlaCompilationCache::BuildExecutable(
   xla::ExecutableBuildOptions build_options;
   build_options.set_device_ordinal(client_->default_device_ordinal());
   build_options.set_result_layout(result.xla_output_shape);
+  build_options.set_device_allocator(options.device_allocator);
 
   auto compile_result =
       client_->Compile(*result.computation, argument_layouts, build_options);
diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD
index 314f5506b16e2c28736d9d39aa6c856d50885108..b0b038775f737e4932fe46e640a6c3fa875973b9 100644
--- a/tensorflow/compiler/tests/BUILD
+++ b/tensorflow/compiler/tests/BUILD
@@ -144,6 +144,21 @@ tf_xla_py_test(
     ],
 )
 
+tf_xla_py_test(
+    name = "matrix_triangular_solve_op_test",
+    size = "small",
+    srcs = ["matrix_triangular_solve_op_test.py"],
+    tags = ["optonly"],
+    deps = [
+        ":xla_test",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:training",
+    ],
+)
+
 tf_xla_py_test(
     name = "clustering_test",
     size = "small",
@@ -240,6 +255,18 @@ tf_xla_py_test(
     ],
 )
 
+tf_xla_py_test(
+    name = "extract_image_patches_op_test",
+    size = "small",
+    srcs = ["extract_image_patches_op_test.py"],
+    deps = [
+        ":xla_test",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
 tf_xla_py_test(
     name = "fft_test",
     size = "medium",
@@ -326,6 +353,19 @@ tf_xla_py_test(
     ],
 )
 
+tf_xla_py_test(
+    name = "matrix_band_part_test",
+    size = "medium",
+    srcs = ["matrix_band_part_test.py"],
+    tags = ["optonly"],
+    deps = [
+        ":xla_test",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
 tf_xla_py_test(
     name = "momentum_test",
     size = "small",
@@ -437,6 +477,18 @@ tf_xla_py_test(
     ],
 )
 
+tf_xla_py_test(
+    name = "reverse_sequence_op_test",
+    size = "small",
+    srcs = ["reverse_sequence_op_test.py"],
+    deps = [
+        ":xla_test",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
 tf_xla_py_test(
     name = "rmsprop_test",
     size = "small",
diff --git a/tensorflow/compiler/tests/binary_ops_test.py b/tensorflow/compiler/tests/binary_ops_test.py
index 16856bd736ed408da29c3199c4593eb578775128..30a6d3a74d64f90ad33062df6d1e16e3a575bd63 100644
--- a/tensorflow/compiler/tests/binary_ops_test.py
+++ b/tensorflow/compiler/tests/binary_ops_test.py
@@ -774,15 +774,15 @@ class BinaryOpsTest(XLATestCase):
   def DISABLED_testSparseMatMul(self):
     # Binary wrappers for sparse_matmul with different hints
     def SparseMatmulWrapperTF(a, b):
-      return tf.sparse_matmul(a, b, a_is_sparse=True)
+      return math_ops.sparse_matmul(a, b, a_is_sparse=True)
 
     def SparseMatmulWrapperFT(a, b):
-      return tf.sparse_matmul(a, b, b_is_sparse=True)
+      return math_ops.sparse_matmul(a, b, b_is_sparse=True)
 
     def SparseMatmulWrapperTT(a, b):
-      return tf.sparse_matmul(a, b, a_is_sparse=True, b_is_sparse=True)
+      return math_ops.sparse_matmul(a, b, a_is_sparse=True, b_is_sparse=True)
 
-    self._testMatMul(tf.sparse_matmul)
+    self._testMatMul(math_ops.sparse_matmul)
     self._testMatMul(SparseMatmulWrapperTF)
     self._testMatMul(SparseMatmulWrapperFT)
     self._testMatMul(SparseMatmulWrapperTT)
@@ -1181,6 +1181,50 @@ class BinaryOpsTest(XLATestCase):
                        np.array([4, 5, 6], dtype=np.int32),
                        expected=None)
 
+  def testMatrixSetDiag(self):
+    for dtype in self.numeric_types:
+      # Square
+      self._testBinary(
+          array_ops.matrix_set_diag,
+          np.array([[0.0, 1.0, 0.0], [1.0, 0.0, 1.0], [1.0, 1.0, 1.0]],
+                   dtype=dtype),
+          np.array([1.0, 2.0, 3.0], dtype=dtype),
+          expected=np.array([[1.0, 1.0, 0.0], [1.0, 2.0, 1.0], [1.0, 1.0, 3.0]],
+                            dtype=dtype))
+
+      self._testBinary(
+          array_ops.matrix_set_diag,
+          np.array([[[1.0, 0.0, 3.0], [0.0, 2.0, 0.0], [1.0, 0.0, 3.0]],
+                    [[4.0, 0.0, 4.0], [0.0, 5.0, 0.0], [2.0, 0.0, 6.0]]],
+                   dtype=dtype),
+          np.array([[-1.0, 0.0, -3.0], [-4.0, -5.0, -6.0]], dtype=dtype),
+          expected=np.array(
+              [[[-1.0, 0.0, 3.0], [0.0, 0.0, 0.0], [1.0, 0.0, -3.0]],
+               [[-4.0, 0.0, 4.0], [0.0, -5.0, 0.0], [2.0, 0.0, -6.0]]],
+              dtype=dtype))
+
+      # Rectangular
+      self._testBinary(
+          array_ops.matrix_set_diag,
+          np.array([[0.0, 1.0, 0.0], [1.0, 0.0, 1.0]], dtype=dtype),
+          np.array([3.0, 4.0], dtype=dtype),
+          expected=np.array([[3.0, 1.0, 0.0], [1.0, 4.0, 1.0]], dtype=dtype))
+
+      self._testBinary(
+          array_ops.matrix_set_diag,
+          np.array([[0.0, 1.0], [1.0, 0.0], [1.0, 1.0]], dtype=dtype),
+          np.array([3.0, 4.0], dtype=dtype),
+          expected=np.array([[3.0, 1.0], [1.0, 4.0], [1.0, 1.0]], dtype=dtype))
+
+      self._testBinary(
+          array_ops.matrix_set_diag,
+          np.array([[[1.0, 0.0, 3.0], [0.0, 2.0, 0.0]],
+                    [[4.0, 0.0, 4.0], [0.0, 5.0, 0.0]]], dtype=dtype),
+          np.array([[-1.0, -2.0], [-4.0, -5.0]],
+                   dtype=dtype),
+          expected=np.array([[[-1.0, 0.0, 3.0], [0.0, -2.0, 0.0]],
+                             [[-4.0, 0.0, 4.0], [0.0, -5.0, 0.0]]],
+                            dtype=dtype))
 
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/compiler/tests/extract_image_patches_op_test.py b/tensorflow/compiler/tests/extract_image_patches_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..0361702e7af778176daed941d64e61198090daf2
--- /dev/null
+++ b/tensorflow/compiler/tests/extract_image_patches_op_test.py
@@ -0,0 +1,134 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Functional tests for ExtractImagePatches op."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.compiler.tests.xla_test import XLATestCase
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+
+
+class ExtractImagePatches(XLATestCase):
+  """Functional tests for ExtractImagePatches op."""
+
+  def _VerifyValues(self, image, ksizes, strides, rates, padding, patches):
+    """Tests input-output pairs for the ExtractImagePatches op.
+
+    Args:
+      image: Input tensor with shape: [batch, in_rows, in_cols, depth].
+      ksizes: Patch size specified as: [ksize_rows, ksize_cols].
+      strides: Output strides, specified as [stride_rows, stride_cols].
+      rates: Atrous rates, specified as [rate_rows, rate_cols].
+      padding: Padding type.
+      patches: Expected output.
+    """
+    ksizes = [1] + ksizes + [1]
+    strides = [1] + strides + [1]
+    rates = [1] + rates + [1]
+
+    with self.test_session():
+      image_placeholder = array_ops.placeholder(dtypes.float32)
+      with self.test_scope():
+        out_tensor = array_ops.extract_image_patches(
+            image_placeholder,
+            ksizes=ksizes,
+            strides=strides,
+            rates=rates,
+            padding=padding,
+            name="im2col")
+      feed_dict = {image_placeholder: image}
+      self.assertAllClose(patches, out_tensor.eval(feed_dict=feed_dict))
+
+  def testKsize1x1Stride1x1Rate1x1(self):
+    """Verifies that for 1x1 kernel the output equals the input."""
+    # [2, 3, 4, 5]
+    image = np.reshape(range(120), [2, 3, 4, 5])
+    # [2, 3, 4, 5]
+    patches = np.reshape(range(120), [2, 3, 4, 5])
+    for padding in ["VALID", "SAME"]:
+      self._VerifyValues(
+          image,
+          ksizes=[1, 1],
+          strides=[1, 1],
+          rates=[1, 1],
+          padding=padding,
+          patches=patches)
+
+  def testKsize1x1Stride2x3Rate1x1(self):
+    """Test for 1x1 kernel and strides."""
+    # [2, 4, 5, 3]
+    image = np.reshape(range(120), [2, 4, 5, 3])
+    # [2, 2, 2, 3]
+    patches = image[:, ::2, ::3, :]
+    for padding in ["VALID", "SAME"]:
+      self._VerifyValues(
+          image,
+          ksizes=[1, 1],
+          strides=[2, 3],
+          rates=[1, 1],
+          padding=padding,
+          patches=patches)
+
+  def testKsize2x2Stride1x1Rate1x1Valid(self):
+    """Test for 2x2 kernel with VALID padding."""
+    # [1, 2, 2, 1]
+    image = [[[[1], [2]], [[3], [4]]]]
+    # [1, 1, 1, 4]
+    patches = [[[[1, 2, 3, 4]]]]
+    self._VerifyValues(
+        image,
+        ksizes=[2, 2],
+        strides=[1, 1],
+        rates=[1, 1],
+        padding="VALID",
+        patches=patches)
+
+  def testKsize2x2Stride1x1Rate1x1Same(self):
+    """Test for 2x2 kernel with SAME padding."""
+    # [1, 2, 2, 1]
+    image = [[[[1], [2]], [[3], [4]]]]
+    # [1, 2, 2, 4]
+    patches = [[[[1, 2, 3, 4], [2, 0, 4, 0]], [[3, 4, 0, 0], [4, 0, 0, 0]]]]
+    self._VerifyValues(
+        image,
+        ksizes=[2, 2],
+        strides=[1, 1],
+        rates=[1, 1],
+        padding="SAME",
+        patches=patches)
+
+  def testKsize2x2Stride1x1Rate2x2Valid(self):
+    """Test for 2x2 kernel with 2x2 dilation."""
+    # [1, 2, 2, 1]
+    image = np.arange(16).reshape(1, 4, 4, 1).astype(np.float32)
+    # [1, 2, 2, 4]
+    patches = [[[[0, 2, 8, 10], [1, 3, 9, 11]],
+                [[4, 6, 12, 14], [5, 7, 13, 15]]]]
+    self._VerifyValues(
+        image,
+        ksizes=[2, 2],
+        strides=[1, 1],
+        rates=[2, 2],
+        padding="VALID",
+        patches=patches)
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/compiler/tests/matrix_band_part_test.py b/tensorflow/compiler/tests/matrix_band_part_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..29394f9ea5139b30f88f53de0469b27e37d79195
--- /dev/null
+++ b/tensorflow/compiler/tests/matrix_band_part_test.py
@@ -0,0 +1,64 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.compiler.tests.xla_test import XLATestCase
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+
+
+class MatrixBandPartTest(XLATestCase):
+
+  def _testMatrixBandPart(self, dtype, shape):
+    with self.test_session():
+      batch_shape = shape[:-2]
+      mat = np.ones(shape).astype(dtype)
+      batch_mat = np.tile(mat, batch_shape + [1, 1])
+      for lower in -1, 0, 1, shape[-2] - 1:
+        for upper in -1, 0, 1, shape[-1] - 1:
+          band_np = mat
+          if lower >= 0:
+            band_np = np.triu(band_np, -lower)
+          if upper >= 0:
+            band_np = np.tril(band_np, upper)
+          if batch_shape:
+            band_np = np.tile(band_np, batch_shape + [1, 1])
+
+          placeholder = array_ops.placeholder(dtype)
+          with self.test_scope():
+            band = array_ops.matrix_band_part(
+                placeholder,
+                constant_op.constant(lower, dtype=dtypes.int32),
+                constant_op.constant(upper, dtype=dtypes.int32))
+            feed_dict = {placeholder: batch_mat}
+            self.assertAllEqual(band_np, band.eval(feed_dict=feed_dict))
+
+  def testMatrixBandPart(self):
+    for dtype in self.float_types:
+      for batch_shape in [[], [2,], [1, 3, 2]]:
+        for rows in 1, 2, 7:
+          for cols in 1, 2, 7:
+            self._testMatrixBandPart(dtype, batch_shape + [rows, cols])
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/compiler/tests/matrix_triangular_solve_op_test.py b/tensorflow/compiler/tests/matrix_triangular_solve_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..cccb7f5789dce39ef8c3d4b3a7573aaa983b3fbd
--- /dev/null
+++ b/tensorflow/compiler/tests/matrix_triangular_solve_op_test.py
@@ -0,0 +1,130 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for tensorflow.ops.tf.MatrixTriangularSolve."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import itertools
+
+import numpy as np
+
+from tensorflow.compiler.tests.xla_test import XLATestCase
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import linalg_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+
+def MakePlaceholder(x):
+  return array_ops.placeholder(dtypes.as_dtype(x.dtype), shape=x.shape)
+
+
+class MatrixTriangularSolveOpTest(XLATestCase):
+
+  def _VerifyTriangularSolveBase(self, sess, placeholder_a, placeholder_ca,
+                                 placeholder_b, a, clean_a, b, verification,
+                                 atol):
+    feed_dict = {placeholder_a: a, placeholder_ca: clean_a, placeholder_b: b}
+    verification_np = sess.run(verification, feed_dict)
+    self.assertAllClose(b, verification_np, atol=atol)
+
+  def _VerifyTriangularSolve(self, a, b, lower, adjoint, atol):
+    clean_a = np.tril(a) if lower else np.triu(a)
+    with self.test_session() as sess:
+      placeholder_a = MakePlaceholder(a)
+      placeholder_ca = MakePlaceholder(clean_a)
+      placeholder_b = MakePlaceholder(b)
+      with self.test_scope():
+        x = linalg_ops.matrix_triangular_solve(
+            placeholder_a, placeholder_b, lower=lower, adjoint=adjoint)
+      verification = math_ops.matmul(placeholder_ca, x, adjoint_a=adjoint)
+      self._VerifyTriangularSolveBase(sess, placeholder_a, placeholder_ca,
+                                      placeholder_b, a, clean_a, b,
+                                      verification, atol)
+
+  def _VerifyTriangularSolveCombo(self, a, b, atol=1e-4):
+    transp = lambda x: np.swapaxes(x, -1, -2)
+    for lower, adjoint in itertools.product([True, False], repeat=2):
+      self._VerifyTriangularSolve(
+          a if lower else transp(a), b, lower, adjoint, atol)
+
+  def testBasic(self):
+    rng = np.random.RandomState(0)
+    a = np.tril(rng.randn(5, 5))
+    b = rng.randn(5, 7)
+    for dtype in self.float_types:
+      self._VerifyTriangularSolveCombo(a.astype(dtype), b.astype(dtype))
+
+  def testBasicNotActuallyTriangular(self):
+    rng = np.random.RandomState(0)
+    a = rng.randn(5, 5)  # the `a` matrix is not lower-triangular
+    b = rng.randn(5, 7)
+    for dtype in self.float_types:
+      self._VerifyTriangularSolveCombo(a.astype(dtype), b.astype(dtype))
+
+  def testBasicComplexDtypes(self):
+    rng = np.random.RandomState(0)
+    a = np.tril(rng.randn(5, 5) + rng.randn(5, 5) * 1j)
+    b = rng.randn(5, 7) + rng.randn(5, 7) * 1j
+    for dtype in self.complex_types:
+      self._VerifyTriangularSolveCombo(a.astype(dtype), b.astype(dtype))
+
+  def testBatch(self):
+    rng = np.random.RandomState(0)
+    shapes = [((4, 3, 3), (4, 3, 5)), ((1, 2, 2), (1, 2, 1)),
+              ((1, 1, 1), (1, 1, 2)), ((2, 3, 4, 4), (2, 3, 4, 1))]
+    tuples = itertools.product(self.float_types, shapes)
+    for dtype, (a_shape, b_shape) in tuples:
+      n = a_shape[-1]
+      a = np.tril(rng.rand(*a_shape) - 0.5) / (2.0 * n) + np.eye(n)
+      b = rng.randn(*b_shape)
+      self._VerifyTriangularSolveCombo(
+          a.astype(dtype), b.astype(dtype), atol=1e-3)
+
+  def testLarge(self):
+    n = 1024
+    rng = np.random.RandomState(0)
+    a = np.tril(rng.rand(n, n) - 0.5) / (2.0 * n) + np.eye(n)
+    b = rng.randn(n, n)
+    self._VerifyTriangularSolve(
+        a.astype(np.float32), b.astype(np.float32), True, False, 1e-4)
+
+  def testNonSquareCoefficientMatrix(self):
+    rng = np.random.RandomState(0)
+    for dtype in self.float_types:
+      a = rng.randn(3, 4).astype(dtype)
+      b = rng.randn(4, 4).astype(dtype)
+      with self.assertRaises(ValueError):
+        linalg_ops.matrix_triangular_solve(a, b)
+      with self.assertRaises(ValueError):
+        linalg_ops.matrix_triangular_solve(a, b)
+
+  def testWrongDimensions(self):
+    randn = np.random.RandomState(0).randn
+    for dtype in self.float_types:
+      lhs = constant_op.constant(randn(3, 3), dtype=dtype)
+      rhs = constant_op.constant(randn(4, 3), dtype=dtype)
+      with self.assertRaises(ValueError):
+        linalg_ops.matrix_triangular_solve(lhs, rhs)
+      with self.assertRaises(ValueError):
+        linalg_ops.matrix_triangular_solve(lhs, rhs)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/compiler/tests/reverse_sequence_op_test.py b/tensorflow/compiler/tests/reverse_sequence_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..1a5d05094e53cfecd9476d7d87f023e8a02d7458
--- /dev/null
+++ b/tensorflow/compiler/tests/reverse_sequence_op_test.py
@@ -0,0 +1,93 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for tensorflow.ops.reverse_sequence_op."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.compiler.tests.xla_test import XLATestCase
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+
+
+class ReverseSequenceTest(XLATestCase):
+
+  def _testReverseSequence(self,
+                           x,
+                           batch_axis,
+                           seq_axis,
+                           seq_lengths,
+                           truth,
+                           expected_err_re=None):
+    with self.test_session():
+      p = array_ops.placeholder(dtypes.as_dtype(x.dtype))
+      lengths = array_ops.placeholder(dtypes.as_dtype(seq_lengths.dtype))
+      with self.test_scope():
+        ans = array_ops.reverse_sequence(
+            p, batch_axis=batch_axis, seq_axis=seq_axis, seq_lengths=lengths)
+      if expected_err_re is None:
+        tf_ans = ans.eval(feed_dict={p: x, lengths: seq_lengths})
+        self.assertAllClose(tf_ans, truth, atol=1e-10)
+      else:
+        with self.assertRaisesOpError(expected_err_re):
+          ans.eval(feed_dict={p: x, lengths: seq_lengths})
+
+  def testSimple(self):
+    x = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=np.int32)
+    expected = np.array([[1, 2, 3], [6, 5, 4], [8, 7, 9]], dtype=np.int32)
+    self._testReverseSequence(
+        x,
+        batch_axis=0,
+        seq_axis=1,
+        seq_lengths=np.array([1, 3, 2], np.int32),
+        truth=expected)
+
+  def _testBasic(self, dtype, len_dtype):
+    x = np.asarray(
+        [[[1, 2, 3, 4], [5, 6, 7, 8]], [[9, 10, 11, 12], [13, 14, 15, 16]],
+         [[17, 18, 19, 20], [21, 22, 23, 24]]],
+        dtype=dtype)
+    x = x.reshape(3, 2, 4, 1, 1)
+    x = x.transpose([2, 1, 0, 3, 4])  # permute axes 0 <=> 2
+
+    # reverse dim 2 up to (0:3, none, 0:4) along dim=0
+    seq_lengths = np.asarray([3, 0, 4], dtype=len_dtype)
+
+    truth_orig = np.asarray(
+        [
+            [[3, 2, 1, 4], [7, 6, 5, 8]],  # reverse 0:3
+            [[9, 10, 11, 12], [13, 14, 15, 16]],  # reverse none
+            [[20, 19, 18, 17], [24, 23, 22, 21]]
+        ],  # reverse 0:4 (all)
+        dtype=dtype)
+    truth_orig = truth_orig.reshape(3, 2, 4, 1, 1)
+    truth = truth_orig.transpose([2, 1, 0, 3, 4])  # permute axes 0 <=> 2
+
+    seq_axis = 0  # permute seq_axis and batch_axis (originally 2 and 0, resp.)
+    batch_axis = 2
+    self._testReverseSequence(x, batch_axis, seq_axis, seq_lengths, truth)
+
+  def testSeqLength(self):
+    for dtype in self.all_types:
+      for seq_dtype in self.int_types:
+        self._testBasic(dtype, seq_dtype)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/compiler/tests/unary_ops_test.py b/tensorflow/compiler/tests/unary_ops_test.py
index 8e4b8a38336c5e8b2e10edc4c81502eeebb628d2..3d3e112f4821ea8e57ea9589a5b4433647ad294b 100644
--- a/tensorflow/compiler/tests/unary_ops_test.py
+++ b/tensorflow/compiler/tests/unary_ops_test.py
@@ -154,6 +154,21 @@ class UnaryOpsTest(XLATestCase):
 
   def testFloatOps(self):
     for dtype in self.float_types:
+      x = np.arange(-0.90, 0.90, 0.25)
+      self._assertOpOutputMatchesExpected(
+          math_ops.acos,
+          x.astype(dtype),
+          expected=np.arccos(x).astype(dtype))
+      self._assertOpOutputMatchesExpected(
+          math_ops.asin,
+          x.astype(dtype),
+          expected=np.arcsin(x).astype(dtype))
+      x = np.arange(-3, 3).reshape(1, 3, 2)
+      self._assertOpOutputMatchesExpected(
+          math_ops.atan,
+          x.astype(dtype),
+          expected=np.arctan(x).astype(dtype))
+
       self._assertOpOutputMatchesExpected(
           math_ops.acosh,
           np.array([1, 2, 3, 4], dtype=dtype),
diff --git a/tensorflow/compiler/tf2xla/g3doc/cpu_supported_ops.md b/tensorflow/compiler/tf2xla/g3doc/cpu_supported_ops.md
index 82b3b46a2f1e97001d1e0c6b993ec243170bc7d8..91351421bcacd26c41b5c9f98ea833730e4aef30 100644
--- a/tensorflow/compiler/tf2xla/g3doc/cpu_supported_ops.md
+++ b/tensorflow/compiler/tf2xla/g3doc/cpu_supported_ops.md
@@ -6,6 +6,9 @@ Operator                              | Type Constraint
 `Acosh`                               | `T={complex64,double,float}`
 `Add`                                 | `T={complex64,double,float,int32,int64}`
 `AddN`                                | `T={complex64,double,float,int32,int64,uint32,uint64}`
+`AdjustContrastv2`                    |
+`AdjustHue`                           |
+`AdjustSaturation`                    |
 `All`                                 | `Tidx={int32,int64}`
 `Angle`                               | `Tout={double,float}`<br>`T={complex64}`
 `Any`                                 | `Tidx={int32,int64}`
@@ -34,7 +37,7 @@ Operator                              | Type Constraint
 `BroadcastGradientArgs`               | `T={int32,int64}`
 `Cast`                                | `DstT={bool,complex64,double,float,int32,int64,uint32,uint64}`<br>`SrcT={bool,complex64,double,float,int32,int64,uint32,uint64}`
 `Ceil`                                | `T={double,float}`
-`Cholesky`                            | `T={complex64,double,float}`
+`Cholesky`                            | `T={double,float}`
 `Complex`                             | `Tout={complex64}`<br>`T={double,float}`
 `ComplexAbs`                          | `Tout={double,float}`<br>`T={complex64}`
 `Concat`                              | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
@@ -68,7 +71,11 @@ Operator                              | Type Constraint
 `Exp`                                 | `T={complex64,double,float}`
 `ExpandDims`                          | `Tdim={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
 `Expm1`                               | `T={complex64,double,float}`
-`Fill`                                | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`ExtractImagePatches`                 | `T={double,float,int32,int64,uint32,uint64}`
+`FFT`                                 |
+`FFT2D`                               |
+`FFT3D`                               |
+`Fill`                                | `index_type={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
 `Floor`                               | `T={double,float}`
 `FloorDiv`                            | `T={complex64,double,float,int32,int64}`
 `FloorMod`                            | `T={double,float,int32,int64}`
@@ -80,6 +87,13 @@ Operator                              | Type Constraint
 `GatherV2`                            | `Taxis={int32,int64}`<br>`Tindices={int32,int64}`<br>`Tparams={bool,complex64,double,float,int32,int64,uint32,uint64}`
 `Greater`                             | `T={double,float,int32,int64,uint32,uint64}`
 `GreaterEqual`                        | `T={double,float,int32,int64,uint32,uint64}`
+`HSVToRGB`                            | `T={double,float}`
+`IFFT`                                |
+`IFFT2D`                              |
+`IFFT3D`                              |
+`IRFFT`                               |
+`IRFFT2D`                             |
+`IRFFT3D`                             |
 `Identity`                            | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
 `IdentityN`                           | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
 `Imag`                                | `Tout={double,float}`<br>`T={complex64}`
@@ -105,11 +119,14 @@ Operator                              | Type Constraint
 `MatMul`                              | `T={complex64,double,float}`
 `MatrixDiag`                          | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
 `MatrixDiagPart`                      | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`MatrixTriangularSolve`               | `T={complex64,double,float}`
 `Max`                                 | `Tidx={int32,int64}`<br>`T={complex64,double,float,int32,int64,uint32,uint64}`
 `MaxPool`                             | `T={double,float,int32,int64}`
 `MaxPool3D`                           | `T={float}`
 `MaxPool3DGrad`                       | `TInput={float}`<br>`T={float}`
 `MaxPoolGrad`                         | `T={double,float,int32,int64,uint32,uint64}`
+`MaxPoolGradV2`                       | `T={double,float,int32,int64,uint32,uint64}`
+`MaxPoolV2`                           | `T={double,float,int32,int64}`
 `Maximum`                             | `T={double,float,int32,int64}`
 `Mean`                                | `Tidx={int32,int64}`<br>`T={complex64,double,float,int32,int64,uint32,uint64}`
 `Min`                                 | `Tidx={int32,int64}`<br>`T={complex64,double,float,int32,int64,uint32,uint64}`
@@ -131,6 +148,10 @@ Operator                              | Type Constraint
 `PreventGradient`                     | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
 `Prod`                                | `Tidx={int32,int64}`<br>`T={complex64,double,float,int32,int64,uint32,uint64}`
 `QuantizeAndDequantizeV2`             | `T={double,float}`
+`RFFT`                                |
+`RFFT2D`                              |
+`RFFT3D`                              |
+`RGBToHSV`                            | `T={double,float}`
 `RandomStandardNormal`                | `dtype={float}`
 `RandomUniform`                       | `T={int32,int64}`<br>`dtype={double,float}`
 `RandomUniformInt`                    | `T={int32,int64}`<br>`Tout={int32,int64}`
@@ -146,6 +167,8 @@ Operator                              | Type Constraint
 `Relu6Grad`                           | `T={double,float,int32,int64,uint32,uint64}`
 `ReluGrad`                            | `T={double,float,int32,int64,uint32,uint64}`
 `Reshape`                             | `Tshape={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`ResizeBilinear`                      | `T={double,float,int32,int64}`
+`ResizeBilinearGrad`                  | `T={double,float}`
 `ResourceApplyAdagrad`                | `T={double,float}`
 `ResourceApplyAdam`                   | `T={double,float}`
 `ResourceApplyFtrl`                   | `T={double,float}`
@@ -156,6 +179,7 @@ Operator                              | Type Constraint
 `ResourceGather`                      | `Tindices={int32,int64}`<br>`dtype={complex64,double,float,int32,int64,uint32,uint64}`
 `ResourceStridedSliceAssign`          | `Index={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
 `Reverse`                             | `T={bool,complex64,double,float,int32,int64}`
+`ReverseSequence`                     | `Tlen={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
 `ReverseV2`                           | `T={bool,complex64,double,float,int32,int64}`<br>`Tidx={int32,int64}`
 `RightShift`                          | `T={int32,int64,uint32,uint64}`
 `Rint`                                | `T={double,float}`
diff --git a/tensorflow/compiler/tf2xla/g3doc/gpu_supported_ops.md b/tensorflow/compiler/tf2xla/g3doc/gpu_supported_ops.md
index d4b7621ad2858fe17e93d292dd807e4f7c1c336b..b9bdb829d773825005a8921f48d28b6892d8f0cd 100644
--- a/tensorflow/compiler/tf2xla/g3doc/gpu_supported_ops.md
+++ b/tensorflow/compiler/tf2xla/g3doc/gpu_supported_ops.md
@@ -6,6 +6,9 @@ Operator                              | Type Constraint
 `Acosh`                               | `T={complex64,double,float}`
 `Add`                                 | `T={complex64,double,float,int32,int64}`
 `AddN`                                | `T={complex64,double,float,int32,int64,uint32,uint64}`
+`AdjustContrastv2`                    |
+`AdjustHue`                           |
+`AdjustSaturation`                    |
 `All`                                 | `Tidx={int32,int64}`
 `Angle`                               | `Tout={double,float}`<br>`T={complex64}`
 `Any`                                 | `Tidx={int32,int64}`
@@ -34,7 +37,7 @@ Operator                              | Type Constraint
 `BroadcastGradientArgs`               | `T={int32,int64}`
 `Cast`                                | `DstT={bool,complex64,double,float,int32,int64,uint32,uint64}`<br>`SrcT={bool,complex64,double,float,int32,int64,uint32,uint64}`
 `Ceil`                                | `T={double,float}`
-`Cholesky`                            | `T={complex64,double,float}`
+`Cholesky`                            | `T={double,float}`
 `Complex`                             | `Tout={complex64}`<br>`T={double,float}`
 `ComplexAbs`                          | `Tout={double,float}`<br>`T={complex64}`
 `Concat`                              | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
@@ -68,7 +71,11 @@ Operator                              | Type Constraint
 `Exp`                                 | `T={complex64,double,float}`
 `ExpandDims`                          | `Tdim={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
 `Expm1`                               | `T={complex64,double,float}`
-`Fill`                                | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`ExtractImagePatches`                 | `T={double,float,int32,int64,uint32,uint64}`
+`FFT`                                 |
+`FFT2D`                               |
+`FFT3D`                               |
+`Fill`                                | `index_type={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
 `Floor`                               | `T={double,float}`
 `FloorDiv`                            | `T={complex64,double,float,int32,int64}`
 `FloorMod`                            | `T={double,float,int32,int64}`
@@ -80,6 +87,13 @@ Operator                              | Type Constraint
 `GatherV2`                            | `Taxis={int32,int64}`<br>`Tindices={int32,int64}`<br>`Tparams={bool,complex64,double,float,int32,int64,uint32,uint64}`
 `Greater`                             | `T={double,float,int32,int64,uint32,uint64}`
 `GreaterEqual`                        | `T={double,float,int32,int64,uint32,uint64}`
+`HSVToRGB`                            | `T={double,float}`
+`IFFT`                                |
+`IFFT2D`                              |
+`IFFT3D`                              |
+`IRFFT`                               |
+`IRFFT2D`                             |
+`IRFFT3D`                             |
 `Identity`                            | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
 `IdentityN`                           | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
 `Imag`                                | `Tout={double,float}`<br>`T={complex64}`
@@ -105,11 +119,14 @@ Operator                              | Type Constraint
 `MatMul`                              | `T={complex64,double,float}`
 `MatrixDiag`                          | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
 `MatrixDiagPart`                      | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`MatrixTriangularSolve`               | `T={complex64,double,float}`
 `Max`                                 | `Tidx={int32,int64}`<br>`T={complex64,double,float,int32,int64,uint32,uint64}`
 `MaxPool`                             | `T={double,float,int32,int64}`
 `MaxPool3D`                           | `T={float}`
 `MaxPool3DGrad`                       | `TInput={float}`<br>`T={float}`
 `MaxPoolGrad`                         | `T={double,float,int32,int64,uint32,uint64}`
+`MaxPoolGradV2`                       | `T={double,float,int32,int64,uint32,uint64}`
+`MaxPoolV2`                           | `T={double,float,int32,int64}`
 `Maximum`                             | `T={double,float,int32,int64}`
 `Mean`                                | `Tidx={int32,int64}`<br>`T={complex64,double,float,int32,int64,uint32,uint64}`
 `Min`                                 | `Tidx={int32,int64}`<br>`T={complex64,double,float,int32,int64,uint32,uint64}`
@@ -131,6 +148,10 @@ Operator                              | Type Constraint
 `PreventGradient`                     | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
 `Prod`                                | `Tidx={int32,int64}`<br>`T={complex64,double,float,int32,int64,uint32,uint64}`
 `QuantizeAndDequantizeV2`             | `T={double,float}`
+`RFFT`                                |
+`RFFT2D`                              |
+`RFFT3D`                              |
+`RGBToHSV`                            | `T={double,float}`
 `Range`                               | `Tidx={double,float,int32,int64}`
 `Rank`                                | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
 `ReadVariableOp`                      | `dtype={bool,complex64,double,float,int32,int64,uint32,uint64}`
@@ -143,6 +164,8 @@ Operator                              | Type Constraint
 `Relu6Grad`                           | `T={double,float,int32,int64,uint32,uint64}`
 `ReluGrad`                            | `T={double,float,int32,int64,uint32,uint64}`
 `Reshape`                             | `Tshape={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`ResizeBilinear`                      | `T={double,float,int32,int64}`
+`ResizeBilinearGrad`                  | `T={double,float}`
 `ResourceApplyAdagrad`                | `T={double,float}`
 `ResourceApplyAdam`                   | `T={double,float}`
 `ResourceApplyFtrl`                   | `T={double,float}`
@@ -153,6 +176,7 @@ Operator                              | Type Constraint
 `ResourceGather`                      | `Tindices={int32,int64}`<br>`dtype={complex64,double,float,int32,int64,uint32,uint64}`
 `ResourceStridedSliceAssign`          | `Index={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
 `Reverse`                             | `T={bool,complex64,double,float,int32,int64}`
+`ReverseSequence`                     | `Tlen={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
 `ReverseV2`                           | `T={bool,complex64,double,float,int32,int64}`<br>`Tidx={int32,int64}`
 `RightShift`                          | `T={int32,int64,uint32,uint64}`
 `Rint`                                | `T={double,float}`
diff --git a/tensorflow/compiler/tf2xla/kernels/BUILD b/tensorflow/compiler/tf2xla/kernels/BUILD
index 5e1b01878b74f2fbc2e84f8c2db1fa37c2c1eb0e..e9be6f8476d8a0e67aa7bd90db1a10466aa8e781 100644
--- a/tensorflow/compiler/tf2xla/kernels/BUILD
+++ b/tensorflow/compiler/tf2xla/kernels/BUILD
@@ -31,6 +31,7 @@ tf_kernel_library(
         "diag_op.cc",
         "dynamic_stitch_op.cc",
         "elu_op.cc",
+        "extract_image_patches_op.cc",
         "fft_ops.cc",
         "fill_op.cc",
         "function_ops.cc",
@@ -43,6 +44,9 @@ tf_kernel_library(
         "l2loss_op.cc",
         "lrn_ops.cc",
         "matmul_op.cc",
+        "matrix_band_part_op.cc",
+        "matrix_set_diag_op.cc",
+        "matrix_triangular_solve_op.cc",
         "mirror_pad_op.cc",
         "no_op.cc",
         "one_hot_op.cc",
@@ -58,6 +62,7 @@ tf_kernel_library(
         "reshape_op.cc",
         "retval_op.cc",
         "reverse_op.cc",
+        "reverse_sequence_op.cc",
         "scan_ops.cc",
         "segment_reduction_ops.cc",
         "select_op.cc",
@@ -92,6 +97,7 @@ tf_kernel_library(
         "//tensorflow/compiler/tf2xla:xla_compiler",
         "//tensorflow/compiler/tf2xla/lib:batch_dot",
         "//tensorflow/compiler/tf2xla/lib:cholesky",
+        "//tensorflow/compiler/tf2xla/lib:triangular_solve",
         "//tensorflow/compiler/tf2xla/lib:util",
         "//tensorflow/compiler/tf2xla/ops:sendrecv_ops",
         "//tensorflow/compiler/xla:array4d",
diff --git a/tensorflow/compiler/tf2xla/kernels/batch_matmul_op.cc b/tensorflow/compiler/tf2xla/kernels/batch_matmul_op.cc
index a015b8e0e8949f8aaa03a78b0f88b7ea8d6aaa1c..b0ba25b9983c3a9af26728ce4b1c263c844327db 100644
--- a/tensorflow/compiler/tf2xla/kernels/batch_matmul_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/batch_matmul_op.cc
@@ -28,8 +28,9 @@ class BatchMatMulOp : public XlaOpKernel {
   }
 
   void Compile(XlaOpKernelContext* ctx) override {
-    auto result =
-        BatchDot(ctx->builder(), ctx->Input(0), ctx->Input(1), adj_x_, adj_y_);
+    auto result = BatchDot(ctx->builder(), ctx->Input(0), ctx->Input(1),
+                           /*transpose_x=*/adj_x_, /*transpose_y=*/adj_y_,
+                           /*conjugate_x=*/adj_x_, /*conjugate_y=*/adj_y_);
     OP_REQUIRES_OK(ctx, result.status());
     ctx->SetOutput(0, result.ValueOrDie());
   }
diff --git a/tensorflow/compiler/tf2xla/kernels/cholesky_op.cc b/tensorflow/compiler/tf2xla/kernels/cholesky_op.cc
index 87d858f763560be454c162e0cf40307c68217663..fe6651793dc763d13f4a4b0ac294ec3ecf64af8f 100644
--- a/tensorflow/compiler/tf2xla/kernels/cholesky_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/cholesky_op.cc
@@ -33,7 +33,7 @@ class CholeskyOp : public XlaOpKernel {
   }
 };
 
-REGISTER_XLA_OP(Name("Cholesky"), CholeskyOp);
+REGISTER_XLA_OP(Name("Cholesky").TypeConstraint("T", kFloatTypes), CholeskyOp);
 
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc b/tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b2970eae20a3fb71f06619f476a49d41b22bca56
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc
@@ -0,0 +1,169 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/type_util.h"
+#include "tensorflow/compiler/tf2xla/xla_helpers.h"
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/core/util/tensor_format.h"
+
+namespace tensorflow {
+
+namespace {
+
+class ExtractImagePatchesOp : public XlaOpKernel {
+ public:
+  explicit ExtractImagePatchesOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("ksizes", &ksizes_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("strides", &strides_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("rates", &dilations_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("padding", &padding_));
+  }
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    const TensorFormat data_format = FORMAT_NHWC;
+    const int num_dims = ksizes_.size();
+
+    OP_REQUIRES(
+        ctx, num_dims >= 3,
+        errors::InvalidArgument("Kernel size must have at least 3 dimensions"));
+    const int num_spatial_dims = num_dims - 2;
+
+    OP_REQUIRES(ctx, strides_.size() == num_dims,
+                errors::InvalidArgument("Sliding window strides field must "
+                                        "specify ",
+                                        num_dims, " dimensions"));
+    OP_REQUIRES(ctx, dilations_.size() == num_dims,
+                errors::InvalidArgument("Dilations field must "
+                                        "specify ",
+                                        num_dims, " dimensions"));
+
+    int batch_dim = GetTensorBatchDimIndex(num_dims, data_format);
+    int feature_dim = GetTensorFeatureDimIndex(num_dims, data_format);
+    OP_REQUIRES(
+        ctx, ksizes_[batch_dim] == 1 && ksizes_[feature_dim] == 1,
+        errors::Unimplemented("Current implementation does not yet support "
+                              "kernel sizes > 1 in the batch and depth "
+                              "dimensions."));
+    OP_REQUIRES(
+        ctx, strides_[batch_dim] == 1 && strides_[feature_dim] == 1,
+        errors::Unimplemented("Current implementation does not yet support "
+                              "strides in the batch and depth dimensions."));
+    OP_REQUIRES(
+        ctx, dilations_[batch_dim] == 1 && dilations_[feature_dim] == 1,
+        errors::Unimplemented("Current implementation does not support "
+                              "dilations in the batch and depth dimensions."));
+
+    for (int i = 0; i < num_spatial_dims; ++i) {
+      int input_dim = GetTensorSpatialDimIndex(num_dims, data_format, i);
+      OP_REQUIRES(
+          ctx, ksizes_[input_dim] >= 0,
+          errors::Unimplemented("Kernel size values must be non-negative; ", i,
+                                "th spatial dimension had dilation ",
+                                dilations_[input_dim]));
+      OP_REQUIRES(ctx, strides_[input_dim] >= 1,
+                  errors::Unimplemented("Stride values must be positive; ", i,
+                                        "th spatial dimension had dilation ",
+                                        dilations_[input_dim]));
+      OP_REQUIRES(ctx, dilations_[input_dim] >= 1,
+                  errors::Unimplemented("Dilation values must be positive; ", i,
+                                        "th spatial dimension had dilation ",
+                                        dilations_[input_dim]));
+    }
+
+    xla::PrimitiveType type;
+    OP_REQUIRES_OK(ctx, DataTypeToPrimitiveType(ctx->input_type(0), &type));
+
+    const TensorShape input_shape = ctx->InputShape(0);
+    OP_REQUIRES(
+        ctx, input_shape.dims() == num_dims,
+        errors::InvalidArgument("input must be ", num_dims, "-dimensional",
+                                input_shape.DebugString()));
+    const int64 depth = input_shape.dim_size(feature_dim);
+
+    xla::ComputationBuilder* builder = ctx->builder();
+
+    // The following code is equivalent to:
+    // eye = np.eye(kH * kW * D).reshape([kH, kW, D, kH * kW * kD])
+    int64 kernel_size = 1;
+    std::vector<int64> lhs_shape(num_dims, 1);
+    for (int i = 0; i < num_spatial_dims; ++i) {
+      int input_dim = GetTensorSpatialDimIndex(num_dims, data_format, i);
+      lhs_shape[i] = ksizes_[input_dim];
+      kernel_size *= ksizes_[input_dim];
+    }
+    lhs_shape[num_spatial_dims] = depth;
+    lhs_shape[num_spatial_dims + 1] = 1;
+
+    // Builds an identity matrix as a broadcast equality of iotas.
+    // iota = np.arange(np.prod(ksize), depth)
+    // filter = np.equal(np.reshape(iota, [-1, 1]), iota).astype(np.float32)
+    xla::ComputationDataHandle iota;
+    TF_CHECK_OK(XlaHelpers::Iota(builder, DataType::DT_INT32,
+                                 kernel_size * depth, &iota));
+
+    auto lhs = builder->Reshape(iota, lhs_shape);
+    auto filter = builder->ConvertElementType(
+        builder->Eq(lhs, iota, {num_spatial_dims + 1}), type);
+
+    xla::ConvolutionDimensionNumbers dims;
+    std::vector<int64> window_strides(num_spatial_dims);
+    std::vector<int64> lhs_dilation(num_spatial_dims, 1);
+    std::vector<int64> rhs_dilation(num_spatial_dims);
+    std::vector<std::pair<int64, int64>> padding(num_spatial_dims);
+
+    dims.set_input_batch_dimension(batch_dim);
+    dims.set_output_batch_dimension(batch_dim);
+    dims.set_input_feature_dimension(feature_dim);
+    dims.set_output_feature_dimension(feature_dim);
+    dims.set_kernel_input_feature_dimension(num_spatial_dims);
+    dims.set_kernel_output_feature_dimension(num_spatial_dims + 1);
+
+    for (int i = 0; i < num_spatial_dims; ++i) {
+      const int64 dim = GetTensorSpatialDimIndex(num_dims, data_format, i);
+      dims.add_input_spatial_dimensions(dim);
+      dims.add_kernel_spatial_dimensions(i);
+      dims.add_output_spatial_dimensions(dim);
+      window_strides[i] = strides_.at(dim);
+      rhs_dilation[i] = dilations_.at(dim);
+
+      int64 unused_output_size;
+      OP_REQUIRES_OK(
+          ctx, GetWindowedOutputSizeVerboseV2(
+                   input_shape.dim_size(dim), ksizes_[dim], rhs_dilation[i],
+                   window_strides[i], padding_, &unused_output_size,
+                   &padding[i].first, &padding[i].second));
+    }
+
+    xla::ComputationDataHandle conv =
+        builder->ConvGeneralDilated(ctx->Input(0), filter, window_strides,
+                                    padding, lhs_dilation, rhs_dilation, dims);
+    ctx->SetOutput(0, conv);
+  }
+
+ protected:
+  std::vector<int32> ksizes_;
+  std::vector<int32> dilations_;
+  std::vector<int32> strides_;
+  Padding padding_;
+
+ private:
+  TF_DISALLOW_COPY_AND_ASSIGN(ExtractImagePatchesOp);
+};
+
+REGISTER_XLA_OP(Name("ExtractImagePatches"), ExtractImagePatchesOp);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/matrix_band_part_op.cc b/tensorflow/compiler/tf2xla/kernels/matrix_band_part_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..faa415a97b053b4b11d015fefcd430210b98118a
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/matrix_band_part_op.cc
@@ -0,0 +1,98 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/xla_helpers.h"
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+
+namespace tensorflow {
+namespace {
+
+class MatrixBandPartOp : public XlaOpKernel {
+ public:
+  explicit MatrixBandPartOp(OpKernelConstruction* context)
+      : XlaOpKernel(context) {}
+
+  void Compile(XlaOpKernelContext* context) override {
+    const TensorShape input_shape = context->InputShape(0);
+    // Preliminary validation of sizes.
+    OP_REQUIRES(context, TensorShapeUtils::IsMatrixOrHigher(input_shape),
+                errors::InvalidArgument(
+                    "input must be at least 2-dim, received shape: ",
+                    input_shape.DebugString()));
+
+    const TensorShape num_lower_in_shape = context->InputShape(1);
+    OP_REQUIRES(context, TensorShapeUtils::IsScalar(num_lower_in_shape),
+                errors::InvalidArgument("num_lower must be scalar, got shape ",
+                                        num_lower_in_shape.DebugString()));
+
+    const TensorShape num_upper_in_shape = context->InputShape(2);
+    OP_REQUIRES(context, TensorShapeUtils::IsScalar(num_upper_in_shape),
+                errors::InvalidArgument("num_upper must be scalar, got shape ",
+                                        num_upper_in_shape.DebugString()));
+
+    xla::ComputationBuilder* builder = context->builder();
+    xla::ComputationDataHandle input = context->Input(0);
+    xla::ComputationDataHandle num_lower = context->Input(1);
+    xla::ComputationDataHandle num_upper = context->Input(2);
+    DataType input_type = context->input_type(0);
+    DataType index_type = context->input_type(1);
+
+    TensorShape batch_shape = input_shape;
+    batch_shape.RemoveLastDims(2);
+    const int64 m = input_shape.dim_size(input_shape.dims() - 2);
+    const int64 n = input_shape.dim_size(input_shape.dims() - 1);
+
+    // Compute 'offset', which is how many diagonals we are above/below the
+    // diagonal.
+    xla::ComputationDataHandle iota_m;
+    OP_REQUIRES_OK(context, XlaHelpers::Iota(builder, index_type, m, &iota_m));
+
+    xla::ComputationDataHandle iota_n;
+    OP_REQUIRES_OK(context, XlaHelpers::Iota(builder, index_type, n, &iota_n));
+
+    auto offset = builder->Sub(builder->Broadcast(iota_n, {m}), iota_m,
+                               /*broadcast_dimensions=*/{0});
+
+    // If num_lower or num_upper are negative, include all lower/upper
+    // diagonals.
+    auto zero_index = XlaHelpers::Zero(builder, index_type);
+    num_lower = builder->Select(
+        builder->Lt(num_lower, zero_index),
+        XlaHelpers::IntegerLiteral(builder, index_type, m), num_lower);
+    num_upper = builder->Select(
+        builder->Lt(num_upper, zero_index),
+        XlaHelpers::IntegerLiteral(builder, index_type, n), num_upper);
+
+    auto indicator = builder->And(builder->Le(builder->Neg(num_lower), offset),
+                                  builder->Le(offset, num_upper));
+    indicator = builder->Broadcast(indicator, batch_shape.dim_sizes());
+
+    auto zero_input = XlaHelpers::Zero(builder, input_type);
+    auto output = builder->Select(
+        indicator, input,
+        builder->Broadcast(zero_input, input_shape.dim_sizes()));
+
+    context->SetOutput(0, output);
+  }
+
+ private:
+  TF_DISALLOW_COPY_AND_ASSIGN(MatrixBandPartOp);
+};
+REGISTER_XLA_OP(Name("MatrixBandPart"), MatrixBandPartOp);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/matrix_set_diag_op.cc b/tensorflow/compiler/tf2xla/kernels/matrix_set_diag_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b2940bdcff75a087c914fdad0cb2426276e41aff
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/matrix_set_diag_op.cc
@@ -0,0 +1,93 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/xla_helpers.h"
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+
+namespace tensorflow {
+
+class MatrixSetDiagOp : public XlaOpKernel {
+ public:
+  explicit MatrixSetDiagOp(OpKernelConstruction* context)
+      : XlaOpKernel(context) {}
+
+  void Compile(XlaOpKernelContext* context) override {
+    const TensorShape input_shape = context->InputShape(0);
+    const TensorShape diag_shape = context->InputShape(1);
+
+    const int rank = input_shape.dims();
+
+    // Preliminary validation of sizes.
+    OP_REQUIRES(context, TensorShapeUtils::IsMatrixOrHigher(input_shape),
+                errors::InvalidArgument(
+                    "input must be at least 2-dim, received shape: ",
+                    input_shape.DebugString()));
+
+    // Check to make sure the last dimension of diag is equal to the smaller of
+    // the last two dimensions of input.
+    const int64 m = input_shape.dim_size(rank - 2);
+    const int64 n = input_shape.dim_size(rank - 1);
+    const int64 min_dim = std::min(m, n);
+
+    TensorShape batch_shape = input_shape;
+    batch_shape.RemoveLastDims(2);
+
+    TensorShape expected_diag_shape = batch_shape;
+    expected_diag_shape.AddDim(min_dim);
+    OP_REQUIRES(context, expected_diag_shape == diag_shape,
+                errors::InvalidArgument(
+                    "must have diagonal.shape == input.shape[:-2] + "
+                    "min(input.shape[-2:]), but received input shape: ",
+                    input_shape.DebugString(),
+                    " and diagonal shape: ", diag_shape.DebugString()));
+
+    xla::ComputationBuilder* builder = context->builder();
+    xla::ComputationDataHandle input = context->Input(0);
+    xla::ComputationDataHandle diag = context->Input(1);
+
+    auto zero = XlaHelpers::Zero(builder, context->input_type(0));
+
+    // Create an indicator tensor that is true only on the diagonal.
+    xla::ComputationDataHandle iota_m;
+    OP_REQUIRES_OK(context, XlaHelpers::Iota(builder, DT_INT32, m, &iota_m));
+    xla::ComputationDataHandle iota_n;
+    OP_REQUIRES_OK(context, XlaHelpers::Iota(builder, DT_INT32, n, &iota_n));
+    auto indicator = builder->Eq(iota_m,
+                                 builder->Broadcast(iota_n, {m}),
+                                 /*broadcast_dimensions=*/{0});
+    indicator = builder->Broadcast(indicator, batch_shape.dim_sizes());
+
+    // Broadcast diag up to the input shape. Use an implicit broadcast (Add)
+    // because we need to broadcast on the right.
+    std::vector<int64> diag_broadcast_dims(rank - 1);
+    std::iota(diag_broadcast_dims.begin(), diag_broadcast_dims.end(), 0);
+    if (min_dim != m) {
+      diag_broadcast_dims.back() = rank - 1;
+    }
+    diag = builder->Add(diag, builder->Broadcast(zero, input_shape.dim_sizes()),
+                        /*broadcast_dimensions=*/diag_broadcast_dims);
+
+    auto output = builder->Select(indicator, diag, input);
+    context->SetOutput(0, output);
+  }
+
+ private:
+  TF_DISALLOW_COPY_AND_ASSIGN(MatrixSetDiagOp);
+};
+
+REGISTER_XLA_OP(Name("MatrixSetDiag"), MatrixSetDiagOp);
+
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/matrix_triangular_solve_op.cc b/tensorflow/compiler/tf2xla/kernels/matrix_triangular_solve_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..eaed93146460de5a6e8328432302cc75bf36a534
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/matrix_triangular_solve_op.cc
@@ -0,0 +1,50 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/lib/triangular_solve.h"
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+
+namespace tensorflow {
+namespace {
+
+class MatrixTriangularSolveOp : public XlaOpKernel {
+ public:
+  explicit MatrixTriangularSolveOp(OpKernelConstruction* ctx)
+      : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("lower", &lower_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("adjoint", &adjoint_));
+  }
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    auto result = TriangularSolve(
+        ctx->builder(), ctx->Input(0), ctx->Input(1), /*left_side=*/true,
+        /*lower=*/lower_, /*transpose_a=*/adjoint_, /*conjugate_a=*/adjoint_);
+    if (!result.ok()) {
+      ctx->SetStatus(result.status());
+      return;
+    }
+    ctx->SetOutput(0, result.ValueOrDie());
+  }
+
+ private:
+  bool lower_;
+  bool adjoint_;
+};
+
+REGISTER_XLA_OP(Name("MatrixTriangularSolve"), MatrixTriangularSolveOp);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc b/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc
index 0b5a38967aeb5b4cd66de5220e2c764371440c2d..d4fb5dd4e06c7c70591262c0d63a91c383a2a6e0 100644
--- a/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc
@@ -37,21 +37,23 @@ class PoolingOp : public XlaOpKernel {
  public:
   PoolingOp(OpKernelConstruction* ctx, int num_spatial_dims)
       : XlaOpKernel(ctx), num_spatial_dims_(num_spatial_dims) {
-    std::vector<int32> ksize_int;
-    std::vector<int32> stride_int;
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("ksize", &ksize_int));
-    OP_REQUIRES(ctx, ksize_int.size() == num_dims(),
-                errors::InvalidArgument("Sliding window ksize field must "
-                                        "specify ",
-                                        num_dims(), " dimensions"));
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("strides", &stride_int));
-    OP_REQUIRES(ctx, stride_int.size() == num_dims(),
-                errors::InvalidArgument("Sliding window stride field must "
-                                        "specify ",
-                                        num_dims(), " dimensions"));
-    for (int i = 0; i < num_dims(); ++i) {
-      ksize_.push_back(ksize_int[i]);
-      stride_.push_back(stride_int[i]);
+    if (ctx->num_inputs() == 1) {
+      std::vector<int32> ksize_int;
+      std::vector<int32> stride_int;
+      OP_REQUIRES_OK(ctx, ctx->GetAttr("ksize", &ksize_int));
+      OP_REQUIRES(ctx, ksize_int.size() == num_dims(),
+                  errors::InvalidArgument("Sliding window ksize field must "
+                                          "specify ",
+                                          num_dims(), " dimensions"));
+      OP_REQUIRES_OK(ctx, ctx->GetAttr("strides", &stride_int));
+      OP_REQUIRES(ctx, stride_int.size() == num_dims(),
+                  errors::InvalidArgument("Sliding window stride field must "
+                                          "specify ",
+                                          num_dims(), " dimensions"));
+      for (int i = 0; i < num_dims(); ++i) {
+        ksize_.push_back(ksize_int[i]);
+        stride_.push_back(stride_int[i]);
+      }
     }
     Padding padding;
     OP_REQUIRES_OK(ctx, ctx->GetAttr("padding", &padding));
@@ -77,6 +79,33 @@ class PoolingOp : public XlaOpKernel {
     xla::ComputationDataHandle input = ctx->Input(0);
     const TensorShape input_shape = ctx->InputShape(0);
 
+    std::vector<int64> ksize = ksize_;
+    std::vector<int64> stride = stride_;
+    if (ctx->num_inputs() != 1) {
+      const TensorShape ksize_shape = ctx->InputShape(1);
+      // Validate input sizes.
+      OP_REQUIRES(ctx, TensorShapeUtils::IsVector(ksize_shape),
+                  errors::InvalidArgument("ksize must be a vector, not shape ",
+                                          ksize_shape.DebugString()));
+      OP_REQUIRES(ctx, ksize_shape.num_elements() == num_dims(),
+                  errors::InvalidArgument("Sliding window ksize field must "
+                                          "specify ",
+                                          num_dims(), " dimensions"));
+      ksize.clear();
+      OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(1, &ksize));
+
+      const TensorShape stride_shape = ctx->InputShape(2);
+      // Validate input sizes.
+      OP_REQUIRES(ctx, TensorShapeUtils::IsVector(stride_shape),
+                  errors::InvalidArgument("stride must be a vector, not shape ",
+                                          stride_shape.DebugString()));
+      OP_REQUIRES(ctx, stride_shape.num_elements() == num_dims(),
+                  errors::InvalidArgument("Sliding window stride field must "
+                                          "specify ",
+                                          num_dims(), " dimensions"));
+      stride.clear();
+      OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(2, &stride));
+    }
     OP_REQUIRES(ctx, input_shape.dims() == num_dims(),
                 errors::InvalidArgument("Input to ", type_string(),
                                         " operator must have ", num_dims(),
@@ -84,8 +113,8 @@ class PoolingOp : public XlaOpKernel {
 
     const DataType type = input_type(0);
     xla::ComputationDataHandle pooled = ctx->builder()->ReduceWindow(
-        input, InitValue(ctx->builder(), type), *Reduction(ctx, type), ksize_,
-        stride_, padding_);
+        input, InitValue(ctx->builder(), type), *Reduction(ctx, type), ksize,
+        stride, padding_);
     ctx->SetOutput(0, PostProcessOutput(ctx, pooled, type, input_shape));
   }
 
@@ -130,6 +159,10 @@ class MaxPool2DOp : public MaxPoolOp {
   }
 };
 REGISTER_XLA_OP(Name("MaxPool"), MaxPool2DOp);
+REGISTER_XLA_OP(Name("MaxPoolV2")
+                    .CompileTimeConstInput("ksize")
+                    .CompileTimeConstInput("strides"),
+                MaxPool2DOp);
 
 class MaxPool3DOp : public MaxPoolOp {
  public:
@@ -243,22 +276,44 @@ class MaxPoolGradOp : public XlaOpKernel {
  public:
   MaxPoolGradOp(OpKernelConstruction* ctx, int num_spatial_dims)
       : XlaOpKernel(ctx), num_spatial_dims_(num_spatial_dims) {
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("ksize", &ksize_));
+    if (ctx->num_inputs() == 3) {
+      OP_REQUIRES_OK(ctx, ctx->GetAttr("ksize", &ksize_));
+      OP_REQUIRES_OK(ctx, ctx->GetAttr("strides", &stride_));
+    }
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("padding", &padding_));
+  }
+
+  int num_dims() const { return num_spatial_dims_ + 2; }
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    if (ctx->num_inputs() != 3) {
+      OP_REQUIRES(
+          ctx, ctx->num_inputs() == 5,
+          errors::InvalidArgument("Must supply ksize and stride arguments."));
+      const TensorShape ksize_shape = ctx->InputShape(3);
+      // Validate input sizes.
+      OP_REQUIRES(ctx, TensorShapeUtils::IsVector(ksize_shape),
+                  errors::InvalidArgument("ksize must be a vector, not shape ",
+                                          ksize_shape.DebugString()));
+      OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(3, &ksize_));
+
+      const TensorShape stride_shape = ctx->InputShape(4);
+      // Validate input sizes.
+      OP_REQUIRES(ctx, TensorShapeUtils::IsVector(stride_shape),
+                  errors::InvalidArgument("stride must be a vector, not shape ",
+                                          stride_shape.DebugString()));
+      OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(4, &stride_));
+    }
+
     OP_REQUIRES(ctx, ksize_.size() == num_dims(),
                 errors::InvalidArgument("Sliding window ksize field must "
                                         "specify ",
                                         num_dims(), " dimensions"));
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("strides", &stride_));
     OP_REQUIRES(ctx, stride_.size() == num_dims(),
                 errors::InvalidArgument("Sliding window strides field must "
                                         "specify ",
                                         num_dims(), " dimensions"));
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("padding", &padding_));
-  }
 
-  int num_dims() const { return num_spatial_dims_ + 2; }
-
-  void Compile(XlaOpKernelContext* ctx) override {
     const TensorShape tensor_in_shape = ctx->InputShape(0);
     const TensorShape tensor_out_shape = ctx->InputShape(1);
     const TensorShape out_backprop_shape = ctx->InputShape(2);
@@ -315,6 +370,10 @@ class MaxPool2DGradOp : public MaxPoolGradOp {
   }
 };
 REGISTER_XLA_OP(Name("MaxPoolGrad"), MaxPool2DGradOp);
+REGISTER_XLA_OP(Name("MaxPoolGradV2")
+                    .CompileTimeConstInput("ksize")
+                    .CompileTimeConstInput("strides"),
+                MaxPool2DGradOp);
 
 class MaxPool3DGradOp : public MaxPoolGradOp {
  public:
diff --git a/tensorflow/compiler/tf2xla/kernels/reverse_sequence_op.cc b/tensorflow/compiler/tf2xla/kernels/reverse_sequence_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6bc5d3adb091cd238974c5b69b7a2f8fe639cc68
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/reverse_sequence_op.cc
@@ -0,0 +1,182 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/shape_util.h"
+#include "tensorflow/compiler/tf2xla/xla_helpers.h"
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+
+namespace tensorflow {
+namespace {
+
+class ReverseSequenceOp : public XlaOpKernel {
+ public:
+  explicit ReverseSequenceOp(OpKernelConstruction* context)
+      : XlaOpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("batch_dim", &batch_dim_));
+    OP_REQUIRES_OK(context, context->GetAttr("seq_dim", &seq_dim_));
+  }
+
+  void Compile(XlaOpKernelContext* context) override {
+    const TensorShape input_shape = context->InputShape(0);
+    const TensorShape seq_lens_shape = context->InputShape(1);
+
+    OP_REQUIRES(context, TensorShapeUtils::IsVector(seq_lens_shape),
+                errors::InvalidArgument("seq_lens input must be 1-dim, not ",
+                                        seq_lens_shape.dims()));
+    OP_REQUIRES(context, batch_dim_ != seq_dim_,
+                errors::InvalidArgument("batch_dim == seq_dim == ", seq_dim_));
+    OP_REQUIRES(
+        context, seq_dim_ < input_shape.dims(),
+        errors::InvalidArgument("seq_dim must be < input.dims()", "( ",
+                                seq_dim_, " vs. ", input_shape.dims(), ")"));
+    OP_REQUIRES(
+        context, batch_dim_ < input_shape.dims(),
+        errors::InvalidArgument("batch_dim must be < input.dims()", "( ",
+                                batch_dim_, " vs. ", input_shape.dims(), ")"));
+    OP_REQUIRES(
+        context,
+        seq_lens_shape.num_elements() == input_shape.dim_size(batch_dim_),
+        errors::InvalidArgument("len(seq_lens) != input.dims(", batch_dim_,
+                                "), ", "(", seq_lens_shape.num_elements(),
+                                " vs. ", input_shape.dim_size(batch_dim_)));
+
+    xla::ComputationBuilder* builder = context->builder();
+    const auto input = context->Input(0);
+    const auto seq_lens = context->Input(1);
+
+    const int64 batch_size = input_shape.dim_size(batch_dim_);
+
+    const DataType input_type = context->input_type(0);
+    const DataType seq_lens_type = context->input_type(1);
+    const int64 max_seq_len = input_shape.dim_size(seq_dim_);
+
+    xla::Shape input_xla_shape;
+    OP_REQUIRES_OK(context, TensorShapeToXLAShape(input_type, input_shape,
+                                                  &input_xla_shape));
+    xla::Shape seq_lens_xla_shape;
+    OP_REQUIRES_OK(context, TensorShapeToXLAShape(seq_lens_type, seq_lens_shape,
+                                                  &seq_lens_xla_shape));
+
+    const auto tuple_shape = xla::ShapeUtil::MakeTupleShape({
+        xla::ShapeUtil::MakeShape(seq_lens_xla_shape.element_type(), {}),
+        seq_lens_xla_shape,
+        input_xla_shape,
+    });
+
+    // For each entry in the batch, reverse the sequence.
+    // TODO(b/65689298): generalize the Map() operator to non-scalar cases and
+    // use it here, instead of a While loop.
+
+    // Condition: lambda (i, _, _): i < batch_size
+    auto condition_builder =
+        builder->CreateSubBuilder("reverse_sequence_condition");
+    {
+      auto param = condition_builder->Parameter(0, tuple_shape, "param");
+      auto i = condition_builder->GetTupleElement(param, 0);
+      condition_builder->Lt(
+          i, XlaHelpers::IntegerLiteral(condition_builder.get(), seq_lens_type,
+                                        batch_size));
+    }
+    auto condition = condition_builder->Build();
+    OP_REQUIRES_OK(context, condition.status());
+
+    auto body_builder = builder->CreateSubBuilder("reverse_sequence_body");
+    {
+      auto param = body_builder->Parameter(0, tuple_shape, "param");
+      auto i = body_builder->GetTupleElement(param, 0);
+      auto seq_lens = body_builder->GetTupleElement(param, 1);
+      auto output = body_builder->GetTupleElement(param, 2);
+
+      // seq_len is the sequence length of the current batch element (rank 1)
+      auto seq_len = body_builder->DynamicSlice(
+          seq_lens, body_builder->Reshape(i, {1}), {1});
+
+      // Indices is the offset of the batch element in the input.
+      auto indices = body_builder->Broadcast(
+          XlaHelpers::Zero(body_builder.get(), seq_lens_type),
+          {input_shape.dims()});
+      indices = body_builder->DynamicUpdateSlice(
+          indices, body_builder->Reshape(i, {1}),
+          body_builder->Reshape(
+              XlaHelpers::IntegerLiteral(body_builder.get(), seq_lens_type,
+                                         batch_dim_),
+              {1}));
+
+      // slice_indices is the offset of the start of the reversed sequence in
+      // the input.
+      auto slice_indices = body_builder->DynamicUpdateSlice(
+          indices,
+          body_builder->Sub(XlaHelpers::IntegerLiteral(
+                                body_builder.get(), seq_lens_type, max_seq_len),
+                            seq_len),
+          body_builder->Reshape(
+              XlaHelpers::IntegerLiteral(body_builder.get(), seq_lens_type,
+                                         seq_dim_),
+              {1}));
+
+      // Slice out the reversed sequence. The slice will overflow the end of the
+      // sequence, and the contents of the overflow are implementation-defined.
+      // However, we will mask off these elements and replace them with elements
+      // from the original input so their values do not matter.
+      TensorShape slice_shape = input_shape;
+      slice_shape.set_dim(batch_dim_, 1);
+      auto slice = body_builder->DynamicSlice(output, slice_indices,
+                                              slice_shape.dim_sizes());
+
+      // Shift the reversed sequence to the left.
+      output = body_builder->DynamicUpdateSlice(output, slice, indices);
+
+      body_builder->Tuple(
+          {body_builder->Add(
+               i, XlaHelpers::One(body_builder.get(), seq_lens_type)),
+           seq_lens, output});
+    }
+    auto body = body_builder->Build();
+    OP_REQUIRES_OK(context, body.status());
+
+    auto loop_output = builder->While(
+        condition.ValueOrDie(), body.ValueOrDie(),
+        builder->Tuple({XlaHelpers::Zero(builder, seq_lens_type), seq_lens,
+                        builder->Rev(input, {seq_dim_})}));
+    auto output = builder->GetTupleElement(loop_output, 2);
+
+    // Mask out elements after the sequence length.
+    xla::ComputationDataHandle iota;
+    OP_REQUIRES_OK(
+        context, XlaHelpers::Iota(builder, seq_lens_type, max_seq_len, &iota));
+    std::vector<int64> dims(input_shape.dims(), 1);
+    dims[batch_dim_] = batch_size;
+    auto mask = builder->Lt(iota, builder->Reshape(seq_lens, dims), {seq_dim_});
+
+    // Broadcast the mask up to the input shape.
+    mask =
+        builder->Or(mask, builder->Broadcast(builder->ConstantR0<bool>(false),
+                                             input_shape.dim_sizes()));
+
+    output = builder->Select(mask, output, input);
+    context->SetOutput(0, output);
+  }
+
+ private:
+  int32 batch_dim_;
+  int32 seq_dim_;
+};
+
+REGISTER_XLA_OP(Name("ReverseSequence"), ReverseSequenceOp);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/unary_ops.cc b/tensorflow/compiler/tf2xla/kernels/unary_ops.cc
index a266e9013c41b88788dbc99849f01c09f3d61348..0c5ad9e5255ffc3dfcfb83335060ae833937b3ce 100644
--- a/tensorflow/compiler/tf2xla/kernels/unary_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/unary_ops.cc
@@ -50,18 +50,41 @@ XLAJIT_MAKE_UNARY(Conj, b->Conj(x));
 // Return x if x>0, otherwise -x.
 XLAJIT_MAKE_UNARY(Abs, b->Abs(x));
 
+// acos(x) = 2 * atan(sqrt(1 - x^2) / (1 + x))
+XLAJIT_MAKE_UNARY(
+    Acos,
+    b->Mul(XlaHelpers::FloatLiteral(b, input_type(0), 2.0),
+           b->Atan2(b->Pow(b->Sub(XlaHelpers::One(b, input_type(0)),
+                                  b->Mul(x, x)),
+                           XlaHelpers::FloatLiteral(b, input_type(0), 0.5)),
+                    b->Add(XlaHelpers::One(b, input_type(0)), x))));
+
 // acosh(x) = log(x + sqrt(x^2 - 1))
 XLAJIT_MAKE_UNARY(
     Acosh,
     b->Log(b->Add(x, b->Pow(b->Sub(b->Mul(x, x),
                                    XlaHelpers::One(b, input_type(0))),
                             XlaHelpers::FloatLiteral(b, input_type(0), 0.5)))));
+
+// asin(x) = 2 * atan(x / (1 + sqrt(1 - x^2)))
+XLAJIT_MAKE_UNARY(
+    Asin,
+    b->Mul(XlaHelpers::FloatLiteral(b, input_type(0), 2.0),
+           b->Atan2(x, b->Add(XlaHelpers::One(b, input_type(0)),
+                              b->Pow(b->Sub(XlaHelpers::One(b, input_type(0)),
+                                            b->Mul(x, x)),
+                                     XlaHelpers::FloatLiteral(b, input_type(0),
+                                                              0.5))))));
+
 // asinh(x) = log(x + sqrt(x^2 + 1))
 XLAJIT_MAKE_UNARY(
     Asinh,
     b->Log(b->Add(x, b->Pow(b->Add(b->Mul(x, x),
                                    XlaHelpers::One(b, input_type(0))),
                             XlaHelpers::FloatLiteral(b, input_type(0), 0.5)))));
+
+XLAJIT_MAKE_UNARY(Atan, b->Atan2(x, XlaHelpers::One(b, input_type(0))));
+
 // atanh(x) = 0.5 * log((1 + x) / (1 - x))
 XLAJIT_MAKE_UNARY(
     Atanh, b->Mul(b->Log(b->Div(b->Add(XlaHelpers::One(b, input_type(0)), x),
diff --git a/tensorflow/compiler/tf2xla/lib/BUILD b/tensorflow/compiler/tf2xla/lib/BUILD
index 21ad21f73737a289390ed1ea767db1078d05b466..d184f59e01788829d0ba97092c14d36e5188e4e8 100644
--- a/tensorflow/compiler/tf2xla/lib/BUILD
+++ b/tensorflow/compiler/tf2xla/lib/BUILD
@@ -60,6 +60,8 @@ cc_library(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla/client:computation",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/core:lib",
diff --git a/tensorflow/compiler/tf2xla/lib/batch_dot.cc b/tensorflow/compiler/tf2xla/lib/batch_dot.cc
index 9b0e6174475c22e325c090bec5f1d56822e106bc..798f0fa78055e800038e8bf41b4f410b670be7dd 100644
--- a/tensorflow/compiler/tf2xla/lib/batch_dot.cc
+++ b/tensorflow/compiler/tf2xla/lib/batch_dot.cc
@@ -25,11 +25,10 @@ limitations under the License.
 
 namespace tensorflow {
 
-// The current implementation simply unrolls the computation along the batch
-// dimension.
 xla::StatusOr<xla::ComputationDataHandle> BatchDot(
     xla::ComputationBuilder* builder, xla::ComputationDataHandle x,
-    xla::ComputationDataHandle y, bool transpose_x, bool transpose_y) {
+    xla::ComputationDataHandle y, bool transpose_x, bool transpose_y,
+    bool conjugate_x, bool conjugate_y) {
   TF_ASSIGN_OR_RETURN(std::unique_ptr<xla::Shape> x_shape,
                       builder->GetShape(x));
   TF_ASSIGN_OR_RETURN(std::unique_ptr<xla::Shape> y_shape,
@@ -89,10 +88,10 @@ xla::StatusOr<xla::ComputationDataHandle> BatchDot(
         dimensions);
   }
 
-  if (x_shape->element_type() == xla::C64 && transpose_x) {
+  if (x_shape->element_type() == xla::C64 && conjugate_x) {
     x = builder->Conj(x);
   }
-  if (y_shape->element_type() == xla::C64 && transpose_y) {
+  if (y_shape->element_type() == xla::C64 && conjugate_y) {
     y = builder->Conj(y);
   }
 
diff --git a/tensorflow/compiler/tf2xla/lib/batch_dot.h b/tensorflow/compiler/tf2xla/lib/batch_dot.h
index b46bc7417d29dc5b7e9649ac28cc78b57d4b619c..b230e885f10f45a78cdd6e455da3ba55ce589b96 100644
--- a/tensorflow/compiler/tf2xla/lib/batch_dot.h
+++ b/tensorflow/compiler/tf2xla/lib/batch_dot.h
@@ -27,7 +27,10 @@ namespace tensorflow {
 // viewed as an element of a batch), and arranges the individual results
 // in a single output tensor of the same batch size. Each of the
 // individual slices can optionally be transposed before multiplication by
-// setting the `transpose_x` or `transpose_y` flag to `true`.
+// setting the `transpose_x` or `transpose_y` flag to `true`. Similarly, each
+// can be elementwise-complex-conjugated by setting the `conjugate_x` or
+// `conjugate_y` flag to `true`. To apply a Hermitian adjoint to `x`, set both
+// `transpose_x` and `conjugate_x` to `true`, and analogously for `y`.
 //
 // The input tensors `x` and `y` are 2-D or higher with shape `[..., r_x, c_x]`
 // and `[..., r_y, c_y]`.
@@ -40,11 +43,10 @@ namespace tensorflow {
 // It is computed as:
 //
 //     output[..., :, :] = matrix(x[..., :, :]) * matrix(y[..., :, :])
-// TODO(phawkins): add an option to take the complex conjugate of the LHS or
-// RHS.
 xla::StatusOr<xla::ComputationDataHandle> BatchDot(
     xla::ComputationBuilder* builder, xla::ComputationDataHandle x,
-    xla::ComputationDataHandle y, bool transpose_x, bool transpose_y);
+    xla::ComputationDataHandle y, bool transpose_x, bool transpose_y,
+    bool conjugate_x = false, bool conjugate_y = false);
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/compiler/tf2xla/lib/cholesky.cc b/tensorflow/compiler/tf2xla/lib/cholesky.cc
index b3cc489adf6042acb3f56b3a0a6c8fbe43bde629..e795701181dd80a2ff544743d513bffd52fd2399 100644
--- a/tensorflow/compiler/tf2xla/lib/cholesky.cc
+++ b/tensorflow/compiler/tf2xla/lib/cholesky.cc
@@ -71,11 +71,14 @@ xla::StatusOr<xla::ComputationDataHandle> CholeskyUnblocked(
                           SliceInMinorDims(builder, l, {j + 1, 0}, {n, j}));
       TF_ASSIGN_OR_RETURN(auto r_squared,
                           BatchDot(builder, r, r, /*transpose_x=*/false,
-                                   /*transpose_y=*/true));
+                                   /*transpose_y=*/true, /*conjugate_x=*/false,
+                                   /*conjugate_y=*/false));
       new_d_squared = builder->Sub(new_d_squared, r_squared);
 
       TF_ASSIGN_OR_RETURN(br, BatchDot(builder, b, r, /*transpose_x=*/false,
-                                       /*transpose_y=*/true));
+                                       /*transpose_y=*/true,
+                                       /*conjugate_x=*/false,
+                                       /*conjugate_y=*/false));
     }
     auto new_d_inv = builder->Pow(
         new_d_squared, FloatLiteral(builder, shape->element_type(), -0.5));
@@ -134,7 +137,8 @@ xla::StatusOr<xla::ComputationDataHandle> Cholesky(
                           SliceInMinorDims(builder, l, {i, 0}, {i + k, i}));
       TF_ASSIGN_OR_RETURN(auto delta,
                           BatchDot(builder, lhs, rhs, /*transpose_x=*/false,
-                                   /*transpose_y=*/true));
+                                   /*transpose_y=*/true, /*conjugate_x=*/false,
+                                   /*conjugate_y=*/false));
       TF_ASSIGN_OR_RETURN(auto before,
                           SliceInMinorDims(builder, a, {i, i}, {n, i + k}));
       TF_ASSIGN_OR_RETURN(
@@ -155,6 +159,10 @@ xla::StatusOr<xla::ComputationDataHandle> Cholesky(
                           SliceInMinorDims(builder, a, {i + k, i}, {n, i + k}));
       TF_ASSIGN_OR_RETURN(auto update,
                           TriangularSolve(builder, factorized, panel,
+                                          /*left_side=*/false,
+                                          /*lower=*/true,
+                                          /*transpose_a=*/true,
+                                          /*conjugate_a=*/false,
                                           /*block_size=*/8));
       TF_ASSIGN_OR_RETURN(
           l, UpdateSliceInMinorDims(builder, l, update, {i + k, i}));
diff --git a/tensorflow/compiler/tf2xla/lib/cholesky.h b/tensorflow/compiler/tf2xla/lib/cholesky.h
index 2bead7359baaf3582c1230adf0cd4a90046859d2..e083a383be4be0d1b556b63214fe5f70323b4149 100644
--- a/tensorflow/compiler/tf2xla/lib/cholesky.h
+++ b/tensorflow/compiler/tf2xla/lib/cholesky.h
@@ -29,6 +29,7 @@ namespace tensorflow {
 // the block size to use.
 // TODO(phawkins): check for negative values on the diagonal and return an
 // error, instead of silently yielding NaNs.
+// TODO(mattjj): handle the complex Hermitian case
 xla::StatusOr<xla::ComputationDataHandle> Cholesky(
     xla::ComputationBuilder* builder, xla::ComputationDataHandle a,
     int64 block_size = 256);
diff --git a/tensorflow/compiler/tf2xla/lib/triangular_solve.cc b/tensorflow/compiler/tf2xla/lib/triangular_solve.cc
index 579944c3a381e7018b7fee5013d0509158ce21cc..7f72a6073df218b9e2bd4cc0c0b5bb10b5cd4b84 100644
--- a/tensorflow/compiler/tf2xla/lib/triangular_solve.cc
+++ b/tensorflow/compiler/tf2xla/lib/triangular_solve.cc
@@ -24,13 +24,15 @@ limitations under the License.
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/lib/core/errors.h"
 
 namespace tensorflow {
 
 xla::StatusOr<xla::ComputationDataHandle> TriangularSolve(
     xla::ComputationBuilder* builder, const xla::ComputationDataHandle& a,
-    xla::ComputationDataHandle b, int64 block_size) {
+    xla::ComputationDataHandle b, bool left_side, bool lower, bool transpose_a,
+    bool conjugate_a, int64 block_size) {
   TF_ASSIGN_OR_RETURN(std::unique_ptr<xla::Shape> a_shape,
                       builder->GetShape(a));
   TF_ASSIGN_OR_RETURN(std::unique_ptr<xla::Shape> b_shape,
@@ -60,14 +62,15 @@ xla::StatusOr<xla::ComputationDataHandle> TriangularSolve(
     batch_dimensions.push_back(a_size);
   }
 
-  const int64 n = xla::ShapeUtil::GetDimension(*a_shape, -1);
-  const int64 m = xla::ShapeUtil::GetDimension(*b_shape, -2);
-  if (n != xla::ShapeUtil::GetDimension(*a_shape, -2)) {
+  if (xla::ShapeUtil::GetDimension(*a_shape, -1) !=
+      xla::ShapeUtil::GetDimension(*a_shape, -2)) {
     return errors::InvalidArgument(
         "The 'a' arguments to TriangularSolve must be square matrices: ",
         xla::ShapeUtil::HumanString(*a_shape));
   }
-  if (n != xla::ShapeUtil::GetDimension(*b_shape, -1)) {
+  const int64 m = xla::ShapeUtil::GetDimension(*b_shape, -2);
+  const int64 n = xla::ShapeUtil::GetDimension(*b_shape, -1);
+  if ((left_side ? m : n) != xla::ShapeUtil::GetDimension(*a_shape, -1)) {
     return errors::InvalidArgument(
         "Arguments to TriangularSolve have incompatible matrix shapes: ",
         xla::ShapeUtil::HumanString(*a_shape), " vs ",
@@ -89,6 +92,14 @@ xla::StatusOr<xla::ComputationDataHandle> TriangularSolve(
     return output;
   };
 
+  // Applies a complex conjugation operation if `a` is complex and `conjugate_a`
+  // is true, otherwise returns its argument.
+  auto maybe_conj = [&](xla::ComputationBuilder* builder,
+                        xla::ComputationDataHandle x) {
+    auto perform_conj = a_shape->element_type() == xla::C64 && conjugate_a;
+    return perform_conj ? builder->Conj(x) : x;
+  };
+
   std::map<int, xla::Computation> base_computations;
   auto get_base_triangular_solve =
       [&](int k) -> xla::StatusOr<xla::Computation*> {
@@ -103,19 +114,35 @@ xla::StatusOr<xla::ComputationDataHandle> TriangularSolve(
                                                    prepend_batch_dims({k, k})),
                          "a");
 
+      std::array<int64, 2> b_lastd;
+      if (left_side) {
+        b_lastd = {k, n};
+      } else {
+        b_lastd = {m, k};
+      }
       auto b_param =
           sub->Parameter(1,
                          xla::ShapeUtil::MakeShape(b_shape->element_type(),
-                                                   prepend_batch_dims({m, k})),
+                                                   prepend_batch_dims(b_lastd)),
                          "b");
 
-      // TODO(phawkins): it might make sense to use a while loop here, rather
-      // than unrolling.
-      // TODO(phawkins): the left-looking variant of the algorithm might be more
-      // efficient at block size 1.
-      TF_RETURN_IF_ERROR(TriangularSolve(sub.get(), a_param, b_param,
-                                         /*block_size=*/1)
-                             .status());
+      // We use a left-looking subroutine on the block diagonal in some common
+      // cases, while falling back to a recursive call in unsupported cases. The
+      // left-looking subroutine is written with a While loop and so yields much
+      // faster compile times. Moreover, the left-looking variant can give
+      // higher performance on smaller (sub)problems.
+      if (left_side && lower) {
+        TF_RETURN_IF_ERROR(TriangularSolveLeftLooking(sub.get(), a_param,
+                                                      b_param, transpose_a,
+                                                      conjugate_a)
+                               .status());
+      } else {
+        TF_RETURN_IF_ERROR(TriangularSolve(sub.get(), a_param, b_param,
+                                           left_side, lower, transpose_a,
+                                           conjugate_a,
+                                           /*block_size=*/1)
+                               .status());
+      }
 
       TF_ASSIGN_OR_RETURN(computation, sub->Build());
     }
@@ -129,47 +156,396 @@ xla::StatusOr<xla::ComputationDataHandle> TriangularSolve(
   // Goto, Kazushige, and Robert Van De Geijn. "High-performance implementation
   // of the level-3 BLAS." ACM Transactions on Mathematical Software (TOMS) 35.1
   // (2008): 4.
-  for (int64 i = 0; i < n; i += block_size) {
-    int64 k = std::min(block_size, n - i);
 
-    // if k > 1:
-    //   output[..., :, i:i+k] = triangular_solve(
-    //       a[..., i:i+k, ..., i:i+k], b[..., :, i:i+k], side='Right',
-    //       kind='Lower', transpose=True, block_size=1)
-    // else:
-    //   output[..., :, i] = b[..., :, i] / a[..., i, i]
+  // In the code comments below, T = lambda x: np.swapaxes(x, -1, -2) if
+  // conjugate_a is False, or T = lambda x: np.conj(np.swapaxes(x, -1, -2)) if
+  // conjugate_a is True.
+
+  if (!left_side && lower == transpose_a) {
+    // for i in range(0, a.shape[-1], block_size):
+    for (int64 i = 0; i < n; i += block_size) {
+      int64 k = std::min(block_size, n - i);
+
+      // output[..., :, i:i+k] = triangular_solve(
+      //     a[..., i:i+k, i:i+k], b[..., :, i:i+k], ..., block_size=1)
+      TF_ASSIGN_OR_RETURN(auto a_slice,
+                          SliceInMinorDims(builder, a, {i, i}, {i + k, i + k}));
+      TF_ASSIGN_OR_RETURN(auto b_slice,
+                          SliceInMinorDims(builder, b, {0, i}, {m, i + k}));
+      xla::ComputationDataHandle update;
+      if (k > 1) {
+        TF_ASSIGN_OR_RETURN(xla::Computation * solve,
+                            get_base_triangular_solve(k));
+        update = builder->Call(*solve, {a_slice, b_slice});
+      } else {
+        update = builder->Div(b_slice, maybe_conj(builder, a_slice));
+      }
+      TF_ASSIGN_OR_RETURN(
+          output, UpdateSliceInMinorDims(builder, output, update, {0, i}));
+
+      // if i + k < a.shape[-1]:
+      //   a_slice_2 = a[..., i+k:, i:i+k] if lower else a[..., i:i+k, i+k:]
+      //   a_slice_2 = T(a_slice_2) if transpose_a else a_slice_2
+      //   b[..., :, i+k:] -= np.matmul(output[..., :, i:i+k], a_slice_2)
+      if (i + k < n) {
+        xla::ComputationDataHandle a_slice_2;
+        if (lower) {
+          TF_ASSIGN_OR_RETURN(
+              a_slice_2, SliceInMinorDims(builder, a, {i + k, i}, {n, i + k}));
+        } else {
+          TF_ASSIGN_OR_RETURN(
+              a_slice_2, SliceInMinorDims(builder, a, {i, i + k}, {i + k, n}));
+        }
+
+        TF_ASSIGN_OR_RETURN(auto b_update,
+                            BatchDot(builder, update, a_slice_2,
+                                     /*transpose_x=*/false,
+                                     /*transpose_y=*/transpose_a,
+                                     /*conjugate_x=*/false,
+                                     /*conjugate_y=*/conjugate_a));
+        TF_ASSIGN_OR_RETURN(auto b_slice_2,
+                            SliceInMinorDims(builder, b, {0, i + k}, {m, n}));
+        b_update = builder->Sub(b_slice_2, b_update);
+        TF_ASSIGN_OR_RETURN(
+            b, UpdateSliceInMinorDims(builder, b, b_update, {0, i + k}));
+      }
+    }
+
+  } else if (left_side && lower != transpose_a) {
+    // for i in range(0, a.shape[-1], block_size):
+    for (int64 i = 0; i < m; i += block_size) {
+      int64 k = std::min(block_size, m - i);
+
+      // output[..., i:i+k, :] = triangular_solve(
+      //     a[..., i:i+k, i:i+k], b[..., i:i+k, :], ..., block_size=1)
+      TF_ASSIGN_OR_RETURN(auto a_slice,
+                          SliceInMinorDims(builder, a, {i, i}, {i + k, i + k}));
+      TF_ASSIGN_OR_RETURN(auto b_slice,
+                          SliceInMinorDims(builder, b, {i, 0}, {i + k, n}));
+      xla::ComputationDataHandle update;
+      if (k > 1) {
+        TF_ASSIGN_OR_RETURN(xla::Computation * solve,
+                            get_base_triangular_solve(k));
+        update = builder->Call(*solve, {a_slice, b_slice});
+      } else {
+        update = builder->Div(b_slice, maybe_conj(builder, a_slice));
+      }
+      TF_ASSIGN_OR_RETURN(
+          output, UpdateSliceInMinorDims(builder, output, update, {i, 0}));
+
+      // if i + k < a.shape[-1]:
+      //   a_slice_2 = a[..., i+k:, i:i+k] if lower else a[..., i:i+k, i+k:]
+      //   a_slice_2 = T(a_slice_2) if transpose_a else a_slice_2
+      //   b[..., i+k:, :] -= np.matmul(a_slice_2, output[..., i:i+k, :])
+      if (i + k < m) {
+        xla::ComputationDataHandle a_slice_2;
+        if (lower) {
+          TF_ASSIGN_OR_RETURN(
+              a_slice_2, SliceInMinorDims(builder, a, {i + k, i}, {m, i + k}));
+        } else {
+          TF_ASSIGN_OR_RETURN(
+              a_slice_2, SliceInMinorDims(builder, a, {i, i + k}, {i + k, m}));
+        }
+
+        TF_ASSIGN_OR_RETURN(auto b_update, BatchDot(builder, a_slice_2, update,
+                                                    /*transpose_x=*/transpose_a,
+                                                    /*transpose_y=*/false,
+                                                    /*conjugate_x=*/conjugate_a,
+                                                    /*conjugate_y=*/false));
+        TF_ASSIGN_OR_RETURN(auto b_slice_2,
+                            SliceInMinorDims(builder, b, {i + k, 0}, {m, n}));
+        b_update = builder->Sub(b_slice_2, b_update);
+        TF_ASSIGN_OR_RETURN(
+            b, UpdateSliceInMinorDims(builder, b, b_update, {i + k, 0}));
+      }
+    }
+  } else if (!left_side && lower != transpose_a) {
+    // for i in reversed(range(0, a.shape[-1], block_size)):
+    const int64 last_blk_ix = xla::RoundUpToNearest(n, block_size) - block_size;
+    for (int64 i = last_blk_ix; i >= 0; i -= block_size) {
+      int64 k = std::min(block_size, n - i);
+
+      // output[..., :, i:i+k] triangular_solve(
+      //     a[..., i:i+k, i:i+k], b[..., :, i:i+k], ..., block_size=1)
+      TF_ASSIGN_OR_RETURN(auto a_slice,
+                          SliceInMinorDims(builder, a, {i, i}, {i + k, i + k}));
+      TF_ASSIGN_OR_RETURN(auto b_slice,
+                          SliceInMinorDims(builder, b, {0, i}, {m, i + k}));
+      xla::ComputationDataHandle update;
+      if (k > 1) {
+        TF_ASSIGN_OR_RETURN(xla::Computation * solve,
+                            get_base_triangular_solve(k));
+        update = builder->Call(*solve, {a_slice, b_slice});
+      } else {
+        update = builder->Div(b_slice, maybe_conj(builder, a_slice));
+      }
+      TF_ASSIGN_OR_RETURN(
+          output, UpdateSliceInMinorDims(builder, output, update, {0, i}));
+
+      // if i - k >= 0:
+      //   a_slice_2 = a[..., i:i+k, :i] if lower else a[..., :i, i:i+k]
+      //   a_slice_2 = T(a_slice_2) if transpose_a else a_slice_2
+      //   b[..., :, :i] -= np.matmul(out[..., :, i:i+k], a_slice_2)
+      if (i - k >= 0) {
+        xla::ComputationDataHandle a_slice_2;
+        if (lower) {
+          TF_ASSIGN_OR_RETURN(a_slice_2,
+                              SliceInMinorDims(builder, a, {i, 0}, {i + k, i}));
+        } else {
+          TF_ASSIGN_OR_RETURN(a_slice_2,
+                              SliceInMinorDims(builder, a, {0, i}, {i, i + k}));
+        }
+
+        TF_ASSIGN_OR_RETURN(auto b_update,
+                            BatchDot(builder, update, a_slice_2,
+                                     /*transpose_x=*/false,
+                                     /*transpose_y=*/transpose_a,
+                                     /*conjugate_x=*/false,
+                                     /*conjugate_y=*/conjugate_a));
+        TF_ASSIGN_OR_RETURN(auto b_slice_2,
+                            SliceInMinorDims(builder, b, {0, 0}, {m, i}));
+        b_update = builder->Sub(b_slice_2, b_update);
+        TF_ASSIGN_OR_RETURN(
+            b, UpdateSliceInMinorDims(builder, b, b_update, {0, 0}));
+      }
+    }
+  } else {  // left_side && lower == transpose_a
+    // for i in reversed(range(0, a.shape[-1], block_size)):
+    const int64 last_blk_ix = xla::RoundUpToNearest(m, block_size) - block_size;
+    for (int64 i = last_blk_ix; i >= 0; i -= block_size) {
+      int64 k = std::min(block_size, m - i);
+
+      // output[..., i:i+k, :] triangular_solve(
+      //     a[..., i:i+k, i:i+k], b[..., i:i+k, :], ..., block_size=1)
+      TF_ASSIGN_OR_RETURN(auto a_slice,
+                          SliceInMinorDims(builder, a, {i, i}, {i + k, i + k}));
+      TF_ASSIGN_OR_RETURN(auto b_slice,
+                          SliceInMinorDims(builder, b, {i, 0}, {i + k, n}));
+      xla::ComputationDataHandle update;
+      if (k > 1) {
+        TF_ASSIGN_OR_RETURN(xla::Computation * solve,
+                            get_base_triangular_solve(k));
+        update = builder->Call(*solve, {a_slice, b_slice});
+      } else {
+        update = builder->Div(b_slice, maybe_conj(builder, a_slice));
+      }
+      TF_ASSIGN_OR_RETURN(
+          output, UpdateSliceInMinorDims(builder, output, update, {i, 0}));
+
+      // if i - k >= 0:
+      //   a_slice_2 = a[..., i:i+k, :i] if lower else a[..., :i, i:i+k]
+      //   a_slice_2 = T(a_slice_2) if transpose_a else a_slice_2
+      //   b[..., :i, :] -= np.matmul(a_slice_2, out[..., i:i+k, :])
+      if (i - k >= 0) {
+        xla::ComputationDataHandle a_slice_2;
+        if (lower) {
+          TF_ASSIGN_OR_RETURN(a_slice_2,
+                              SliceInMinorDims(builder, a, {i, 0}, {i + k, i}));
+        } else {
+          TF_ASSIGN_OR_RETURN(a_slice_2,
+                              SliceInMinorDims(builder, a, {0, i}, {i, i + k}));
+        }
+
+        TF_ASSIGN_OR_RETURN(auto b_update, BatchDot(builder, a_slice_2, update,
+                                                    /*transpose_x=*/transpose_a,
+                                                    /*transpose_y=*/false,
+                                                    /*conjugate_x=*/conjugate_a,
+                                                    /*conjugate_y=*/false));
+        TF_ASSIGN_OR_RETURN(auto b_slice_2,
+                            SliceInMinorDims(builder, b, {0, 0}, {i, n}));
+        b_update = builder->Sub(b_slice_2, b_update);
+        TF_ASSIGN_OR_RETURN(
+            b, UpdateSliceInMinorDims(builder, b, b_update, {0, 0}));
+      }
+    }
+  }
+
+  return output;
+}
+
+xla::StatusOr<xla::ComputationDataHandle> TriangularSolveLeftLooking(
+    xla::ComputationBuilder* builder, const xla::ComputationDataHandle& a,
+    const xla::ComputationDataHandle& b, bool transpose_a, bool conjugate_a) {
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<xla::Shape> a_shape,
+                      builder->GetShape(a));
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<xla::Shape> b_shape,
+                      builder->GetShape(b));
+  const int64 m = xla::ShapeUtil::GetDimension(*b_shape, -2);
+  const int64 n = xla::ShapeUtil::GetDimension(*b_shape, -1);
+  const int64 ndims = xla::ShapeUtil::Rank(*a_shape);
+
+  std::vector<int64> batch_dimensions;
+  for (int i = 0; i < ndims - 2; ++i) {
+    int64 a_size = a_shape->dimensions(i);
+    batch_dimensions.push_back(a_size);
+  }
+
+  auto prepend_batch_dims = [&](std::array<int64, 2> indices) {
+    std::vector<int64> output(ndims);
+    std::copy(batch_dimensions.begin(), batch_dimensions.end(), output.begin());
+    std::copy(indices.begin(), indices.end(),
+              output.begin() + batch_dimensions.size());
+    return output;
+  };
+
+  auto maybe_conj = [&](xla::ComputationBuilder* builder,
+                        xla::ComputationDataHandle x) {
+    auto perform_conj = a_shape->element_type() == xla::C64 && conjugate_a;
+    return perform_conj ? builder->Conj(x) : x;
+  };
+
+  // The main computation is performed in a While loop.
+
+  // Allocate the output and set its first or last row,
+  // output = np.zeros_like(b)
+  // if transpose_a:
+  //   output[..., m-1:, :] = b[..., m-1:, :] / a[..., m-1:, m-1:]
+  // else:
+  //   output[..., :1, :] = b[..., :1, :] / a[..., :1, :1]
+  xla::ComputationDataHandle output = Zeros(builder, *b_shape);
+  {
+    auto i = transpose_a ? m - 1 : 0;
     TF_ASSIGN_OR_RETURN(auto a_slice,
-                        SliceInMinorDims(builder, a, {i, i}, {i + k, i + k}));
+                        SliceInMinorDims(builder, a, {i, i}, {i + 1, i + 1}));
     TF_ASSIGN_OR_RETURN(auto b_slice,
-                        SliceInMinorDims(builder, b, {0, i}, {m, i + k}));
-    xla::ComputationDataHandle update;
-    if (k > 1) {
-      TF_ASSIGN_OR_RETURN(xla::Computation * solve,
-                          get_base_triangular_solve(k));
-      update = builder->Call(*solve, {a_slice, b_slice});
+                        SliceInMinorDims(builder, b, {i, 0}, {i + 1, n}));
+    auto update = builder->Div(b_slice, maybe_conj(builder, a_slice));
+    TF_ASSIGN_OR_RETURN(
+        output, UpdateSliceInMinorDims(builder, output, update, {i, 0}));
+  }
+
+  // Construct the initial loop carry tuple,
+  // if transpose_a:
+  //   init = (m-2, output, a, b)
+  // else:
+  //   init = (1, output, a, b)
+  std::vector<xla::Shape> tuple_shapes = {
+      // The loop iteration counter is a scalar, incremented each iteration.
+      xla::ShapeUtil::MakeShape(xla::S32, {}),
+      // The output has the shape of b, with one row updated each iteration.
+      *b_shape,
+      // The coefficient matrix a is a loop invariant.
+      *a_shape,
+      // The right-hand-side matrix b is a loop invariant.
+      *b_shape};
+  xla::Shape tuple_shape = xla::ShapeUtil::MakeTupleShape(tuple_shapes);
+  auto init_i = builder->ConstantR0<int32>(transpose_a ? m - 2 : 1);
+  auto init = builder->Tuple({init_i, output, a, b});
+
+  // Construct the loop condition function,
+  // def cond_fun(loop_carry):
+  //   i, output, a, b = loop_carry
+  //   return i >= 0 if transpose_a else i < m
+  std::unique_ptr<xla::ComputationBuilder> condb =
+      builder->CreateSubBuilder("TriangularSolveLeftLookingWhileCond");
+  {
+    auto i = condb->GetTupleElement(
+        condb->Parameter(0, tuple_shape,
+                         "TriangularSolveLeftLookingWhileTuple"),
+        0);
+    if (transpose_a) {
+      condb->Ge(i, condb->ConstantR0<int32>(0));
     } else {
-      update = builder->Div(b_slice, a_slice);
+      condb->Lt(i, condb->ConstantR0<int32>(m));
     }
+  }
+  TF_ASSIGN_OR_RETURN(auto cond, condb->Build());
 
-    TF_ASSIGN_OR_RETURN(
-        output, UpdateSliceInMinorDims(builder, output, update, {0, i}));
-    // b[..., :, i+k:] -= np.dot(output[..., :, i:i+k],
-    //                           np.transpose(..., a[i+k:, i:i+k]))
-    if (i + k < n) {
-      TF_ASSIGN_OR_RETURN(auto a_slice_2,
-                          SliceInMinorDims(builder, a, {i + k, i}, {n, i + k}));
-      TF_ASSIGN_OR_RETURN(auto b_update, BatchDot(builder, update, a_slice_2,
-                                                  /*transpose_x=*/false,
-                                                  /*transpose_y=*/true));
-
-      TF_ASSIGN_OR_RETURN(auto b_slice_2,
-                          SliceInMinorDims(builder, b, {0, i + k}, {m, n}));
-      b_update = builder->Sub(b_slice_2, b_update);
-      TF_ASSIGN_OR_RETURN(
-          b, UpdateSliceInMinorDims(builder, b, b_update, {0, i + k}));
+  // Construct the loop body function,
+  // def body_fun(loop_carry):
+  //   i, output, a, b = loop_carry
+  //   if transpose_a:
+  //     a_row = np.swapaxes(a[..., i+1:, i:i+1], -1 -2)
+  //   else:
+  //     a_row = a[..., i:i+1, :i]
+  //   result_row = b[..., i:i+1, :] - np.matmul(a_row, output[..., :, :])
+  //   output[..., i:i+1, :] = result_row / a[..., i:i+1, i:i+1]
+  //   if transpose_a:
+  //     return (i - 1, output, a, b)
+  //   else:
+  //     return (i + 1, output, a, b)
+  // We have to do some extra FLOPs propagating zeros in the matrix multiply
+  // because we can't have the size of its arguments depend on the loop counter.
+  std::unique_ptr<xla::ComputationBuilder> bodyb =
+      builder->CreateSubBuilder("TriangularSolveLeftLookingWhileBody");
+  {
+    auto input_tuple = bodyb->Parameter(0, tuple_shape,
+                                        "TriangularSolveLeftLookingWhileTuple");
+
+    // i, output, a, b = loop_carry
+    auto i = bodyb->GetTupleElement(input_tuple, 0);
+    auto body_out = bodyb->GetTupleElement(input_tuple, 1);
+    auto body_a = bodyb->GetTupleElement(input_tuple, 2);
+    auto body_b = bodyb->GetTupleElement(input_tuple, 3);
+    auto zero = bodyb->ConstantR0<int32>(0);
+
+    // Set up some helper functions.
+    auto prepend_zeros = [&](std::array<xla::ComputationDataHandle, 2> starts) {
+      auto zero = bodyb->Reshape(bodyb->ConstantR0<int32>(0), {1});
+      std::vector<xla::ComputationDataHandle> padded_starts(ndims, zero);
+      padded_starts[ndims - 2] = bodyb->Reshape(starts[0], {1});
+      padded_starts[ndims - 1] = bodyb->Reshape(starts[1], {1});
+      return bodyb->ConcatInDim(padded_starts, 0);
+    };
+
+    auto dynamic_slice = [&](xla::ComputationDataHandle x,
+                             std::array<xla::ComputationDataHandle, 2> starts,
+                             std::array<int64, 2> sizes) {
+      auto padded_starts = prepend_zeros(starts);
+      auto padded_sizes = prepend_batch_dims(sizes);
+      return bodyb->DynamicSlice(x, padded_starts, padded_sizes);
+    };
+
+    auto update = [&](xla::ComputationDataHandle x,
+                      xla::ComputationDataHandle update,
+                      std::array<xla::ComputationDataHandle, 2> starts) {
+      auto padded_starts = prepend_zeros(starts);
+      return bodyb->DynamicUpdateSlice(x, update, padded_starts);
+    };
+
+    // We'd like to implement this:
+    //   if transpose_a:
+    //     a_row = T(a[..., i+1:, i:i+1])
+    //     result_row = (b[..., i:i+1, :]
+    //                   - np.matmul(a_row, body_out[..., i+1:, :]))
+    //   else:
+    //     result_row = (b[..., i:i+1, :]
+    //                   - np.matmul(a[..., i:i+1, :i], body_out[..., :i, :]))
+    // But since we can't have intermediate array sizes depend on the loop
+    // counter, we instead exploit the fact that we initialized the output to
+    // all zeros and use that as zero-padding (doing unnecessary FLOPs).
+    xla::ComputationDataHandle a_row;
+    if (transpose_a) {
+      a_row = dynamic_slice(body_a, {zero, i}, {m, 1});
+    } else {
+      a_row = dynamic_slice(body_a, {i, zero}, {1, m});
     }
+    TF_ASSIGN_OR_RETURN(auto b_update, BatchDot(bodyb.get(), a_row, body_out,
+                                                /*transpose_x=*/transpose_a,
+                                                /*transpose_y=*/false,
+                                                /*conjugate_x=*/conjugate_a,
+                                                /*conjugate_y=*/false));
+    auto result_row =
+        bodyb->Sub(dynamic_slice(body_b, {i, zero}, {1, n}), b_update);
+
+    // body_out[..., i:i+1, :] = result_row / a[..., i:i+1, i:i+1]
+    auto a_elt = dynamic_slice(body_a, {i, i}, {1, 1});
+    auto div_result = bodyb->Div(result_row, maybe_conj(bodyb.get(), a_elt));
+    body_out = update(body_out, div_result, {i, zero});
+
+    // if transpose_a:
+    //   return (i - 1, body_out, a, b)
+    // else:
+    //   return (i + 1, body_out, a, b)
+    auto next_i = bodyb->Add(i, bodyb->ConstantR0<int32>(transpose_a ? -1 : 1));
+    bodyb->Tuple({next_i, body_out, body_a, body_b});
   }
-  return output;
+  TF_ASSIGN_OR_RETURN(auto body, bodyb->Build());
+
+  // Construct the While loop and return the result,
+  // return while_loop(cond_fun, body_fun, init)[1]
+  auto triangular_solve_left_looking_while = builder->While(cond, body, init);
+  return builder->GetTupleElement(triangular_solve_left_looking_while, 1);
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/lib/triangular_solve.h b/tensorflow/compiler/tf2xla/lib/triangular_solve.h
index 501d026411c80359c7efa406ece5929a2e46ac1f..e32223bfdddda800b1fd4de3e4f0c8061e0f81d8 100644
--- a/tensorflow/compiler/tf2xla/lib/triangular_solve.h
+++ b/tensorflow/compiler/tf2xla/lib/triangular_solve.h
@@ -21,25 +21,50 @@ limitations under the License.
 
 namespace tensorflow {
 
-// Solves systems of linear equations with upper or lower triangular matrices by
-// backsubstitution.
+// Solves systems of linear equations with lower or upper triangular coefficient
+// matrices by forward- or back-substitution. Broadcasting along leading
+// dimensions, this routine solves one of the matrix systems
+//   `op(a) * x = b`,  or `x * op(a) = b`,
+// for the variable `x` given `a` and `b`, where `op(a)` is either
+//   `op(a) = a`,  or `op(a) = transpose(a)`,  or `op(a) = conj(transpose(a))`.
+// That is, the innermost matrices in the output satisfy a scalar system
+// depending on the value of the value of (left_side, transpose_a, conjugate_a)
+// according to:
+//   (F, F, F) => `output[..., i, k]  a[..., k, j] = b[..., i, j]`,
+//   (F, F, T) => `output[..., i, k] a*[..., k, j] = b[..., i, j]`,
+//   (F, T, F) => `output[..., i, k]  a[..., j, k] = b[..., i, j]`,
+//   (F, T, T) => `output[..., i, k] a*[..., j, k] = b[..., i, j]`,
+//   (T, F, F) => ` a[..., i, k] output[..., k, j] = b[..., i, j]`,
+//   (T, F, T) => `a*[..., i, k] output[..., k, j] = b[..., i, j]`,
+//   (T, T, F) => ` a[..., i, k] output[..., j, k] = b[..., i, j]`,
+//   (T, T, T) => `a*[..., i, k] output[..., j, k] = b[..., i, j]`,
+// where * denotes complex conjugation and where the index `k` is summed over.
 //
-// `a` is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions form
-// square matrices. The strictly upper triangular part of each inner-most matrix
-// is assumed to be zero and not accessed.
-// `b` is a tensor of shape `[..., M, K]`.
-//
-// The innermost matrices in the output satisfy matrix equations
-// `output[..., i, j] * adjoint(a[..., k, j]) = b[..., i, k]`.
+// `a` is a tensor of shape `[..., M, M]` whose innermost 2 dimensions form
+// square matrices. If lower is true (false), then the strictly upper (lower)
+// triangular part of each innermost matrix in `a` is assumed to be zero and is
+// not accessed.
+// `b` is a tensor of shape `[..., M, K]` if left_side is true, otherwise a
+// tensor of shape `[..., K, M]`.
+// `left_side` is a boolean, indicating whether to solve a system of the form
+// op(a) * x = b (true) or x * op(a) = b (false).
+// `lower` is a boolean, indicating whether the argument `a` is lower-triangular
+// (true) or upper-triangular (false).
+// `transpose_a` is a boolean indicating whether the matrix `a` is transposed.
+// `conjugate_a` is a boolean indicating whether the entries of `a` are complex
+// conjugated (independently of whether they are transposed), so that when both
+// transpose_a and conjugate_a are true the effect is a Hermitian adjoint.
 //
 // Uses a blocked algorithm if `block_size` is > 1; if block_size == 1 then no
 // blocking is used.
-// TODO(phawkins): equivalent to the BLAS TRSM routine with side=right,
-// kind=lower, and transposed_a=true. Implement the other possible combinations
-// of side, kind and transposed_a.
 xla::StatusOr<xla::ComputationDataHandle> TriangularSolve(
     xla::ComputationBuilder* builder, const xla::ComputationDataHandle& a,
-    xla::ComputationDataHandle b, int64 block_size = 256);
+    xla::ComputationDataHandle b, bool left_side, bool lower, bool transpose_a,
+    bool conjugate_a, int64 block_size = 256);
+
+xla::StatusOr<xla::ComputationDataHandle> TriangularSolveLeftLooking(
+    xla::ComputationBuilder* builder, const xla::ComputationDataHandle& a,
+    const xla::ComputationDataHandle& b, bool transpose_a, bool conjugate_a);
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/compiler/tf2xla/lib/triangular_solve_test.cc b/tensorflow/compiler/tf2xla/lib/triangular_solve_test.cc
index 671d9aa4fe0c042a3cc44468074653d51c2be75d..661707062916263fd0d5d935ce41698a7655df02 100644
--- a/tensorflow/compiler/tf2xla/lib/triangular_solve_test.cc
+++ b/tensorflow/compiler/tf2xla/lib/triangular_solve_test.cc
@@ -27,32 +27,134 @@ limitations under the License.
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
+#include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 
 namespace tensorflow {
 namespace {
 
 using TriangularSolveTest = xla::ClientLibraryTestBase;
+using TriangularSolveLeftLookingTest = xla::ClientLibraryTestBase;
+using complex64 = xla::complex64;
 
-XLA_TEST_F(TriangularSolveTest, Simple) {
+xla::Array2D<float> AValsLower() {
+  return {{2, 0, 0, 0}, {3, 6, 0, 0}, {4, 7, 9, 0}, {5, 8, 10, 11}};
+}
+
+xla::Array2D<float> AValsUpper() {
+  return {{2, 3, 4, 5}, {0, 6, 7, 8}, {0, 0, 9, 10}, {0, 0, 0, 11}};
+}
+
+xla::Array2D<float> BValsRight() {
+  return {{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}};
+}
+
+xla::Array2D<float> BValsLeft() {
+  return {{1, 2, 3}, {4, 5, 6}, {7, 8, 9}, {10, 11, 12}};
+}
+
+xla::Array2D<complex64> AValsLowerComplex() {
+  return {{2, 0, 0, 0},
+          {complex64(3, 1), 6, 0, 0},
+          {4, complex64(7, 2), 9, 0},
+          {5, 8, complex64(10, 3), 11}};
+}
+
+xla::Array2D<complex64> AValsUpperComplex() {
+  return {{2, 3, complex64(4, 3), 5},
+          {0, 6, complex64(7, 2), 8},
+          {0, 0, complex64(9, 1), 10},
+          {0, 0, 0, 11}};
+}
+
+xla::Array2D<complex64> BValsRightComplex() {
+  return {{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}};
+}
+
+xla::Array2D<complex64> BValsLeftComplex() {
+  return {{1, 2, 3}, {4, 5, 6}, {7, 8, 9}, {10, 11, 12}};
+}
+
+xla::Array2D<float> AValsFull() {
+  return {{2, 0, 1, 2}, {3, 6, 0, 1}, {4, 7, 9, 0}, {5, 8, 10, 11}};
+}
+
+XLA_TEST_F(TriangularSolveTest, SimpleRightLowerTranspose) {
   xla::ComputationBuilder builder(client_, TestName());
 
-  xla::Array2D<float> a_vals({
-      {2, 0, 0, 0},
-      {3, 6, 0, 0},
-      {4, 7, 9, 0},
-      {5, 8, 10, 11},
+  xla::ComputationDataHandle a, b;
+  auto a_data = CreateR2Parameter<float>(AValsLower(), 0, "a", &builder, &a);
+  auto b_data = CreateR2Parameter<float>(BValsRight(), 1, "b", &builder, &b);
+  auto result = TriangularSolve(&builder, a, b,
+                                /*left_side=*/false, /*lower=*/true,
+                                /*transpose_a=*/true, /*conjugate_a=*/false,
+                                /*block_size=*/2);
+  TF_ASSERT_OK(result.status());
+
+  xla::Array2D<float> expected({
+      {0.5, 0.08333334, 0.04629629, 0.03367003},
+      {2.5, -0.25, -0.1388889, -0.1010101},
+      {4.5, -0.58333331, -0.32407406, -0.23569024},
+  });
+
+  ComputeAndCompareR2<float>(&builder, expected, {a_data.get(), b_data.get()},
+                             xla::ErrorSpec(1e-2, 1e-2));
+}
+
+XLA_TEST_F(TriangularSolveTest, SimpleRightLowerNotranspose) {
+  xla::ComputationBuilder builder(client_, TestName());
+
+  xla::ComputationDataHandle a, b;
+  auto a_data = CreateR2Parameter<float>(AValsLower(), 0, "a", &builder, &a);
+  auto b_data = CreateR2Parameter<float>(BValsRight(), 1, "b", &builder, &b);
+  auto result = TriangularSolve(&builder, a, b,
+                                /*left_side=*/false, /*lower=*/true,
+                                /*transpose_a=*/false, /*conjugate_a=*/false,
+                                /*block_size=*/2);
+  TF_ASSERT_OK(result.status());
+
+  xla::Array2D<float> expected({
+      {-0.16414141, -0.06902357, -0.07070707, 0.36363636},
+      {0.64393939, 0.06565657, -0.03030303, 0.72727273},
+      {1.4520202, 0.2003367, 0.01010101, 1.09090909},
   });
-  xla::Array2D<float> b_vals({
-      {1, 2, 3, 4},
-      {5, 6, 7, 8},
-      {9, 10, 11, 12},
+
+  ComputeAndCompareR2<float>(&builder, expected, {a_data.get(), b_data.get()},
+                             xla::ErrorSpec(1e-2, 1e-2));
+}
+
+XLA_TEST_F(TriangularSolveTest, SimpleRightUpperTranspose) {
+  xla::ComputationBuilder builder(client_, TestName());
+
+  xla::ComputationDataHandle a, b;
+  auto a_data = CreateR2Parameter<float>(AValsUpper(), 0, "a", &builder, &a);
+  auto b_data = CreateR2Parameter<float>(BValsRight(), 1, "b", &builder, &b);
+  auto result = TriangularSolve(&builder, a, b,
+                                /*left_side=*/false, /*lower=*/false,
+                                /*transpose_a=*/true, /*conjugate_a=*/false,
+                                /*block_size=*/2);
+  TF_ASSERT_OK(result.status());
+
+  xla::Array2D<float> expected({
+      {-0.16414141, -0.06902357, -0.07070707, 0.36363636},
+      {0.64393939, 0.06565657, -0.03030303, 0.72727273},
+      {1.4520202, 0.2003367, 0.01010101, 1.09090909},
   });
 
+  ComputeAndCompareR2<float>(&builder, expected, {a_data.get(), b_data.get()},
+                             xla::ErrorSpec(1e-2, 1e-2));
+}
+
+XLA_TEST_F(TriangularSolveTest, SimpleRightUpperNotranspose) {
+  xla::ComputationBuilder builder(client_, TestName());
+
   xla::ComputationDataHandle a, b;
-  auto a_data = CreateR2Parameter<float>(a_vals, 0, "a", &builder, &a);
-  auto b_data = CreateR2Parameter<float>(b_vals, 1, "b", &builder, &b);
-  auto result = TriangularSolve(&builder, a, b, /*block_size=*/2);
+  auto a_data = CreateR2Parameter<float>(AValsUpper(), 0, "a", &builder, &a);
+  auto b_data = CreateR2Parameter<float>(BValsRight(), 1, "b", &builder, &b);
+  auto result = TriangularSolve(&builder, a, b,
+                                /*left_side=*/false, /*lower=*/false,
+                                /*transpose_a=*/false, /*conjugate_a=*/false,
+                                /*block_size=*/2);
   TF_ASSERT_OK(result.status());
 
   xla::Array2D<float> expected({
@@ -62,7 +164,201 @@ XLA_TEST_F(TriangularSolveTest, Simple) {
   });
 
   ComputeAndCompareR2<float>(&builder, expected, {a_data.get(), b_data.get()},
-                             xla::ErrorSpec(2e-3, 2e-3));
+                             xla::ErrorSpec(1e-2, 1e-2));
+}
+
+XLA_TEST_F(TriangularSolveTest, SimpleLeftLowerTranspose) {
+  xla::ComputationBuilder builder(client_, TestName());
+
+  xla::ComputationDataHandle a, b;
+  auto a_data = CreateR2Parameter<float>(AValsLower(), 0, "a", &builder, &a);
+  auto b_data = CreateR2Parameter<float>(BValsLeft(), 1, "b", &builder, &b);
+  auto result = TriangularSolve(&builder, a, b,
+                                /*left_side=*/true, /*lower=*/true,
+                                /*transpose_a=*/true, /*conjugate_a=*/false,
+                                /*block_size=*/2);
+  TF_ASSERT_OK(result.status());
+
+  xla::Array2D<float> expected({
+      {-0.89646465, -0.69444444, -0.49242424},
+      {-0.27441077, -0.24074074, -0.20707071},
+      {-0.23232323, -0.22222222, -0.21212121},
+      {0.90909091, 1., 1.09090909},
+  });
+
+  ComputeAndCompareR2<float>(&builder, expected, {a_data.get(), b_data.get()},
+                             xla::ErrorSpec(1e-2, 1e-2));
+}
+
+XLA_TEST_F(TriangularSolveTest, SimpleLeftLowerNotranspose) {
+  xla::ComputationBuilder builder(client_, TestName());
+
+  xla::ComputationDataHandle a, b;
+  auto a_data = CreateR2Parameter<float>(AValsLower(), 0, "a", &builder, &a);
+  auto b_data = CreateR2Parameter<float>(BValsLeft(), 1, "b", &builder, &b);
+  auto result = TriangularSolve(&builder, a, b,
+                                /*left_side=*/true, /*lower=*/true,
+                                /*transpose_a=*/false, /*conjugate_a=*/false,
+                                /*block_size=*/2);
+  TF_ASSERT_OK(result.status());
+
+  xla::Array2D<float> expected({
+      {0.5, 1.0, 1.5},
+      {0.41666667, 0.33333333, 0.25},
+      {0.23148148, 0.18518519, 0.13888889},
+      {0.16835017, 0.13468013, 0.1010101},
+  });
+
+  ComputeAndCompareR2<float>(&builder, expected, {a_data.get(), b_data.get()},
+                             xla::ErrorSpec(1e-2, 1e-2));
+}
+
+XLA_TEST_F(TriangularSolveTest, SimpleLeftUpperTranspose) {
+  xla::ComputationBuilder builder(client_, TestName());
+
+  xla::ComputationDataHandle a, b;
+  auto a_data = CreateR2Parameter<float>(AValsUpper(), 0, "a", &builder, &a);
+  auto b_data = CreateR2Parameter<float>(BValsLeft(), 1, "b", &builder, &b);
+  auto result = TriangularSolve(&builder, a, b,
+                                /*left_side=*/true, /*lower=*/false,
+                                /*transpose_a=*/true, /*conjugate_a=*/false,
+                                /*block_size=*/2);
+  TF_ASSERT_OK(result.status());
+
+  xla::Array2D<float> expected({
+      {0.5, 1.0, 1.5},
+      {0.41666667, 0.33333333, 0.25},
+      {0.23148148, 0.18518519, 0.13888889},
+      {0.16835017, 0.13468013, 0.1010101},
+  });
+
+  ComputeAndCompareR2<float>(&builder, expected, {a_data.get(), b_data.get()},
+                             xla::ErrorSpec(1e-2, 1e-2));
+}
+
+XLA_TEST_F(TriangularSolveTest, SimpleLeftUpperNotranspose) {
+  xla::ComputationBuilder builder(client_, TestName());
+
+  xla::ComputationDataHandle a, b;
+  auto a_data = CreateR2Parameter<float>(AValsUpper(), 0, "a", &builder, &a);
+  auto b_data = CreateR2Parameter<float>(BValsLeft(), 1, "b", &builder, &b);
+  auto result = TriangularSolve(&builder, a, b,
+                                /*left_side=*/true, /*lower=*/false,
+                                /*transpose_a=*/false, /*conjugate_a=*/false,
+                                /*block_size=*/2);
+  TF_ASSERT_OK(result.status());
+
+  xla::Array2D<float> expected({
+      {-0.89646465, -0.69444444, -0.49242424},
+      {-0.27441077, -0.24074074, -0.20707071},
+      {-0.23232323, -0.22222222, -0.21212121},
+      {0.90909091, 1., 1.09090909},
+  });
+
+  ComputeAndCompareR2<float>(&builder, expected, {a_data.get(), b_data.get()},
+                             xla::ErrorSpec(1e-2, 1e-2));
+}
+
+XLA_TEST_F(TriangularSolveTest, SimpleRightLowerTransposeConjugate) {
+  xla::ComputationBuilder builder(client_, TestName());
+
+  xla::ComputationDataHandle a, b;
+  auto a_data =
+      CreateR2Parameter<complex64>(AValsLowerComplex(), 0, "a", &builder, &a);
+  auto b_data =
+      CreateR2Parameter<complex64>(BValsRightComplex(), 1, "b", &builder, &b);
+  auto result = TriangularSolve(&builder, a, b,
+                                /*left_side=*/false, /*lower=*/true,
+                                /*transpose_a=*/true, /*conjugate_a=*/true,
+                                /*block_size=*/2);
+  TF_ASSERT_OK(result.status());
+
+  xla::Array2D<complex64> expected({
+      {0.5, complex64(0.08333333, 0.08333333),
+       complex64(0.02777778, -0.0462963), complex64(0.06313131, -0.01094276)},
+      {2.5, complex64(-0.25, 0.41666667), complex64(-0.23148148, -0.37962963),
+       complex64(0.08670034, -0.02104377)},
+      {4.5, complex64(-0.58333333, 0.75), complex64(-0.49074074, -0.71296296),
+       complex64(0.11026936, -0.03114478)},
+  });
+
+  ComputeAndCompareR2<complex64>(&builder, expected,
+                                 {a_data.get(), b_data.get()},
+                                 xla::ErrorSpec(1e-2, 1e-2));
+}
+
+XLA_TEST_F(TriangularSolveTest, SimpleLeftUpperTransposeNoconjugate) {
+  xla::ComputationBuilder builder(client_, TestName());
+
+  xla::ComputationDataHandle a, b;
+  auto a_data =
+      CreateR2Parameter<complex64>(AValsUpperComplex(), 0, "a", &builder, &a);
+  auto b_data =
+      CreateR2Parameter<complex64>(BValsLeftComplex(), 1, "b", &builder, &b);
+  auto result = TriangularSolve(&builder, a, b,
+                                /*left_side=*/true, /*lower=*/false,
+                                /*transpose_a=*/true, /*conjugate_a=*/false,
+                                /*block_size=*/2);
+  TF_ASSERT_OK(result.status());
+
+  xla::Array2D<complex64> expected({
+      {0.5, 1., 1.5},
+      {0.41666667, 0.33333333, 0.25},
+      {complex64(0.20020325, -2.81504065e-01),
+       complex64(0.13821138, -4.22764228e-01),
+       complex64(0.07621951, -5.64024390e-01)},
+      {complex64(0.19678492, 2.55912786e-01),
+       complex64(0.17738359, 3.84331116e-01),
+       complex64(0.15798226, 5.12749446e-01)},
+  });
+
+  ComputeAndCompareR2<complex64>(&builder, expected,
+                                 {a_data.get(), b_data.get()},
+                                 xla::ErrorSpec(1e-2, 1e-2));
+}
+
+XLA_TEST_F(TriangularSolveLeftLookingTest, Simple) {
+  xla::ComputationBuilder builder(client_, TestName());
+
+  xla::ComputationDataHandle a, b;
+  auto a_data = CreateR2Parameter<float>(AValsLower(), 0, "a", &builder, &a);
+  auto b_data = CreateR2Parameter<float>(BValsLeft(), 1, "b", &builder, &b);
+  auto result = TriangularSolveLeftLooking(&builder, a, b,
+                                           /*transpose_a=*/false,
+                                           /*conjugate_a=*/false);
+  TF_ASSERT_OK(result.status());
+
+  xla::Array2D<float> expected({
+      {0.5, 1.0, 1.5},
+      {0.41666667, 0.33333333, 0.25},
+      {0.23148148, 0.18518519, 0.13888889},
+      {0.16835017, 0.13468013, 0.1010101},
+  });
+
+  ComputeAndCompareR2<float>(&builder, expected, {a_data.get(), b_data.get()},
+                             xla::ErrorSpec(1e-2, 1e-2));
+}
+
+XLA_TEST_F(TriangularSolveLeftLookingTest, NonzeroUpperTriangle) {
+  xla::ComputationBuilder builder(client_, TestName());
+
+  xla::ComputationDataHandle a, b;
+  auto a_data = CreateR2Parameter<float>(AValsFull(), 0, "a", &builder, &a);
+  auto b_data = CreateR2Parameter<float>(BValsLeft(), 1, "b", &builder, &b);
+  auto result = TriangularSolveLeftLooking(&builder, a, b,
+                                           /*transpose_a=*/false,
+                                           /*conjugate_a=*/false);
+  TF_ASSERT_OK(result.status());
+
+  xla::Array2D<float> expected({
+      {0.5, 1.0, 1.5},
+      {0.41666667, 0.33333333, 0.25},
+      {0.23148148, 0.18518519, 0.13888889},
+      {0.16835017, 0.13468013, 0.1010101},
+  });
+
+  ComputeAndCompareR2<float>(&builder, expected, {a_data.get(), b_data.get()},
+                             xla::ErrorSpec(1e-2, 1e-2));
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/tf2xla/lib/util.cc b/tensorflow/compiler/tf2xla/lib/util.cc
index ce24b61b5dc7176f3caa05e3eb9257399fef7926..9b7492f8cf6e86498d7e2f5d42e42ea978c664d8 100644
--- a/tensorflow/compiler/tf2xla/lib/util.cc
+++ b/tensorflow/compiler/tf2xla/lib/util.cc
@@ -107,4 +107,15 @@ xla::StatusOr<xla::ComputationDataHandle> UpdateSliceInMinorDims(
   return UpdateSlice(builder, x, update, padded_start);
 }
 
+xla::StatusOr<xla::ComputationDataHandle> TransposeInMinorDims(
+    xla::ComputationBuilder* builder, const xla::ComputationDataHandle& x) {
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<xla::Shape> shape, builder->GetShape(x));
+  const int64 n_dims = xla::ShapeUtil::Rank(*shape);
+  TF_RET_CHECK(n_dims >= 2);
+  std::vector<int64> permutation(n_dims);
+  std::iota(permutation.begin(), permutation.end(), 0);
+  std::swap(permutation[n_dims - 1], permutation[n_dims - 2]);
+  return builder->Transpose(x, permutation);
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/lib/util.h b/tensorflow/compiler/tf2xla/lib/util.h
index fb138b4f736500aac8184770d97fbf930ced69ea..7f93102ee78bec60018814975a0badfeb7874aa6 100644
--- a/tensorflow/compiler/tf2xla/lib/util.h
+++ b/tensorflow/compiler/tf2xla/lib/util.h
@@ -49,6 +49,10 @@ xla::StatusOr<xla::ComputationDataHandle> UpdateSliceInMinorDims(
     xla::ComputationBuilder* builder, const xla::ComputationDataHandle& x,
     const xla::ComputationDataHandle& update, gtl::ArraySlice<int64> start);
 
+// Transposes a stack of matrices `x` by swapping the last two dimensions.
+xla::StatusOr<xla::ComputationDataHandle> TransposeInMinorDims(
+    xla::ComputationBuilder* builder, const xla::ComputationDataHandle& x);
+
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_COMPILER_TF2XLA_LIB_UTIL_H_
diff --git a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.cc b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.cc
index 79da701fd244a461a60588153b601d5c1870fa89..672e19bd93449ccc31f4af5ded23257b197a3c39 100644
--- a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.cc
+++ b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.cc
@@ -29,7 +29,7 @@ XlaCompiledCpuFunction::XlaCompiledCpuFunction(const StaticData& static_data,
       arg_names_(static_data.arg_names),
       result_names_(static_data.result_names),
       program_shape_(static_data.program_shape),
-      hlo_profile_printer_(static_data.hlo_profile_printer) {
+      hlo_profile_printer_data_(static_data.hlo_profile_printer_data) {
   // Allocate arg and temp buffers.
   if (alloc_mode == AllocMode::ARGS_RESULTS_PROFILES_AND_TEMPS) {
     alloc_args_ = tensorflow::tfcompile::runtime::MallocContiguousBuffers(
diff --git a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h
index e0ae3ed9a811bcc49ce8862037a67d293e879e57..48a8c083cacf2f6ecf9dc1817b6174c01385d035 100644
--- a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h
+++ b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h
@@ -26,7 +26,7 @@ limitations under the License.
 // never use this functionality.
 namespace xla {
 class ProgramShape;
-class HloProfilePrinter;
+class HloProfilePrinterData;
 }
 
 namespace tensorflow {
@@ -77,12 +77,14 @@ class XlaCompiledCpuFunction {
     // [Optional] Arg and result shapes.
     const xla::ProgramShape* program_shape = nullptr;
 
-    // [Optional] Profile printer.  Null if profiling is disabled.
-    const xla::HloProfilePrinter* hlo_profile_printer = nullptr;
+    // [Optional] Profile printer data.  Null if profiling is disabled.
+    const xla::HloProfilePrinterData* hlo_profile_printer_data = nullptr;
 
     // [Optional] The number of profile counters expected in the profile counter
     // buffer by the generated code and hlo_profile_printer.  0 if profiling is
-    // disabled.
+    // disabled.  This information is already present in
+    // hlo_profile_printer_data but xla::HloProfilePrinterData is forward
+    // declared so we don't have access to that information here.
     int64 profile_counters_size = 0;
   };
 
@@ -205,10 +207,12 @@ class XlaCompiledCpuFunction {
   // program shape isn't available.
   const xla::ProgramShape* ProgramShape() const { return program_shape_; }
 
-  bool hlo_profiling_enabled() const { return hlo_profile_printer_ != nullptr; }
-  const xla::HloProfilePrinter& hlo_profile_printer() const {
+  bool hlo_profiling_enabled() const {
+    return hlo_profile_printer_data_ != nullptr;
+  }
+  const xla::HloProfilePrinterData& hlo_profile_printer_data() const {
     assert(hlo_profiling_enabled());
-    return *hlo_profile_printer_;
+    return *hlo_profile_printer_data_;
   }
 
  private:
@@ -234,7 +238,7 @@ class XlaCompiledCpuFunction {
   const char** arg_names_ = nullptr;
   const char** result_names_ = nullptr;
   const xla::ProgramShape* program_shape_ = nullptr;
-  const xla::HloProfilePrinter* hlo_profile_printer_ = nullptr;
+  const xla::HloProfilePrinterData* hlo_profile_printer_data_ = nullptr;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/xla_compiler.h b/tensorflow/compiler/tf2xla/xla_compiler.h
index 6a46e54f61cb4dbb2a2c1916696655a4e3d85fff..30d3c05ee9aa33accc0ad122901f70b0b6613104 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler.h
+++ b/tensorflow/compiler/tf2xla/xla_compiler.h
@@ -235,6 +235,19 @@ class XlaCompiler {
     // device is created, and can be used to create metadata objects
     // that can be accessed by XLA op kernels.
     std::function<Status(ResourceMgr*)>* populate_resource_manager = nullptr;
+
+    // If not nullptr, this memory allocator can be used by the compiler for
+    // temporary allocations it might want to make during compilation.
+    //
+    // For example, the compiler may want to try out different algorithms and
+    // choose the fastest one, and it might run those algorithms over buffers
+    // created using this allocator.
+    //
+    // The compiler can function correctly without an explicit allocator given
+    // here, but on some devices (notably, GPUs), TensorFlow tends to eagerly
+    // allocate most or all available memory on the device, leaving none for the
+    // compiler to access, unless it can use TensorFlow's allocator.
+    xla::DeviceMemoryAllocator* device_allocator = nullptr;
   };
 
   explicit XlaCompiler(Options options);
diff --git a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc
index 584417bc72c8f6645c05912e857b031cfb394e54..1fe6e69ff2dc838152032ac3d7b21de41684c6f6 100644
--- a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc
+++ b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc
@@ -182,10 +182,10 @@ XlaJitCompiledCpuFunction::Compile(
   jit->static_data_.program_shape = jit->program_shape_.get();
 
   if (cpu_executable->hlo_profiling_enabled()) {
-    jit->static_data_.hlo_profile_printer =
-        &cpu_executable->hlo_profile_printer();
+    jit->static_data_.hlo_profile_printer_data =
+        &cpu_executable->hlo_profile_printer_data();
     jit->static_data_.profile_counters_size =
-        cpu_executable->hlo_profile_printer().profile_counters_size();
+        cpu_executable->hlo_profile_printer_data().profile_counters_size();
   }
 
   return std::move(jit_unique_ptr);
diff --git a/tensorflow/compiler/xla/BUILD b/tensorflow/compiler/xla/BUILD
index 438f1443f17717a3806827abcb36d4ccbbbf756c..34e733bc8d80b364cec1783006eba0a5468b55ea 100644
--- a/tensorflow/compiler/xla/BUILD
+++ b/tensorflow/compiler/xla/BUILD
@@ -88,7 +88,6 @@ cc_library(
     visibility = [":friends"],
     deps = [
         "//tensorflow/core:framework_lite",
-        "//tensorflow/core:lib",
         "//third_party/eigen3",
     ],
 )
@@ -182,6 +181,7 @@ cc_library(
     deps = [
         ":status",
         ":status_macros",
+        ":statusor",
         ":types",
         ":xla_data_proto",
         "//tensorflow/core:lib",
diff --git a/tensorflow/compiler/xla/client/BUILD b/tensorflow/compiler/xla/client/BUILD
index d6b4ebfc39ae039ff27fe9fb8a3487c870832f3e..02356699a25e47be50eb15872df4c9c302fc289b 100644
--- a/tensorflow/compiler/xla/client/BUILD
+++ b/tensorflow/compiler/xla/client/BUILD
@@ -80,6 +80,18 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "executable_build_options",
+    srcs = ["executable_build_options.cc"],
+    hdrs = ["executable_build_options.h"],
+    deps = [
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/service:device_memory_allocator",
+        "//tensorflow/core:lib",
+    ],
+)
+
 cc_library(
     name = "local_client",
     srcs = ["local_client.cc"],
@@ -87,6 +99,7 @@ cc_library(
     deps = [
         ":client",
         ":computation",
+        ":executable_build_options",
         "//tensorflow/compiler/xla:executable_run_options",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
@@ -98,6 +111,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:executable",
         "//tensorflow/compiler/xla/service:local_service",
         "//tensorflow/compiler/xla/service:shaped_buffer",
+        "//tensorflow/compiler/xla/service:source_map_util",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
         "@llvm//:support",
diff --git a/tensorflow/compiler/xla/client/computation_builder.h b/tensorflow/compiler/xla/client/computation_builder.h
index d82ba63e8ad0b9ceac0eb5f0cd7720cac0cbe6d3..ea4cdb76673b1c99036224bcd754ce4fe1360945 100644
--- a/tensorflow/compiler/xla/client/computation_builder.h
+++ b/tensorflow/compiler/xla/client/computation_builder.h
@@ -67,7 +67,7 @@ class ComputationBuilder {
   // OpMetadata is often applied to a series of XLA HLO instructions. As a
   // result, OpMetadata is set on the Computation Builder. All subsequent
   // instructions generated via this Computation Builder will have the same
-  // OpMetadata attached until a call to ClearOpMetdata.
+  // OpMetadata attached until a call to ClearOpMetadata.
   void SetOpMetadata(const OpMetadata& metadata) { metadata_ = metadata; }
 
   // Clears the HloMetadata state.
diff --git a/tensorflow/compiler/xla/client/executable_build_options.cc b/tensorflow/compiler/xla/client/executable_build_options.cc
new file mode 100644
index 0000000000000000000000000000000000000000..804e34f5e75ce2d153ac7627b94a543fda88e810
--- /dev/null
+++ b/tensorflow/compiler/xla/client/executable_build_options.cc
@@ -0,0 +1,79 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/client/executable_build_options.h"
+
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+
+namespace xla {
+
+ExecutableBuildOptions& ExecutableBuildOptions::set_device_allocator(
+    DeviceMemoryAllocator* allocator) {
+  device_allocator_ = allocator;
+  return *this;
+}
+
+DeviceMemoryAllocator* ExecutableBuildOptions::device_allocator() const {
+  return device_allocator_;
+}
+
+ExecutableBuildOptions& ExecutableBuildOptions::set_device_ordinal(
+    int device_ordinal) {
+  CHECK_GE(device_ordinal, 0);
+  device_ordinal_ = device_ordinal;
+  return *this;
+}
+
+int ExecutableBuildOptions::device_ordinal() const { return device_ordinal_; }
+
+ExecutableBuildOptions& ExecutableBuildOptions::set_result_layout(
+    const Shape& shape_with_layout) {
+  result_layout_set_ = true;
+  result_layout_ = shape_with_layout;
+  return *this;
+}
+
+const Shape* ExecutableBuildOptions::result_layout() const {
+  return result_layout_set_ ? &result_layout_ : nullptr;
+}
+
+string ExecutableBuildOptions::ToString() const {
+  string result_layout = "nullopt";
+  if (result_layout_set_) {
+    result_layout = ShapeUtil::HumanStringWithLayout(result_layout_);
+  }
+  string generate_hlo_graph = "nullopt";
+  if (generate_hlo_graph_.has_value()) {
+    generate_hlo_graph = generate_hlo_graph_.value();
+  }
+  return tensorflow::strings::Printf(
+      "ExecutableBuildOptions{device_ordinal=%d, result_layout=%s, "
+      "generate_hlo_graph=%s}",
+      device_ordinal_, result_layout.c_str(), generate_hlo_graph.c_str());
+}
+
+ExecutableBuildOptions& ExecutableBuildOptions::set_generate_hlo_graph(
+    string regex) {
+  generate_hlo_graph_ = std::move(regex);
+  return *this;
+}
+
+const tensorflow::gtl::optional<string>&
+ExecutableBuildOptions::generate_hlo_graph() const {
+  return generate_hlo_graph_;
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/client/executable_build_options.h b/tensorflow/compiler/xla/client/executable_build_options.h
new file mode 100644
index 0000000000000000000000000000000000000000..3a52dbac9adb155ad9a7d91a8102707f70fe2fbf
--- /dev/null
+++ b/tensorflow/compiler/xla/client/executable_build_options.h
@@ -0,0 +1,74 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_CLIENT_EXECUTABLE_BUILD_OPTIONS_H_
+#define TENSORFLOW_COMPILER_XLA_CLIENT_EXECUTABLE_BUILD_OPTIONS_H_
+
+#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/gtl/optional.h"
+
+namespace xla {
+
+// Class containing options for building an LocalExecutable with
+// LocalClient::Compile.
+class ExecutableBuildOptions {
+ public:
+  // If set, this is the device to build the computation for. Valid
+  // device_ordinal values are: 0 to # of devices - 1. These values are
+  // identical to the device ordinal values used by StreamExecutor. The built
+  // executable will be executable on any device equivalent to the specified
+  // device as determined by Backend::devices_equivalent(). A value of -1
+  // indicates this option has not been set.
+  ExecutableBuildOptions& set_device_ordinal(int device_ordinal);
+  int device_ordinal() const;
+
+  // If set, this specifies the layout of the result of the computation. If not
+  // set, the service will chose the layout of the result. A Shape is used to
+  // store the layout to accommodate tuple result shapes. A value of nullptr
+  // indicates the option has not been set.
+  ExecutableBuildOptions& set_result_layout(const Shape& shape_with_layout);
+  const Shape* result_layout() const;
+
+  // If set, this specifies an allocator that can be used to allocate temporary
+  // space on the device during compilation.  For example, the compiler might
+  // want to run various algorithms on the device and pick the fastest one -- it
+  // might allocate buffers for use by these algorithms using this allocator.
+  //
+  // This does not need to be the same as the DeviceMemoryAllocator passed when
+  // running the executable.
+  ExecutableBuildOptions& set_device_allocator(
+      DeviceMemoryAllocator* allocator);
+  DeviceMemoryAllocator* device_allocator() const;
+
+  // If set, specifies a regexp of HLO graphs to dump (as in DebugOptions).
+  ExecutableBuildOptions& set_generate_hlo_graph(string regex);
+  const tensorflow::gtl::optional<string>& generate_hlo_graph() const;
+
+  // Returns a string representation of the build options, suitable for
+  // debugging.
+  string ToString() const;
+
+ private:
+  int device_ordinal_ = -1;
+  Shape result_layout_;
+  bool result_layout_set_ = false;
+  tensorflow::gtl::optional<string> generate_hlo_graph_;
+  DeviceMemoryAllocator* device_allocator_ = nullptr;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_CLIENT_EXECUTABLE_BUILD_OPTIONS_H_
diff --git a/tensorflow/compiler/xla/client/local_client.cc b/tensorflow/compiler/xla/client/local_client.cc
index 523169fdd266d445c9d0d056ba20091f77610ad9..ef98dbb6403beedb0c08ab9a0fc9e7d4ee31ab3b 100644
--- a/tensorflow/compiler/xla/client/local_client.cc
+++ b/tensorflow/compiler/xla/client/local_client.cc
@@ -21,30 +21,14 @@ limitations under the License.
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/backend.h"
 #include "tensorflow/compiler/xla/service/service_executable_run_options.h"
+#include "tensorflow/compiler/xla/service/source_map_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 
 namespace se = ::perftools::gputools;
 
-namespace xla {
-
-ExecutableBuildOptions& ExecutableBuildOptions::set_device_ordinal(
-    int device_ordinal) {
-  device_ordinal_ = device_ordinal;
-  return *this;
-}
-
-int ExecutableBuildOptions::device_ordinal() const { return device_ordinal_; }
+using xla::source_map_util::InvalidParameterArgument;
 
-ExecutableBuildOptions& ExecutableBuildOptions::set_result_layout(
-    const Shape& shape_with_layout) {
-  result_layout_set_ = true;
-  result_layout_ = shape_with_layout;
-  return *this;
-}
-
-const Shape* ExecutableBuildOptions::result_layout() const {
-  return result_layout_set_ ? &result_layout_ : nullptr;
-}
+namespace xla {
 
 namespace {
 StatusOr<Backend::StreamPtr> BorrowStreamForDevice(int device_ordinal,
@@ -57,16 +41,18 @@ StatusOr<Backend::StreamPtr> BorrowStreamForDevice(int device_ordinal,
 }  // namespace
 
 LocalExecutable::LocalExecutable(std::unique_ptr<Executable> executable,
-                                 Backend* backend, int device_ordinal,
-                                 const ExecutableBuildOptions& build_options)
+                                 Backend* backend,
+                                 ExecutableBuildOptions build_options)
     : executable_(std::move(executable)),
       backend_(backend),
-      build_device_ordinal_(device_ordinal),
-      build_options_(build_options) {}
+      build_options_(std::move(build_options)) {
+  CHECK_GE(build_options_.device_ordinal(), 0)
+      << "Must have a valid device ordinal that the executable was built for.";
+}
 
 tensorflow::Status LocalExecutable::ValidateExecutionOptions(
     const tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
-    const ExecutableRunOptions& options, const Backend& backend) {
+    const ExecutableRunOptions& run_options, const Backend& backend) {
   const ComputationLayout& computation_layout =
       executable_->module_config().entry_computation_layout();
 
@@ -79,9 +65,10 @@ tensorflow::Status LocalExecutable::ValidateExecutionOptions(
   for (int i = 0; i < arguments.size(); ++i) {
     if (!computation_layout.parameter_layout(i).MatchesLayoutInShape(
             arguments[i]->on_host_shape())) {
-      return InvalidArgument(
-          "argument does not match shape or layout of computation parameter "
-          "%d: expected %s, got %s",
+      return InvalidParameterArgument(
+          executable_.get(), i,
+          "Argument does not match shape or layout of computation parameter "
+          "%d: want %s, got %s",
           i,
           ShapeUtil::HumanString(computation_layout.parameter_layout(i).shape())
               .c_str(),
@@ -89,14 +76,14 @@ tensorflow::Status LocalExecutable::ValidateExecutionOptions(
     }
   }
 
-  if (options.stream() != nullptr) {
-    if (!options.stream()->ok()) {
+  if (run_options.stream() != nullptr) {
+    if (!run_options.stream()->ok()) {
       return InvalidArgument("stream is uninitialized or in an error state");
     }
 
     // Check stream matches service platform.
     const se::Platform* stream_platform =
-        options.stream()->parent()->platform();
+        run_options.stream()->parent()->platform();
     if (stream_platform != backend_->platform()) {
       return InvalidArgument(
           "stream is for platform %s, but service targets platform %s",
@@ -106,7 +93,7 @@ tensorflow::Status LocalExecutable::ValidateExecutionOptions(
 
     // Cannot specify device_ordinal with a stream. The stream determines these
     // values.
-    if (options.device_ordinal() != -1) {
+    if (run_options.device_ordinal() != -1) {
       return InvalidArgument(
           "cannot set both device ordinal and stream options in "
           "ExecutableRunOptions; the stream determines the device ordinal");
@@ -115,34 +102,34 @@ tensorflow::Status LocalExecutable::ValidateExecutionOptions(
 
   // Verify that the device the executable was built for is equivalent to the
   // device it will run on.
-  int run_device_ordinal = options.device_ordinal() == -1
+  int run_device_ordinal = run_options.device_ordinal() == -1
                                ? backend_->default_device_ordinal()
-                               : options.device_ordinal();
-  TF_ASSIGN_OR_RETURN(
-      bool devices_equivalent,
-      backend_->devices_equivalent(run_device_ordinal, build_device_ordinal_));
+                               : run_options.device_ordinal();
+  TF_ASSIGN_OR_RETURN(bool devices_equivalent,
+                      backend_->devices_equivalent(
+                          run_device_ordinal, build_options_.device_ordinal()));
   if (!devices_equivalent) {
     TF_ASSIGN_OR_RETURN(se::StreamExecutor * run_executor,
                         backend_->stream_executor(run_device_ordinal));
     TF_ASSIGN_OR_RETURN(se::StreamExecutor * build_executor,
-                        backend_->stream_executor(build_device_ordinal_));
+                        backend_->stream_executor(build_device_ordinal()));
     return InvalidArgument(
         "executable is built for device %s of type \"%s\"; cannot run it on "
         "device %s of type \"%s\"",
-        backend_->device_name(build_device_ordinal_).c_str(),
+        backend_->device_name(build_device_ordinal()).c_str(),
         build_executor->GetDeviceDescription().name().c_str(),
         backend_->device_name(run_device_ordinal).c_str(),
         run_executor->GetDeviceDescription().name().c_str());
   }
 
-  if (!options.allocator()) {
+  if (!run_options.allocator()) {
     return InvalidArgument("an allocator must be provided to ExecuteLocally");
   }
 
-  if (options.allocator()->platform() != backend.platform()) {
+  if (run_options.allocator()->platform() != backend.platform()) {
     return InvalidArgument(
         "allocator platform (%s) does not match service platform (%s)",
-        options.allocator()->platform()->Name().c_str(),
+        run_options.allocator()->platform()->Name().c_str(),
         backend.platform()->Name().c_str());
   }
 
@@ -151,23 +138,22 @@ tensorflow::Status LocalExecutable::ValidateExecutionOptions(
 
 StatusOr<std::unique_ptr<ScopedShapedBuffer>> LocalExecutable::Run(
     const tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
-    const ExecutableRunOptions& options) {
-  TF_RETURN_IF_ERROR(ValidateExecutionOptions(arguments, options, *backend_));
-
-  ExecutableRunOptions actual_options = options;
+    ExecutableRunOptions run_options) {
+  TF_RETURN_IF_ERROR(
+      ValidateExecutionOptions(arguments, run_options, *backend_));
 
   Backend::StreamPtr stream;
-  if (options.stream() == nullptr) {
+  if (run_options.stream() == nullptr) {
     // NB!  The lifetime of `stream` needs to match the lifetime of
     // `actual_options` (otherwise we will end up using a returned stream in
     // ExecuteOnStreamWrapper), which is why it isn't declared in the inner "if"
     // scope.
     TF_ASSIGN_OR_RETURN(
-        stream, BorrowStreamForDevice(options.device_ordinal(), backend_));
-    actual_options.set_stream(stream.get());
+        stream, BorrowStreamForDevice(run_options.device_ordinal(), backend_));
+    run_options.set_stream(stream.get());
   }
-  if (options.allocator() == nullptr) {
-    actual_options.set_allocator(backend_->memory_allocator());
+  if (run_options.allocator() == nullptr) {
+    run_options.set_allocator(backend_->memory_allocator());
   }
 
   // For local client execution on CPU backends:
@@ -176,7 +162,7 @@ StatusOr<std::unique_ptr<ScopedShapedBuffer>> LocalExecutable::Run(
   // *) The thread pool used for XLA CPU ops is from
   //    backend_->eigen_intra_op_thread_pool().
   ServiceExecutableRunOptions service_options(
-      actual_options, backend_->StreamBorrower(),
+      run_options, backend_->StreamBorrower(),
       backend_->eigen_intra_op_thread_pool());
 
   if (executable_->dumping()) {
@@ -185,9 +171,8 @@ StatusOr<std::unique_ptr<ScopedShapedBuffer>> LocalExecutable::Run(
   TF_ASSIGN_OR_RETURN(
       std::unique_ptr<ShapedBuffer> result,
       executable_->ExecuteOnStreamWrapper(
-          &service_options, options.execution_profile(), arguments));
-  return ScopedShapedBuffer::MakeScoped(result.get(),
-                                        actual_options.allocator());
+          &service_options, run_options.execution_profile(), arguments));
+  return ScopedShapedBuffer::MakeScoped(result.get(), run_options.allocator());
 }
 
 StatusOr<std::unique_ptr<ScopedShapedBuffer>> LocalExecutable::ExecuteAndDump(
@@ -263,16 +248,19 @@ StatusOr<std::unique_ptr<LocalExecutable>> LocalClient::Compile(
     const Computation& computation,
     const tensorflow::gtl::ArraySlice<const Shape*> argument_layouts,
     const ExecutableBuildOptions& options) {
-  int device_ordinal = options.device_ordinal() == -1
-                           ? default_device_ordinal()
-                           : options.device_ordinal();
-  TF_ASSIGN_OR_RETURN(std::unique_ptr<Executable> executable,
-                      local_service_->CompileExecutable(
-                          computation.handle(), argument_layouts,
-                          options.result_layout(), device_ordinal));
+  ExecutableBuildOptions updated_options = options;
+  if (options.device_ordinal() == -1) {
+    updated_options.set_device_ordinal(default_device_ordinal());
+    VLOG(3) << "Set device ordinal to default value of: "
+            << updated_options.device_ordinal();
+  }
+  TF_ASSIGN_OR_RETURN(
+      std::unique_ptr<Executable> executable,
+      local_service_->CompileExecutable(computation.handle(), argument_layouts,
+                                        updated_options));
   return WrapUnique(new LocalExecutable(std::move(executable),
                                         local_service_->mutable_backend(),
-                                        device_ordinal, options));
+                                        updated_options));
 }
 
 StatusOr<std::unique_ptr<ScopedShapedBuffer>>
diff --git a/tensorflow/compiler/xla/client/local_client.h b/tensorflow/compiler/xla/client/local_client.h
index 19fd14f76bc69d528193f7981a51a305f03f987e..b52a30f5a0b92e0094e6b0de3241c10a5a909cad 100644
--- a/tensorflow/compiler/xla/client/local_client.h
+++ b/tensorflow/compiler/xla/client/local_client.h
@@ -20,6 +20,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/client/client.h"
 #include "tensorflow/compiler/xla/client/computation.h"
+#include "tensorflow/compiler/xla/client/executable_build_options.h"
 #include "tensorflow/compiler/xla/executable_run_options.h"
 #include "tensorflow/compiler/xla/service/compiler.h"
 #include "tensorflow/compiler/xla/service/device_memory_allocator.h"
@@ -33,39 +34,13 @@ limitations under the License.
 
 namespace xla {
 
-// Class containing options for building an LocalExecutable with
-// LocalClient::Compile.
-class ExecutableBuildOptions {
- public:
-  // If set, this is the device to build the computation for. Valid
-  // device_ordinal values are: 0 to # of devices - 1. These values are
-  // identical to the device ordinal values used by StreamExecutor. The built
-  // executable will be executable on any device equivalent to the specified
-  // device as determined by Backend::devices_equivalent(). A value of -1
-  // indicates this option has not been set.
-  ExecutableBuildOptions& set_device_ordinal(int device_ordinal);
-  int device_ordinal() const;
-
-  // If set, this specifies the layout of the result of the computation. If not
-  // set, the service will chose the layout of the result. A Shape is used to
-  // store the layout to accommodate tuple result shapes. A value of nullptr
-  // indicates the option has not been set.
-  ExecutableBuildOptions& set_result_layout(const Shape& shape_with_layout);
-  const Shape* result_layout() const;
-
- private:
-  int device_ordinal_ = -1;
-  Shape result_layout_;
-  bool result_layout_set_ = false;
-};
-
 class LocalExecutable {
  public:
   // Run the compiled computation with the given arguments and options and
   // return the result.
   StatusOr<std::unique_ptr<ScopedShapedBuffer>> Run(
       const tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
-      const ExecutableRunOptions& options);
+      ExecutableRunOptions run_options);
 
   // Return the layout (contained in a shape) of the result produced by the
   // computation.
@@ -88,8 +63,7 @@ class LocalExecutable {
 
   // Constructor invoked by LocalClient.
   LocalExecutable(std::unique_ptr<Executable> executable, Backend* backend,
-                  int device_ordinal,
-                  const ExecutableBuildOptions& build_options);
+                  ExecutableBuildOptions build_options);
 
   // Validates that the given arguments and options satisfy various constraints
   // of the computation.
@@ -117,19 +91,19 @@ class LocalExecutable {
   StatusOr<std::unique_ptr<Literal>> LiteralFromShapedBuffer(
       const ShapedBuffer& shaped_buffer);
 
+  // The ordinal of the device which this executable was compiled for. The
+  // executable can run on all equivalent devices (as determined by
+  // Backend::devices_equivalent).
+  int build_device_ordinal() const { return build_options_.device_ordinal(); }
+
   // Compiled computation.
   std::unique_ptr<Executable> executable_;
 
   // Execution backend.
-  Backend* backend_;
-
-  // The ordinal of the device which this executable was compiled for. The
-  // executable can run on all equivalent devices (as determined by
-  // Backend::devices_equivalent).
-  int build_device_ordinal_;
+  Backend* backend_ = nullptr;
 
   // Options used to build the executable.
-  const ExecutableBuildOptions& build_options_;
+  const ExecutableBuildOptions build_options_;
 };
 
 // An XLA Client specialization for use when the client and service run in
diff --git a/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc b/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc
index fe3a4d2f6df47d9f156529e55198a5f339bc8e3c..c8ed3e3a2b009ddffdfb79a9a6ced8d5e736bee6 100644
--- a/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc
+++ b/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc
@@ -221,13 +221,19 @@ void AllocateFlags() {
           flag_values->xla_gpu_disable_multi_streaming(),
           "If true, multi-streaming in the GPU backend is disabled."),
       tensorflow::Flag(
-          "xla_dump_hlo_proto_to", flag_values->mutable_xla_dump_hlo_proto_to(),
-          "Dump compilation artifacts as proto binary into this directory."),
+          "xla_dump_optimized_hlo_proto_to",
+          flag_values->mutable_xla_dump_optimized_hlo_proto_to(),
+          "Dump Hlo after all hlo passes are executed as proto binary into "
+          "this directory."),
       tensorflow::Flag(
-          "xla_dump_prepass_hlo_proto_to",
-          flag_values->mutable_xla_dump_prepass_hlo_proto_to(),
-          "Dump compilation artifacts, before hlo passes are executed, as "
-          "proto binary into this directory."),
+          "xla_dump_unoptimized_hlo_proto_to",
+          flag_values->mutable_xla_dump_unoptimized_hlo_proto_to(),
+          "Dump HLO before any hlo passes are executed as proto binary into "
+          "this directory."),
+      tensorflow::Flag("xla_dump_per_pass_hlo_proto_to",
+                       flag_values->mutable_xla_dump_per_pass_hlo_proto_to(),
+                       "Dump HLO after each pass as an HloProto in binary file "
+                       "format into this directory."),
       tensorflow::Flag(
           "xla_test_all_output_layouts",
           bool_setter_for(&DebugOptions::set_xla_test_all_output_layouts),
diff --git a/tensorflow/compiler/xla/literal_util.h b/tensorflow/compiler/xla/literal_util.h
index e0196509a7483abac3d9c0e59a54b591a327b980..2b68b8f177dcae19f55a45f2f284ea10f7ce6e52 100644
--- a/tensorflow/compiler/xla/literal_util.h
+++ b/tensorflow/compiler/xla/literal_util.h
@@ -486,6 +486,7 @@ class Literal {
       std::vector<std::unique_ptr<Literal>> elements);
 
   // Returns a string representation of the literal value.
+  // Warning: this function can take minutes for multi-million element Literals.
   string ToString(bool print_layout = false) const;
 
   // Invokes the "per cell" callback for each element in the provided
diff --git a/tensorflow/compiler/xla/map_util.h b/tensorflow/compiler/xla/map_util.h
index 50659c12405f2a29c69b03b3c7de5bd6cb6af9c2..0ad0b9133075fdb29effe1be2c007a851f420a33 100644
--- a/tensorflow/compiler/xla/map_util.h
+++ b/tensorflow/compiler/xla/map_util.h
@@ -16,6 +16,11 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_MAP_UTIL_H_
 #define TENSORFLOW_COMPILER_XLA_MAP_UTIL_H_
 
+#include <functional>
+#include <sstream>
+
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace xla {
@@ -44,6 +49,22 @@ typename Collection::value_type::second_type& FindOrDie(
   return it->second;
 }
 
+// Like FindOrDie but returns an error instead of dying if `key` is not in
+// `container`.
+template <class Collection>
+StatusOr<
+    std::reference_wrapper<const typename Collection::value_type::second_type>>
+MaybeFind(const Collection& collection,
+          const typename Collection::value_type::first_type& key) {
+  typename Collection::const_iterator it = collection.find(key);
+  if (it == collection.end()) {
+    std::ostringstream os;
+    os << key;
+    return NotFound("key not found: %s", os.str().c_str());
+  }
+  return {it->second};
+}
+
 // Inserts the key-value pair into the collection. Dies if key was already
 // present.
 template <class Collection>
diff --git a/tensorflow/compiler/xla/python/BUILD b/tensorflow/compiler/xla/python/BUILD
index a8ca0e3ea0115d412e96ebacb320cc0dde061dff..e2972f06016ab3555c4fc0cc4616993fe6764b1e 100644
--- a/tensorflow/compiler/xla/python/BUILD
+++ b/tensorflow/compiler/xla/python/BUILD
@@ -49,6 +49,7 @@ cc_library(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:client_library",
         "//tensorflow/compiler/xla/client:computation_builder",
+        "//tensorflow/compiler/xla/client:executable_build_options",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/service:shaped_buffer",
         "//tensorflow/core:framework_lite",
diff --git a/tensorflow/compiler/xla/python/local_computation_builder.cc b/tensorflow/compiler/xla/python/local_computation_builder.cc
index 37f1eada2bc9f5ef72d99a835a17b4e78a354ae6..8386acf0cd4ac724ae5fad618df41b1a6bee8cce 100644
--- a/tensorflow/compiler/xla/python/local_computation_builder.cc
+++ b/tensorflow/compiler/xla/python/local_computation_builder.cc
@@ -98,15 +98,25 @@ const std::unique_ptr<ScopedShapedBuffer>& LocalShapedBuffer::shaped_buffer()
   return shaped_buffer_;
 }
 
+static StatusOr<std::unique_ptr<ScopedShapedBuffer>> ToBuffer(
+    LocalClient* client, int device_ordinal, const Literal& arg) {
+  return client->LiteralToShapedBuffer(arg, device_ordinal,
+                                       client->backend().memory_allocator());
+}
+
 /* static */
-LocalShapedBuffer* LocalShapedBuffer::FromLiteral(const Literal& argument) {
+LocalShapedBuffer* LocalShapedBuffer::FromLiteral(
+    const Literal& argument,
+    const tensorflow::gtl::optional<Shape>& shape_with_layout) {
   LocalClient* client = GetOrCreateLocalClient();
-  std::unique_ptr<ScopedShapedBuffer> buf =
-      client
-          ->LiteralToShapedBuffer(argument,
-                                  /*device_ordinal=*/0,
-                                  client->backend().memory_allocator())
-          .ConsumeValueOrDie();
+  std::unique_ptr<ScopedShapedBuffer> buf;
+  if (shape_with_layout) {
+    std::unique_ptr<Literal> relaid =
+        argument.Relayout(shape_with_layout.value());
+    buf = ToBuffer(client, /*device_ordinal=*/0, *relaid).ConsumeValueOrDie();
+  } else {
+    buf = ToBuffer(client, /*device_ordinal=*/0, argument).ConsumeValueOrDie();
+  }
   return new LocalShapedBuffer(std::move(buf));
 }
 
@@ -120,7 +130,8 @@ CompiledLocalComputation::CompiledLocalComputation(
     : executable_(std::move(executable)) {}
 
 StatusOr<std::unique_ptr<Literal>> CompiledLocalComputation::Execute(
-    const std::vector<Literal>& arguments) {
+    const std::vector<Literal>& arguments,
+    const std::vector<tensorflow::gtl::optional<Shape>>& shapes_with_layout) {
   LocalClient* client = GetOrCreateLocalClient();
 
   VLOG(1) << "Execution requested with " << GetReplicaCount() << " replicas.";
@@ -133,7 +144,8 @@ StatusOr<std::unique_ptr<Literal>> CompiledLocalComputation::Execute(
                                         GetReplicaCount());
 
     for (int replica = 0; replica < GetReplicaCount(); ++replica) {
-      pool.Schedule([this, client, replica, &arguments, &results] {
+      pool.Schedule([this, client, replica, &arguments, &shapes_with_layout,
+                     &results] {
         StatusOr<int> device_ordinal_status =
             client->ReplicaNumberToDeviceOrdinal(replica);
         if (!device_ordinal_status.ok()) {
@@ -144,18 +156,28 @@ StatusOr<std::unique_ptr<Literal>> CompiledLocalComputation::Execute(
         VLOG(3) << "Replica " << replica
                 << " mapped to device ordinal for execution: "
                 << device_ordinal;
+
         // Transfer arguments in
         std::vector<std::unique_ptr<ScopedShapedBuffer>> scoped_buffers;
         scoped_buffers.reserve(arguments.size());
-        for (const Literal& argument : arguments) {
-          StatusOr<std::unique_ptr<ScopedShapedBuffer>> pushed =
-              client->LiteralToShapedBuffer(
-                  argument, device_ordinal,
-                  client->backend().memory_allocator());
+        for (int i = 0; i < arguments.size(); ++i) {
+          const Literal& argument = arguments[i];
+          const tensorflow::gtl::optional<Shape>& shape_with_layout =
+              shapes_with_layout[i];
+
+          StatusOr<std::unique_ptr<ScopedShapedBuffer>> pushed;
+          if (shape_with_layout) {
+            std::unique_ptr<Literal> relaid =
+                argument.Relayout(shape_with_layout.value());
+            pushed = ToBuffer(client, device_ordinal, *relaid);
+          } else {
+            pushed = ToBuffer(client, device_ordinal, argument);
+          }
           if (!pushed.ok()) {
             results[replica] = pushed.status();
             return;
           }
+
           scoped_buffers.push_back(std::move(pushed).ValueOrDie());
         }
 
@@ -233,7 +255,8 @@ LocalComputation::LocalComputation(Computation computation)
     : computation_(std::move(computation)) {}
 
 StatusOr<CompiledLocalComputation*> LocalComputation::Compile(
-    const std::vector<Shape>& argument_shapes) {
+    const std::vector<Shape>& argument_shapes,
+    const ExecutableBuildOptions* build_options) {
   std::vector<const Shape*> argument_shape_pointers;
   argument_shape_pointers.reserve(argument_shapes.size());
   for (auto& argument_shape : argument_shapes) {
@@ -242,6 +265,9 @@ StatusOr<CompiledLocalComputation*> LocalComputation::Compile(
 
   LocalClient* client = GetOrCreateLocalClient();
   ExecutableBuildOptions options;
+  if (build_options != nullptr) {
+    options = *build_options;
+  }
   TF_ASSIGN_OR_RETURN(
       auto local_executable,
       client->Compile(computation_, argument_shape_pointers, options));
@@ -363,12 +389,6 @@ LocalComputationBuilder::SelectAndScatterWithGeneralPadding(
       source, init_value, scatter.computation());
 }
 
-ComputationDataHandle LocalComputationBuilder::Select(
-    const ComputationDataHandle& pred, const ComputationDataHandle& on_true,
-    const ComputationDataHandle& on_false) {
-  return builder_.Select(pred, on_true, on_false);
-}
-
 ComputationDataHandle LocalComputationBuilder::Tuple(
     tensorflow::gtl::ArraySlice<ComputationDataHandle> elements) {
   return builder_.Tuple(elements);
@@ -384,6 +404,12 @@ ComputationDataHandle LocalComputationBuilder::Dot(
   return builder_.Dot(lhs, rhs);
 }
 
+ComputationDataHandle LocalComputationBuilder::DotGeneral(
+    const ComputationDataHandle& lhs, const ComputationDataHandle& rhs,
+    const DotDimensionNumbers& dimension_numbers) {
+  return builder_.DotGeneral(lhs, rhs, dimension_numbers);
+}
+
 ComputationDataHandle LocalComputationBuilder::ConvGeneralDilated(
     const ComputationDataHandle& lhs, const ComputationDataHandle& rhs,
     tensorflow::gtl::ArraySlice<int64> window_strides,
@@ -483,6 +509,15 @@ ComputationDataHandle LocalComputationBuilder::While(
        tensorflow::gtl::ArraySlice<int64> broadcast_dimensions),           \
       (lhs, rhs, broadcast_dimensions))
 
+#define _FORWARD_TRIOP(method_name)                                        \
+  _FORWARD(                                                                \
+      method_name, ComputationDataHandle,                                  \
+      (const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, \
+       const ComputationDataHandle& ehs),                                  \
+      (lhs, rhs, ehs))
+
+_FORWARD_TRIOP(Select)
+_FORWARD_TRIOP(Clamp)
 _FORWARD_BINOP(Eq)
 _FORWARD_BINOP(Ne)
 _FORWARD_BINOP(Ge)
@@ -503,6 +538,7 @@ _FORWARD_UNOP(Abs)
 _FORWARD_UNOP(Exp)
 _FORWARD_UNOP(Floor)
 _FORWARD_UNOP(Ceil)
+_FORWARD_UNOP(Round)
 _FORWARD_UNOP(Log)
 _FORWARD_UNOP(Sign)
 _FORWARD_UNOP(Cos)
@@ -519,6 +555,7 @@ _FORWARD_UNOP(Sort)
 #undef _FORWARD
 #undef _FORWARD_UNOP
 #undef _FORWARD_BINOP
+#undef _FORWARD_TRIOP
 
 void DeleteLocalShapedBuffer(LocalShapedBuffer* local_shaped_buffer) {
   delete local_shaped_buffer;
diff --git a/tensorflow/compiler/xla/python/local_computation_builder.h b/tensorflow/compiler/xla/python/local_computation_builder.h
index e5503cd52fa60eff30eea38c83aafe0f0ff1efc8..f39d15cff70f795dccd64825cbb142a173790a50 100644
--- a/tensorflow/compiler/xla/python/local_computation_builder.h
+++ b/tensorflow/compiler/xla/python/local_computation_builder.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/client/client_library.h"
 #include "tensorflow/compiler/xla/client/computation_builder.h"
+#include "tensorflow/compiler/xla/client/executable_build_options.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/service/shaped_buffer.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
@@ -58,7 +59,9 @@ StatusOr<std::unique_ptr<Literal> > TransferFromOutfeedLocalReplica(
 // client.
 class LocalShapedBuffer {
  public:
-  static LocalShapedBuffer* FromLiteral(const Literal& argument);
+  static LocalShapedBuffer* FromLiteral(
+      const Literal& argument,
+      const tensorflow::gtl::optional<Shape>& shape_with_layout);
   LocalShapedBuffer(std::unique_ptr<ScopedShapedBuffer> shaped_buffer);
   const std::unique_ptr<ScopedShapedBuffer>& shaped_buffer() const;
   std::unique_ptr<Literal> ToLiteral() const;
@@ -76,8 +79,15 @@ class LocalShapedBuffer {
 class CompiledLocalComputation {
  public:
   CompiledLocalComputation(std::unique_ptr<LocalExecutable> executable);
+
+  // Execute the computation with the given argument literals, and
+  // with optionally-specified argument layouts. The literals will be
+  // re-laid out according to the corresponding elements of
+  // shapes_with_layout.
   StatusOr<std::unique_ptr<Literal> > Execute(
-      const std::vector<Literal>& arguments);
+      const std::vector<Literal>& arguments,
+      const std::vector<tensorflow::gtl::optional<Shape> >& shapes_with_layout);
+
   LocalShapedBuffer* ExecuteWithShapedBuffers(
       tensorflow::gtl::ArraySlice<LocalShapedBuffer*> argument_handles);
 
@@ -93,7 +103,8 @@ class LocalComputation {
  public:
   LocalComputation(Computation computation);
   StatusOr<CompiledLocalComputation*> Compile(
-      const std::vector<Shape>& argument_shapes);
+      const std::vector<Shape>& argument_shapes,
+      const ExecutableBuildOptions* build_options);
   const Computation& computation() const;
 
  private:
@@ -172,10 +183,6 @@ class LocalComputationBuilder {
       const ComputationDataHandle& source,
       const ComputationDataHandle& init_value, const LocalComputation& scatter);
 
-  ComputationDataHandle Select(const ComputationDataHandle& pred,
-                               const ComputationDataHandle& on_true,
-                               const ComputationDataHandle& on_false);
-
   ComputationDataHandle Tuple(
       tensorflow::gtl::ArraySlice<ComputationDataHandle> elements);
 
@@ -185,6 +192,10 @@ class LocalComputationBuilder {
   ComputationDataHandle Dot(const ComputationDataHandle& lhs,
                             const ComputationDataHandle& rhs);
 
+  ComputationDataHandle DotGeneral(
+      const ComputationDataHandle& lhs, const ComputationDataHandle& rhs,
+      const DotDimensionNumbers& dimension_numbers);
+
   ComputationDataHandle ConvGeneralDilated(
       const ComputationDataHandle& lhs, const ComputationDataHandle& rhs,
       tensorflow::gtl::ArraySlice<int64> window_strides,
@@ -252,6 +263,14 @@ class LocalComputationBuilder {
       (const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, \
        tensorflow::gtl::ArraySlice<int64> broadcast_dimensions))
 
+#define _FORWARD_TRIOP(method_name)                                        \
+  _FORWARD(                                                                \
+      method_name, ComputationDataHandle,                                  \
+      (const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, \
+       const ComputationDataHandle& ehs))
+
+  _FORWARD_TRIOP(Select)
+  _FORWARD_TRIOP(Clamp)
   _FORWARD_BINOP(Eq)
   _FORWARD_BINOP(Ne)
   _FORWARD_BINOP(Ge)
@@ -272,6 +291,7 @@ class LocalComputationBuilder {
   _FORWARD_UNOP(Exp)
   _FORWARD_UNOP(Floor)
   _FORWARD_UNOP(Ceil)
+  _FORWARD_UNOP(Round)
   _FORWARD_UNOP(Log)
   _FORWARD_UNOP(Sign)
   _FORWARD_UNOP(Cos)
@@ -288,6 +308,7 @@ class LocalComputationBuilder {
 #undef _FORWARD
 #undef _FORWARD_UNOP
 #undef _FORWARD_BINOP
+#undef _FORWARD_TRIOP
 
  private:
   ComputationBuilder builder_;
diff --git a/tensorflow/compiler/xla/python/local_computation_builder.i b/tensorflow/compiler/xla/python/local_computation_builder.i
index 31789259609714e7d20247eec072e05a181715e6..5ea75550c96dfb3c1a4534dc1c6c171774c598cd 100644
--- a/tensorflow/compiler/xla/python/local_computation_builder.i
+++ b/tensorflow/compiler/xla/python/local_computation_builder.i
@@ -27,12 +27,14 @@ limitations under the License.
 //  ArraySlice<ComputationDataHandle>  <-  sequence of int
 //  Literal                            <-> (nested tuple of) numpy ndarray
 //  std::vector<Literal>               <-  sequence of (nested tuple of) ndarray
-//  Shape                              <-> pair holding (dtype, dimensions)
-//  std::vector<Shape>                 <-  sequence of shape information pairs
+//  Shape                               -> pair holding (dtype, dimensions)
+//                                     <-  object duck-typed as xla_client.Shape
+//  std::vector<Shape>                 <-  sequence of xla_client.Shape objects
 //  PrimitiveType                      <-  int
 //  ArraySlice<pair<int64, in64>>      <-  sequence of int pairs
 //  PaddingConfig proto                <-  corresponding Python proto
 //  ConvolutionDimensionNumbers proto  <-  corresponding Python proto
+//  DotDimensionNumbers proto          <-  corresponding Python proto
 //
 // Arrows indicate whether a conversion only ever occurs in one
 // direction, or whether it is maintained bidirectionally.
@@ -55,7 +57,7 @@ limitations under the License.
 // translates to a tuple-shaped XLA Literal, whose component subshapes
 // are a 2x3 F32-shaped literal followed by two tuple-shaped literals.
 //
-// The Python objects corresponding to C++ Shapes have the type:
+// Shapes output by C++ become Python objects with the type:
 //
 //   T            = (dtype, S)
 //   S            = DIMENSIONS | TUPLE_SHAPES
@@ -176,6 +178,16 @@ tensorflow::ImportNumpy();
   }
 }
 
+%typemap(out) StatusOr< std::unique_ptr<Literal> > {
+  if ($1.ok()) {
+    std::unique_ptr<Literal> value = $1.ConsumeValueOrDie();
+    $result = numpy::PyObjectFromXlaLiteral(*value);
+  } else {
+    PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
+    return NULL;
+  }
+}
+
 %typemap(out) StatusOr<xla::swig::LocalComputation*> {
   if ($1.ok()) {
     auto* value = $1.ValueOrDie();
@@ -343,15 +355,31 @@ tensorflow::ImportNumpy();
 // Shape
 
 %typemap(in) const Shape& (Shape temp) {
-  Status shape_status = numpy::CheckPyShapeInfo($input);
-  if (!shape_status.ok()) {
-    PyErr_SetString(PyExc_RuntimeError, shape_status.ToString().c_str());
+  StatusOr<Shape> statusor = numpy::XlaShapeFromPyShape($input);
+  if (!statusor.ok()) {
+    PyErr_SetString(PyExc_RuntimeError, statusor.status().ToString().c_str());
     return NULL;
   }
-  temp = numpy::XlaShapeFromPyShapeInfo($input);
+  temp = std::move(statusor).ValueOrDie();
   $1 = &temp;
 }
 
+%typemap(in) const tensorflow::gtl::optional<Shape>& (
+    tensorflow::gtl::optional<Shape> temp) {
+  if ($input == Py_None) {
+    temp = tensorflow::gtl::nullopt;
+    $1 = &temp;
+  } else {
+    StatusOr<Shape> statusor = numpy::XlaShapeFromPyShape($input);
+    if (!statusor.ok()) {
+      PyErr_SetString(PyExc_RuntimeError, statusor.status().ToString().c_str());
+      return NULL;
+    }
+    temp = std::move(statusor).ValueOrDie();
+    $1 = &temp;
+  }
+}
+
 %typemap(out) std::unique_ptr<Shape> {
   $result = numpy::PyShapeInfoFromXlaShape(*$1);
 }
@@ -364,14 +392,37 @@ tensorflow::ImportNumpy();
   const int size = PySequence_Size($input);
   for (int i = 0; i < size; ++i) {
     PyObject* o = PySequence_GetItem($input, i);
-    Status shape_status = numpy::CheckPyShapeInfo(o);
-    if (!shape_status.ok()) {
-      PyErr_SetString(PyExc_RuntimeError, shape_status.ToString().c_str());
-      Py_DECREF(o);
+    StatusOr<Shape> statusor = numpy::XlaShapeFromPyShape(o);
+    Py_DECREF(o);
+    if (!statusor.ok()) {
+      PyErr_SetString(PyExc_RuntimeError, statusor.status().ToString().c_str());
       return NULL;
     }
-    temps.push_back(numpy::XlaShapeFromPyShapeInfo(o));
-    Py_DECREF(o);
+    temps.push_back(statusor.ConsumeValueOrDie());
+  }
+  $1 = &temps;
+}
+
+%typemap(in) const std::vector<tensorflow::gtl::optional<Shape> >& (
+    std::vector<tensorflow::gtl::optional<Shape> > temps) {
+  if (!PySequence_Check($input)) {
+    PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
+    return NULL;
+  }
+  const int size = PySequence_Size($input);
+  for (int i = 0; i < size; ++i) {
+    PyObject* o = PySequence_GetItem($input, i);
+    if (o == Py_None) {
+      temps.push_back(tensorflow::gtl::nullopt);
+    } else {
+      StatusOr<Shape> statusor = numpy::XlaShapeFromPyShape(o);
+      Py_DECREF(o);
+      if (!statusor.ok()) {
+        PyErr_SetString(PyExc_RuntimeError, statusor.status().ToString().c_str());
+        return NULL;
+      }
+      temps.push_back(statusor.ConsumeValueOrDie());
+    }
   }
   $1 = &temps;
 }
@@ -461,6 +512,135 @@ tensorflow::ImportNumpy();
   $1 = temps;
 }
 
+// DotDimensionNumbers
+
+%typemap(in) const DotDimensionNumbers&
+    (DotDimensionNumbers dimension_numbers) {
+  int length;
+
+  /* lhs_contracting_dimensions */
+  PyObject* lhs_contracting_dimensions = PyObject_GetAttrString(
+      $input, "lhs_contracting_dimensions");
+  if (!lhs_contracting_dimensions) {
+    return NULL;
+  }
+
+  length = PySequence_Size(lhs_contracting_dimensions);
+  if (length == -1) {
+    Py_DECREF(lhs_contracting_dimensions);
+    return NULL;
+  }
+
+  for (int i = 0; i < length; ++i) {
+    PyObject* item = PySequence_GetItem(lhs_contracting_dimensions, i);
+    if (!item) {
+      Py_DECREF(lhs_contracting_dimensions);
+      return NULL;
+    }
+    const int64 dimension = numpy::PyIntOrPyLongToLong(item);
+    if (dimension == -1 && PyErr_Occurred()) {
+      Py_DECREF(item);
+      Py_DECREF(lhs_contracting_dimensions);
+      return NULL;
+    }
+    dimension_numbers.add_lhs_contracting_dimensions(dimension);
+    Py_DECREF(item);
+  }
+  Py_DECREF(lhs_contracting_dimensions);
+
+  /* rhs_contracting_dimensions */
+  PyObject* rhs_contracting_dimensions = PyObject_GetAttrString(
+      $input, "rhs_contracting_dimensions");
+  if (!lhs_contracting_dimensions) {
+    return NULL;
+  }
+
+  length = PySequence_Size(rhs_contracting_dimensions);
+  if (length == -1) {
+    Py_DECREF(rhs_contracting_dimensions);
+    return NULL;
+  }
+
+  for (int i = 0; i < length; ++i) {
+    PyObject* item = PySequence_GetItem(rhs_contracting_dimensions, i);
+    if (!item) {
+      Py_DECREF(rhs_contracting_dimensions);
+      return NULL;
+    }
+    const int64 dimension = numpy::PyIntOrPyLongToLong(item);
+    if (dimension == -1 && PyErr_Occurred()) {
+      Py_DECREF(item);
+      Py_DECREF(rhs_contracting_dimensions);
+      return NULL;
+    }
+    dimension_numbers.add_rhs_contracting_dimensions(dimension);
+    Py_DECREF(item);
+  }
+  Py_DECREF(rhs_contracting_dimensions);
+
+  /* lhs_batch_dimensions */
+  PyObject* lhs_batch_dimensions = PyObject_GetAttrString(
+      $input, "lhs_batch_dimensions");
+  if (!lhs_batch_dimensions) {
+    return NULL;
+  }
+
+  length = PySequence_Size(lhs_batch_dimensions);
+  if (length == -1) {
+    Py_DECREF(lhs_batch_dimensions);
+    return NULL;
+  }
+
+  for (int i = 0; i < length; ++i) {
+    PyObject* item = PySequence_GetItem(lhs_batch_dimensions, i);
+    if (!item) {
+      Py_DECREF(lhs_batch_dimensions);
+      return NULL;
+    }
+    const int64 dimension = numpy::PyIntOrPyLongToLong(item);
+    if (dimension == -1 && PyErr_Occurred()) {
+      Py_DECREF(item);
+      Py_DECREF(lhs_batch_dimensions);
+      return NULL;
+    }
+    dimension_numbers.add_lhs_batch_dimensions(dimension);
+    Py_DECREF(item);
+  }
+  Py_DECREF(lhs_batch_dimensions);
+
+  /* rhs_batch_dimensions */
+  PyObject* rhs_batch_dimensions = PyObject_GetAttrString(
+      $input, "rhs_batch_dimensions");
+  if (!rhs_batch_dimensions) {
+    return NULL;
+  }
+
+  length = PySequence_Size(rhs_batch_dimensions);
+  if (length == -1) {
+    Py_DECREF(rhs_batch_dimensions);
+    return NULL;
+  }
+
+  for (int i = 0; i < length; ++i) {
+    PyObject* item = PySequence_GetItem(rhs_batch_dimensions, i);
+    if (!item) {
+      Py_DECREF(rhs_batch_dimensions);
+      return NULL;
+    }
+    const int64 dimension = numpy::PyIntOrPyLongToLong(item);
+    if (dimension == -1 && PyErr_Occurred()) {
+      Py_DECREF(item);
+      Py_DECREF(rhs_batch_dimensions);
+      return NULL;
+    }
+    dimension_numbers.add_rhs_batch_dimensions(dimension);
+    Py_DECREF(item);
+  }
+  Py_DECREF(rhs_batch_dimensions);
+
+  $1 = &dimension_numbers;
+}
+
 // PaddingConfig
 
 %typemap(in) const PaddingConfig&
@@ -623,6 +803,30 @@ tensorflow::ImportNumpy();
   $1 = &dimension_numbers;
 }
 
+// ExecutableBuildOptions
+
+%typemap(in) const ExecutableBuildOptions*
+    (ExecutableBuildOptions build_options) {
+  if ($input == Py_None) {
+    $1 = NULL;
+  } else {
+    PyObject* o = PyObject_GetAttrString($input, "generate_hlo_graph");
+    if (!o) {
+      return NULL;
+    }
+    if (o != Py_None) {
+      if (!PyString_Check(o)) {
+        PyErr_SetString(PyExc_TypeError, "ExecutableBuildOptions.generate_hlo_graph must be a string or None.");
+        return NULL;
+      }
+      build_options.set_generate_hlo_graph(PyString_AsString(o));
+    }
+    Py_DECREF(o);
+
+    $1 = &build_options;
+  }
+}
+
 %ignoreall
 %unignore xla;
 %unignore xla::swig;
@@ -667,6 +871,7 @@ tensorflow::ImportNumpy();
 %unignore xla::swig::LocalComputationBuilder::Call;
 %unignore xla::swig::LocalComputationBuilder::Transpose;
 %unignore xla::swig::LocalComputationBuilder::Rev;
+%unignore xla::swig::LocalComputationBuilder::Clamp;
 %unignore xla::swig::LocalComputationBuilder::Map;
 %unignore xla::swig::LocalComputationBuilder::Reduce;
 %unignore xla::swig::LocalComputationBuilder::ReduceWindowWithGeneralPadding;
@@ -681,6 +886,7 @@ tensorflow::ImportNumpy();
 %unignore xla::swig::LocalComputationBuilder::Lt;
 %unignore xla::swig::LocalComputationBuilder::Le;
 %unignore xla::swig::LocalComputationBuilder::Dot;
+%unignore xla::swig::LocalComputationBuilder::DotGeneral;
 %unignore xla::swig::LocalComputationBuilder::ConvGeneralDilated;
 %unignore xla::swig::LocalComputationBuilder::Add;
 %unignore xla::swig::LocalComputationBuilder::Sub;
@@ -696,6 +902,7 @@ tensorflow::ImportNumpy();
 %unignore xla::swig::LocalComputationBuilder::Exp;
 %unignore xla::swig::LocalComputationBuilder::Floor;
 %unignore xla::swig::LocalComputationBuilder::Ceil;
+%unignore xla::swig::LocalComputationBuilder::Round;
 %unignore xla::swig::LocalComputationBuilder::Log;
 %unignore xla::swig::LocalComputationBuilder::Sign;
 %unignore xla::swig::LocalComputationBuilder::Cos;
diff --git a/tensorflow/compiler/xla/python/numpy_bridge.cc b/tensorflow/compiler/xla/python/numpy_bridge.cc
index 5c722623e318ece9eca6bdc8750195ce5fd5defb..3d87480728aab1d4ebbc71c6c7504d37cae5edaf 100644
--- a/tensorflow/compiler/xla/python/numpy_bridge.cc
+++ b/tensorflow/compiler/xla/python/numpy_bridge.cc
@@ -176,85 +176,107 @@ static string PyObjectCppRepr(PyObject* o) {
   return ExtractStringAndDecref(r);
 }
 
-Status CheckPyShapeInfo(PyObject* o) {
+StatusOr<Shape> XlaShapeFromPyShape(PyObject* o) {
   auto error = [o](const string& prefix) {
     return InvalidArgument("%s; got %s", prefix.c_str(),
                            PyObjectCppRepr(o).c_str());
   };
-  // The object is a tuple (a pair)
-  if (!PyTuple_Check(o)) {
-    return error("Shape record must be a tuple");
-  }
-  if (PyTuple_Size(o) != 2) {
-    return error("Shape record tuple must be of length 2");
-  }
 
-  // It has a first element, which is a numpy dtype object
-  PyObject* first = PyTuple_GetItem(o, 0);
-  if (first == nullptr) {
-    return error("Tuple has no item 0 (shape dtype)");
-  }
-  if (first->ob_type != &PyArrayDescr_Type) {
-    return error(
-        "Shape record does not have a numpy dtype as its first element");
-  }
-  const int np_type = NumpyTypenum(first);
-  if (!NumpyTypeIsValid(np_type)) {
-    return error("Shape record has an invalid integer dtype");
-  }
+  auto get_attr = [o, &error](const string& field) -> StatusOr<PyObject*> {
+    PyObject* result =
+        PyObject_GetAttrString(o, const_cast<char*>(field.c_str()));
+    if (result == nullptr) {
+      return error(tensorflow::strings::StrCat(
+          "Failed to get attribute of Shape object:", field));
+    }
+    return result;
+  };
 
-  // It has a second element, which is a tuple, either of shape
-  // records or of Python ints
-  PyObject* second = PyTuple_GetItem(o, 1);
-  if (!second) {
-    return error("Tuple has no item 0 (shape dimensions)");
-  }
-  if (!PyTuple_Check(second)) {
-    return error("Shape record does not have a tuple as its second element");
-  }
-  const int length = PyTuple_Size(second);
-  const PrimitiveType element_type = NumpyTypeToPrimitiveType(np_type);
-  for (int i = 0; i < length; i++) {
-    PyObject* dimension = PyTuple_GetItem(second, i);
-    if (element_type == TUPLE) {
-      VLOG(3) << "element_type is tuple, checking member: " << i;
-      Status result = CheckPyShapeInfo(dimension);
-      if (!result.ok()) {
-        return AddStatus(
-            result, tensorflow::strings::StrCat("Validating tuple member ", i,
-                                                " of ", PyObjectCppRepr(o)));
-      }
-    } else if (!CheckPyIntOrLong(dimension)) {
-      return error("Non-tuple shape record has a non-integer dimension");
+  auto call_method = [o, &error](const string& method) -> StatusOr<PyObject*> {
+    PyObject* result =
+        PyObject_CallMethod(o, const_cast<char*>(method.c_str()), nullptr);
+    if (result == nullptr) {
+      return error(tensorflow::strings::StrCat(
+          "Failed to call method of shape object:", method));
     }
-  }
+    return result;
+  };
 
-  return Status::OK();
-}
+  PyObject* np_type;
+  TF_ASSIGN_OR_RETURN(np_type, get_attr("np_dtype"));
+  if (np_type->ob_type != &PyArrayDescr_Type) {
+    return error("Shape attribute np_dtype is not an integer numpy dtype");
+  }
+  if (!NumpyTypeIsValid(NumpyTypenum(np_type))) {
+    return error("Shape attribute np_dtype is not a valid integer numpy dtype");
+  }
+  const PrimitiveType element_type =
+      NumpyTypeToPrimitiveType(NumpyTypenum(np_type));
+  Py_DECREF(np_type);
 
-// Precondition: CheckPyShapeInfo(o)
-Shape XlaShapeFromPyShapeInfo(PyObject* o) {
-  const int np_type = NumpyTypenum(PyTuple_GetItem(o, 0));
-  const PrimitiveType element_type = NumpyTypeToPrimitiveType(np_type);
-  PyObject* py_dimensions = PyTuple_GetItem(o, 1);
-  const int length = PyTuple_Size(py_dimensions);
   if (element_type == TUPLE) {
+    PyObject* py_subshapes;
+    TF_ASSIGN_OR_RETURN(py_subshapes, call_method("tuple_shapes"));
+    if (!PyTuple_Check(py_subshapes)) {
+      return error(
+          "Return value of Shape method tuple_shapes() is not a tuple");
+    }
+    const int length = PyTuple_Size(py_subshapes);
     std::vector<Shape> subshapes;
     subshapes.reserve(length);
     for (int i = 0; i < length; i++) {
-      subshapes.push_back(
-          XlaShapeFromPyShapeInfo(PyTuple_GetItem(py_dimensions, i)));
+      TF_ASSIGN_OR_RETURN(
+          const Shape& subshape,
+          XlaShapeFromPyShape(PyTuple_GetItem(py_subshapes, i)));
+      subshapes.push_back(subshape);
     }
+    Py_DECREF(py_subshapes);
     return ShapeUtil::MakeTupleShape(subshapes);
   } else {
+    PyObject* py_dimensions;
+    PyObject* py_minor_to_major;
+    TF_ASSIGN_OR_RETURN(py_dimensions, call_method("dimensions"));
+    TF_ASSIGN_OR_RETURN(py_minor_to_major, call_method("minor_to_major"));
+    if (!PyTuple_Check(py_dimensions)) {
+      return error("Return value of Shape method dimensions() is not a tuple");
+    }
+    if (py_minor_to_major != Py_None && !PyTuple_Check(py_minor_to_major)) {
+      return error(
+          "Return value of Shape method minor_to_major() is neither a tuple "
+          "nor None");
+    }
+    const int length = PyTuple_Size(py_dimensions);
+    if (py_minor_to_major != Py_None &&
+        length != PyTuple_Size(py_minor_to_major)) {
+      return error(
+          "Shape methods dimensions() and minor_to_major() return "
+          "different-length tuples");
+    }
     std::vector<int64> dimensions(length);
+    std::vector<int64> minor_to_major(length);
     for (int i = 0; i < length; i++) {
       dimensions[i] = PyIntOrPyLongToLong(PyTuple_GetItem(py_dimensions, i));
-      if (dimensions[i] == -1) {
-        CHECK(!PyErr_Occurred());
+      if (dimensions[i] == -1 && PyErr_Occurred()) {
+        return error("Dimension is not an int");
       }
+
+      if (py_minor_to_major != Py_None) {
+        minor_to_major[i] =
+            PyIntOrPyLongToLong(PyTuple_GetItem(py_minor_to_major, i));
+        if (minor_to_major[i] == -1 && PyErr_Occurred()) {
+          return error("Minor-to-major value is not an int");
+        }
+      }
+    }
+    bool with_layout = py_minor_to_major != Py_None;
+    Py_DECREF(py_dimensions);
+    Py_DECREF(py_minor_to_major);
+    if (with_layout) {
+      return ShapeUtil::MakeShapeWithLayout(element_type, dimensions,
+                                            minor_to_major);
+    } else {
+      return ShapeUtil::MakeShape(element_type, dimensions);
     }
-    return ShapeUtil::MakeShape(element_type, dimensions);
   }
 }
 
diff --git a/tensorflow/compiler/xla/python/numpy_bridge.h b/tensorflow/compiler/xla/python/numpy_bridge.h
index 6ff1c34cfc5e0323a6729bdfd5572239f4966211..adfcc3b8588dce01718bb19dea936bace483be4d 100644
--- a/tensorflow/compiler/xla/python/numpy_bridge.h
+++ b/tensorflow/compiler/xla/python/numpy_bridge.h
@@ -56,15 +56,11 @@ bool NumpyTypeIsValid(int np_type);
 // The return value is a new reference.
 PyObject* PyShapeInfoFromXlaShape(const Shape& shape);
 
-// Returns the outcome of a best-effort check that the Python object
-// is a pair of the form (numpy dtype, dimensions), as produced by
-// PyShapeInfoFromXlaShape.
-Status CheckPyShapeInfo(PyObject* o);
-
-// Performs the inverse conversion to that of PyShapeInfoFromXlaShape.
+// Converts a Python object with a method interface mathing that of
+// xla_client.Shape into an XLA Shape object.
 //
 // The return value is a new reference.
-Shape XlaShapeFromPyShapeInfo(PyObject* o);
+StatusOr<Shape> XlaShapeFromPyShape(PyObject* o);
 
 // Converts a PyObject that represents operation metadata into protocol buffer
 // form.
diff --git a/tensorflow/compiler/xla/python/xla_client.py b/tensorflow/compiler/xla/python/xla_client.py
index 9cfe1249f50fd3c4b09d5af0c0e17a6f40b024a2..b890980955250a36b758ac0a1f0fd8f22296c0bc 100644
--- a/tensorflow/compiler/xla/python/xla_client.py
+++ b/tensorflow/compiler/xla/python/xla_client.py
@@ -36,15 +36,22 @@ from tensorflow.compiler.xla.python import pywrap_xla as c_api
 # pylint: disable=invalid-name
 
 
-OpMetadata = collections.namedtuple(
-    'OpMetadata',
-    [
-        'op_type',
-        'op_name',
-        'source_file',
-        'source_line',
-    ],
-)
+_OP_METADATA_FIELDS = [
+    'op_type',
+    'op_name',
+    'source_file',
+    'source_line',
+]
+OpMetadata = collections.namedtuple('OpMetadata', _OP_METADATA_FIELDS)
+
+
+def OpMetadataToProto(pyobj):
+  proto = xla_data_pb2.OpMetadata()
+  for field in _OP_METADATA_FIELDS:
+    attr = getattr(pyobj, field)
+    if attr is not None:
+      setattr(proto, field, attr)
+  return proto
 
 
 def CurrentSourceInfoMetadata(op_type=None, op_name=None, skip_frames=1):
@@ -82,6 +89,7 @@ _UNARY_OPS = [
     'Abs',
     'Exp',
     'Floor',
+    'Round',
     'Ceil',
     'Log',
     'Sign',
@@ -148,9 +156,14 @@ class LocalBuffer(object):
     self._delete = c_api.DeleteLocalShapedBuffer
 
   @staticmethod
-  def from_py(npval):
+  def from_py(npval, layout_fn=None):
     npval = require_numpy_array_layout(npval)
-    return LocalBuffer(c_api.LocalShapedBuffer.FromLiteral(npval))
+    if layout_fn:
+      shape = Shape.from_numpy(npval)
+      shape = shape.map_leaves(layout_fn)
+    else:
+      shape = None
+    return LocalBuffer(c_api.LocalShapedBuffer.FromLiteral(npval, shape))
 
   def to_py(self):
     return self.c_local_shaped_buffer.ToLiteral()
@@ -175,13 +188,17 @@ class Shape(object):
   represents an XLA tuple.
   """
 
-  def __init__(self, np_dtype, dimensions):
+  def __init__(self, np_dtype, dimensions, minor_to_major=None):
+    assert isinstance(dimensions, tuple)
     self.np_dtype = np_dtype
     self._dimensions = dimensions
+    self._minor_to_major = minor_to_major
+    self._check_minor_to_major()
 
   def __repr__(self):
-    return 'xla_client.Shape(np_dtype={!r}, dimensions={!r})'.format(
-        self.np_dtype, self._dimensions)
+    return ('xla_client.Shape(np_dtype={!r}, dimensions={!r}, '
+            'minor_to_major={!r})').format(self.np_dtype, self._dimensions,
+                                           self._minor_to_major)
 
   def element_type(self):
     return DTYPE_TO_XLA_ELEMENT_TYPE[str(self.np_dtype)]
@@ -194,11 +211,49 @@ class Shape(object):
       raise ValueError('Tuple shape has no dimensions')
     return self._dimensions
 
+  def minor_to_major(self):
+    return self._minor_to_major
+
   def tuple_shapes(self):
     if not self.is_tuple():
       raise ValueError('Shape is not a tuple shape')
     return self._dimensions
 
+  def rank(self):
+    return len(self.dimensions())
+
+  def map_leaves(self, f):
+    """Map f over each leaf-level array subshape.
+
+    Args:
+      f: The function to apply. Whenever f returns None, the identity is
+        applied instead.
+
+    Returns:
+      A new Shape with the mapped leaves.
+    """
+    if self.is_tuple():
+      children = tuple(child.map_leaves(f) for child in self.tuple_shapes())
+      return Shape(np.dtype('O'), children)
+    else:
+      mapped = f(self)
+      return self if mapped is None else mapped
+
+  def _check_minor_to_major(self):
+    mtm = self._minor_to_major
+    if self.is_tuple():
+      assert mtm is None, self
+    if mtm is not None:
+      assert self.rank() == len(mtm), self
+      assert sorted(mtm) == range(len(mtm)), self
+
+  def update_minor_to_major(self, minor_to_major):
+    if not isinstance(minor_to_major, tuple):
+      raise TypeError('minor_to_major must be a tuple')
+    updated = Shape(self.np_dtype, tuple(self.dimensions()), minor_to_major)
+    updated._check_minor_to_major()  # pylint: disable=protected-access
+    return updated
+
   @staticmethod
   def from_numpy(npval):
 
@@ -215,23 +270,10 @@ def _wrap_shape(shape_info):
   dtype, dims = shape_info
   element_type = DTYPE_TO_XLA_ELEMENT_TYPE[str(dtype)]
   if element_type == xla_data_pb2.TUPLE:
-    dims = [_wrap_shape(subshape_info) for subshape_info in dims]
+    dims = tuple(_wrap_shape(subshape_info) for subshape_info in dims)
   return Shape(dtype, dims)
 
 
-def _unwrap_shape(shape):
-  if shape.is_tuple():
-    components = tuple(
-        _unwrap_shape(subshape) for subshape in shape.tuple_shapes())
-  else:
-    components = shape.dimensions()
-  return (shape.np_dtype, components)
-
-
-def _unwrap_shapes(shapes):
-  return [_unwrap_shape(shape) for shape in shapes]
-
-
 def _wrap_data_handle(handle):
   cdh = xla_data_pb2.ComputationDataHandle()
   cdh.handle = handle
@@ -253,6 +295,17 @@ def require_numpy_array_layout(value):
     return np.require(value, requirements=['C', 'A'])
 
 
+class CompileOptions(object):
+  """Python object for XLA compile options.
+
+  These options can be passed to the 'compile' step when using a local XLA
+  client.
+  """
+
+  def __init__(self):
+    self.generate_hlo_graph = None
+
+
 def transfer_to_infeed(value, replica_number=None):
   """Transfers the given value into the XLA infeed queue.
 
@@ -284,8 +337,7 @@ def transfer_from_outfeed(shape, replica_number=None):
   Returns:
     The literal value that is produced from the outfeed queue.
   """
-  return c_api.TransferFromOutfeedLocalReplica(
-      _unwrap_shape(shape), replica_number or 0)
+  return c_api.TransferFromOutfeedLocalReplica(shape, replica_number or 0)
 
 
 class LocalComputation(object):
@@ -306,22 +358,39 @@ class LocalComputation(object):
     else:
       self._delete = c_api.DeleteLocalComputation
 
-  def Compile(self, argument_shapes=()):
+  def Compile(self, argument_shapes=(), compile_options=None, layout_fn=None):
     if self.is_compiled:
       raise ValueError('Attempt to compile a compiled local XLA computation.')
+    if layout_fn:
+      argument_shapes = [
+          shape.map_leaves(layout_fn) for shape in argument_shapes
+      ]
     return LocalComputation(
-        self.c_local_computation.Compile(_unwrap_shapes(argument_shapes)),
+        self.c_local_computation.Compile(argument_shapes, compile_options),
         is_compiled=True)
 
-  def CompileWithExampleArguments(self, arguments=()):
+  def CompileWithExampleArguments(self,
+                                  arguments=(),
+                                  compile_options=None,
+                                  layout_fn=None):
     return self.Compile(
-        argument_shapes=[Shape.from_numpy(arg) for arg in arguments])
+        argument_shapes=[Shape.from_numpy(arg) for arg in arguments],
+        compile_options=compile_options,
+        layout_fn=layout_fn)
 
-  def Execute(self, arguments=()):
+  def Execute(self, arguments=(), layout_fn=None):
+    """Execute with Python values as arguments and return value."""
     if not self.is_compiled:
       raise ValueError('Cannot execute an uncompiled local XLA computation.')
+    argument_shapes = [Shape.from_numpy(arg) for arg in arguments]
+    if layout_fn:
+      argument_shapes = [
+          shape.map_leaves(layout_fn) for shape in argument_shapes
+      ]
+    else:
+      argument_shapes = [None for shape in argument_shapes]
     arguments = tuple(map(require_numpy_array_layout, arguments))
-    return self.c_local_computation.Execute(arguments)
+    return self.c_local_computation.Execute(arguments, argument_shapes)
 
   def ExecuteWithLocalBuffers(self, arguments=()):
     """Execute with LocalBuffer arguments and return value."""
@@ -377,7 +446,7 @@ class ComputationBuilder(object):
     Returns:
       A  ComputationDataHandle message.
     """
-    return _wrap_data_handle(self._client.Infeed(_unwrap_shape(shape)))
+    return _wrap_data_handle(self._client.Infeed(shape))
 
   def Outfeed(self, operand):
     """Enqueues an outfeed op onto the computation.
@@ -386,7 +455,7 @@ class ComputationBuilder(object):
     outfeed queue for subsequent dequeue via the client API.
     """
     self._client.Outfeed(
-        _unwrap_data_handle(operand), _unwrap_shape(self.GetShape(operand)),
+        _unwrap_data_handle(operand), self.GetShape(operand),
         ''.encode('utf-8'))
 
   def Constant(self, value):
@@ -477,8 +546,7 @@ class ComputationBuilder(object):
       parameter_num = next(self._parameter_numbering)
 
     return _wrap_data_handle(
-        self._client.Parameter(
-            parameter_num, _unwrap_shape(shape), name.encode('utf8')))
+        self._client.Parameter(parameter_num, shape, name.encode('utf8')))
 
   def ParameterFromNumpy(self, value, name=None, parameter_num=None):
     """Enqueues a Parameter op onto the computation.
@@ -599,6 +667,13 @@ class ComputationBuilder(object):
     return _wrap_data_handle(
         self._client.Rev(_unwrap_data_handle(operand), dimensions))
 
+  def Clamp(self, min, operand, max):  # pylint: disable=redefined-builtin
+    """Clamp op."""
+    return _wrap_data_handle(
+        self._client.Clamp(_unwrap_data_handle(min),
+                           _unwrap_data_handle(operand),
+                           _unwrap_data_handle(max)))
+
   def SelectAndScatter(self, operand, select, window_dimensions, window_strides,
                        padding, source, init_value, scatter):
     """Select and scatter op, used by the gradient of ReduceWindow.
@@ -818,8 +893,7 @@ class ComputationBuilder(object):
     shape = Shape(self.GetShape(mu).np_dtype, dims)
     return _wrap_data_handle(
         self._client.RngNormal(
-            _unwrap_data_handle(mu), _unwrap_data_handle(sigma),
-            _unwrap_shape(shape)))
+            _unwrap_data_handle(mu), _unwrap_data_handle(sigma), shape))
 
   def RngUniform(self, a, b, dims):
     """Enqueues an RngUniform operation onto the computation.
@@ -839,8 +913,7 @@ class ComputationBuilder(object):
     shape = Shape(self.GetShape(a).np_dtype, dims)
     return _wrap_data_handle(
         self._client.RngUniform(
-            _unwrap_data_handle(a), _unwrap_data_handle(b),
-            _unwrap_shape(shape)))
+            _unwrap_data_handle(a), _unwrap_data_handle(b), shape))
 
   def While(self, cond, body, init):
     """Enqueues a While operation onto the computation.
@@ -858,10 +931,37 @@ class ComputationBuilder(object):
                            _unwrap_data_handle(init)))
 
   def Dot(self, lhs, rhs):
-    """Matrix multiplication between lhs and rhs."""
+    """Enqueues a dot operation onto the computation.
+
+    Args:
+      lhs: ComputationDataHandle for the rank 1 or rank 2 left-hand-side array.
+      rhs: ComputationDataHandle for the rank 1 or rank 2 right-hand-side array.
+
+    Returns: a ComputationDataHandle representing the Dot operation.
+    """
     return _wrap_data_handle(
         self._client.Dot(_unwrap_data_handle(lhs), _unwrap_data_handle(rhs)))
 
+  def DotGeneral(self, lhs, rhs, dimension_numbers):
+    """Enqueues a general dot operation onto the computation.
+
+    Args:
+      lhs: ComputationDataHandle for the left-hand-side array.
+      rhs: ComputationDataHandle for the right-hand-side array.
+      dimension_numbers: either an xla_data_pb2.DotDimensionNumbers or a nested
+        tuple ((lhs_contract, rhs_contract), (lhs_batch, rhs_batch)) of lists of
+        integers representing the dimensions to treat as contracting dimensions
+        and batch dimensions on each input operand.
+
+    Returns: a ComputationDataHandle representing the DotGeneral operation.
+    """
+    if not isinstance(dimension_numbers, xla_data_pb2.DotDimensionNumbers):
+      dimension_numbers = GetDotDimensionsFromLists(dimension_numbers)
+    return _wrap_data_handle(
+        self._client.DotGeneral(
+            _unwrap_data_handle(lhs), _unwrap_data_handle(rhs),
+            dimension_numbers))
+
   def Conv(self, lhs, rhs, window_strides, padding):
     """Enqueues a Conv operation onto the computation.
 
@@ -998,3 +1098,13 @@ def GetPaddingConfigFromTriples(triples):
     dimension.edge_padding_high = hi
     dimension.interior_padding = interior
   return padding_config
+
+
+def GetDotDimensionsFromLists(dimension_numbers):
+  (lhs_contract, rhs_contract), (lhs_batch, rhs_batch) = dimension_numbers
+  dot_dims_proto = xla_data_pb2.DotDimensionNumbers()
+  dot_dims_proto.lhs_contracting_dimensions.extend(lhs_contract)
+  dot_dims_proto.rhs_contracting_dimensions.extend(rhs_contract)
+  dot_dims_proto.lhs_batch_dimensions.extend(lhs_batch)
+  dot_dims_proto.rhs_batch_dimensions.extend(rhs_batch)
+  return dot_dims_proto
diff --git a/tensorflow/compiler/xla/python/xla_client_test.py b/tensorflow/compiler/xla/python/xla_client_test.py
index c0413b9bbc3b7f8b63e4cf7a8f24980322cffc47..421fba40e3247b51d79531e430722e5adc2d7988 100644
--- a/tensorflow/compiler/xla/python/xla_client_test.py
+++ b/tensorflow/compiler/xla/python/xla_client_test.py
@@ -444,6 +444,30 @@ class SingleOpTest(LocalComputationTest):
     c.Dot(c.Constant(lhs), c.Constant(rhs))
     self._ExecuteAndCompareClose(c, expected=np.dot(lhs, rhs))
 
+  def testDotGeneral(self):
+    c = self._NewComputation()
+    rng = np.random.RandomState(0)
+    lhs = NumpyArrayF32(rng.randn(10, 3, 4))
+    rhs = NumpyArrayF32(rng.randn(10, 4, 5))
+    dimension_numbers = (([2], [1]), ([0], [0]))
+    c.DotGeneral(c.Constant(lhs), c.Constant(rhs), dimension_numbers)
+    self._ExecuteAndCompareClose(c, expected=np.matmul(lhs, rhs))
+
+  def testDotGeneralWithDotDimensionNumbersProto(self):
+    c = self._NewComputation()
+    rng = np.random.RandomState(0)
+    lhs = NumpyArrayF32(rng.randn(10, 3, 4))
+    rhs = NumpyArrayF32(rng.randn(10, 4, 5))
+
+    dimension_numbers = xla_client.xla_data_pb2.DotDimensionNumbers()
+    dimension_numbers.lhs_contracting_dimensions.append(2)
+    dimension_numbers.rhs_contracting_dimensions.append(1)
+    dimension_numbers.lhs_batch_dimensions.append(0)
+    dimension_numbers.rhs_batch_dimensions.append(0)
+
+    c.DotGeneral(c.Constant(lhs), c.Constant(rhs), dimension_numbers)
+    self._ExecuteAndCompareClose(c, expected=np.matmul(lhs, rhs))
+
   def testConvF32Same(self):
     c = self._NewComputation()
     a = lambda *dims: np.arange(np.prod(dims)).reshape(dims).astype("float32")
@@ -496,6 +520,12 @@ class SingleOpTest(LocalComputationTest):
     c.Exp(c.Constant(arr))
     self._ExecuteAndCompareClose(c, expected=np.exp(arr))
 
+  def testRound(self):
+    c = self._NewComputation()
+    arr = NumpyArrayF32([3.3, 12.1])
+    c.Round(c.Constant(arr))
+    self._ExecuteAndCompareClose(c, expected=np.round(arr))
+
   def testLog(self):
     c = self._NewComputation()
     arr = NumpyArrayF32([3.3, 12.1])
@@ -699,6 +729,23 @@ class SingleOpTest(LocalComputationTest):
     self._ExecuteAndCompareExact(
         c, expected=[[[6, 5], [8, 7]], [[2, 1], [4, 3]]])
 
+  def testClampF32(self):
+    c = self._NewComputation()
+    c.Clamp(
+        c.Constant(NumpyArrayF32(-1)),
+        c.Constant(NumpyArrayF32([-2, -1, 0, 1, 2, 3])),
+        c.Constant(NumpyArrayF32(2)))
+    self._ExecuteAndCompareExact(c, expected=[-1, -1, 0, 1, 2, 2])
+
+  # TODO(b/72689392): re-enable when bug S32 resolved
+  def DISABLED_testClampS32(self):
+    c = self._NewComputation()
+    c.Clamp(
+        c.Constant(NumpyArrayS32(-1)),
+        c.Constant(NumpyArrayS32([-2, -1, 0, 1, 2, 3])),
+        c.Constant(NumpyArrayS32(2)))
+    self._ExecuteAndCompareExact(c, expected=[-1, 0, 1, 2, 2])
+
   def testSelect(self):
     c = self._NewComputation()
     c.Select(
diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index e23c418809b01c15796e496dcb643bd56c4e6fa1..0f2d0a9e96e20007aa24a22832bdca4f0add372d 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -29,6 +29,11 @@ xla_proto_library(
     deps = ["//tensorflow/compiler/xla:xla_data_proto"],
 )
 
+xla_proto_library(
+    name = "hlo_profile_printer_data",
+    srcs = ["hlo_profile_printer_data.proto"],
+)
+
 # Filegroup used to collect source files for dependency checking.
 filegroup(
     name = "c_srcs",
@@ -455,6 +460,7 @@ cc_library(
         ":hlo_proto_util",
         ":platform_util",
         ":session_proto",
+        ":source_map_util",
         ":transfer_manager",
         ":user_computation",
         ":versioned_computation_handle",
@@ -503,6 +509,7 @@ cc_library(
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/client:executable_build_options",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
     ],
@@ -1104,8 +1111,6 @@ cc_library(
         ":hlo",
         ":hlo_evaluator",
         ":hlo_pass",
-        ":tuple_util",
-        ":while_util",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/core:lib",
     ],
@@ -1150,6 +1155,34 @@ tf_cc_test(
     ],
 )
 
+cc_library(
+    name = "implicit_broadcast_remover",
+    srcs = ["implicit_broadcast_remover.cc"],
+    hdrs = ["implicit_broadcast_remover.h"],
+    deps = [
+        ":hlo",
+        ":hlo_dce",
+        ":hlo_pass",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/core:lib",
+    ],
+)
+
+tf_cc_test(
+    name = "implicit_broadcast_remover_test",
+    srcs = ["implicit_broadcast_remover_test.cc"],
+    deps = [
+        ":hlo_matchers",
+        ":implicit_broadcast_remover",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
+    ],
+)
+
 cc_library(
     name = "dot_decomposer",
     srcs = ["dot_decomposer.cc"],
@@ -1819,7 +1852,9 @@ tf_cc_test(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:test_utils",
+        "//tensorflow/compiler/xla/tools/parser:hlo_parser",
         "//tensorflow/core:lib",
+        "//tensorflow/core:test",
     ],
 )
 
@@ -1850,6 +1885,7 @@ cc_library(
         ":hlo",
         ":hlo_graph_dumper",
         ":hlo_pass",
+        ":hlo_proto_util",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:types",
@@ -2265,6 +2301,7 @@ cc_library(
     srcs = ["hlo_profile_printer.cc"],
     hdrs = ["hlo_profile_printer.h"],
     deps = [
+        ":hlo_profile_printer_data",
         ":human_readable_profile_builder",
         "//tensorflow/compiler/xla:types",
     ],
@@ -2342,6 +2379,18 @@ tf_cc_test(
     ],
 )
 
+cc_library(
+    name = "source_map_util",
+    srcs = ["source_map_util.cc"],
+    hdrs = ["source_map_util.h"],
+    deps = [
+        ":executable",
+        "//tensorflow/compiler/xla:status",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/core:lib",
+    ],
+)
+
 # -----------------------------------------------------------------------------
 
 filegroup(
diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc
index ba82e822b216528c28536181059bc2417048de01..fb857559f972a220a19b108baa4c441e09b90e1f 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc
@@ -1618,9 +1618,12 @@ Status AlgebraicSimplifierVisitor::HandleReduce(HloInstruction* reduce) {
         reduce,
         HloInstruction::CreateBroadcast(reduce->shape(), init_value, {}));
   }
+
   // A Transpose feeding a reduce can simply permute the reduction dimensions
-  // field.
-  if (arg->opcode() == HloOpcode::kTranspose) {
+  // field if the output of the reduce is a vector or scalar. Higher ranked
+  // result may require a transpose of the output.
+  if (ShapeUtil::Rank(reduce->shape()) <= 1 &&
+      arg->opcode() == HloOpcode::kTranspose) {
     auto transpose_dimensions = arg->dimensions();
     std::vector<int64> new_reduce_dimensions;
     for (auto dim : dimensions) {
diff --git a/tensorflow/compiler/xla/service/buffer_assignment.cc b/tensorflow/compiler/xla/service/buffer_assignment.cc
index 33fe11b81db1a1db40285d5c77d8900722025d1c..774b11478c6d2faf0eb5db29df3cfd3cc1e98d5b 100644
--- a/tensorflow/compiler/xla/service/buffer_assignment.cc
+++ b/tensorflow/compiler/xla/service/buffer_assignment.cc
@@ -846,14 +846,13 @@ Status BufferAssigner::AssignBuffersForComputation(
       continue;
     }
 
-    if (is_thread_local || instruction->opcode() == HloOpcode::kCustomCall) {
-      // Custom call operations never have reusable buffers. Also we do not
-      // reuse thread-local buffers for now, because they are dynamically
-      // allocated and their lifetimes are hard to compute.
+    if (is_thread_local) {
+      // We do not reuse thread-local buffers for now, because they are
+      // dynamically allocated and their lifetimes are hard to compute.
       BufferAllocation* allocation = assignment->NewAllocation(
           *buffer, buffer_size, is_thread_local, /*is_reusable=*/false);
       VLOG(3) << "New allocation #" << allocation->index()
-              << " for thread-local/CustomCall: " << *buffer;
+              << " for thread-local: " << *buffer;
       continue;
     }
 
@@ -998,14 +997,15 @@ Status BufferAssigner::AssignBuffersWithSequentialOrdering(
       auto color = single_colored_set.first;
       VLOG(2) << "Simulating heap for color " << color;
       int64 alignment = assignment->color_alignment_(color);
+      HeapSimulator::Options options;
+      options.buffers_to_assign = &single_colored_set.second;
       TF_ASSIGN_OR_RETURN(
           const HeapSimulator::Result result,
           HeapSimulator::Run(MakeUnique<DecreasingSizeRunsHeap>(
                                  MakeUnique<LazyBestFitHeap>(alignment)),
                              assignment->module(), module_sequence,
                              assignment->points_to_analysis(),
-                             assignment->buffer_size_,
-                             &single_colored_set.second));
+                             assignment->buffer_size_, options));
       AssignBuffersFromHeapSimulator(result, assignment,
                                      single_colored_set.first);
     }
@@ -1025,14 +1025,15 @@ Status BufferAssigner::AssignBuffersWithSequentialOrdering(
         auto color = single_colored_set.first;
         VLOG(2) << "Simulating heap for color " << color;
         int64 alignment = assignment->color_alignment_(color);
+        HeapSimulator::Options options;
+        options.buffers_to_assign = &single_colored_set.second;
         TF_ASSIGN_OR_RETURN(
             const HeapSimulator::Result result,
             HeapSimulator::Run(MakeUnique<DecreasingSizeRunsHeap>(
                                    MakeUnique<LazyBestFitHeap>(alignment)),
                                *computation, *instruction_sequence,
                                assignment->points_to_analysis(),
-                               assignment->buffer_size_,
-                               &single_colored_set.second));
+                               assignment->buffer_size_, options));
         AssignBuffersFromHeapSimulator(result, assignment,
                                        single_colored_set.first);
       }
@@ -1359,6 +1360,43 @@ void BufferAssigner::BuildColocatedBufferSets(
                   index, points_to_analysis, &colocated_set);
               AddSetToColocatedBufferSets(colocated_set, colocated_buffer_sets);
             });
+
+        // Add true_operand and conditional.true_computation.parameter(0) as a
+        // colocated buffer set. Note that this has to be done for each subshape
+        // in the true_operand of the conditional.
+        ShapeUtil::ForEachSubshape(
+            conditional_hlo->operand(1)->shape(),
+            [this, conditional_hlo, &points_to_analysis, colocated_buffer_sets](
+                const Shape& /*subshape*/, const ShapeIndex& index) {
+              std::vector<const LogicalBuffer*> true_set;
+              // Add conditional.true_operand.
+              AddBufferToColocatedSet(conditional_hlo->operand(1), index,
+                                      points_to_analysis, &true_set);
+              // Add conditional.true_computation.parameter_instruction(0).
+              AddBufferToColocatedSet(
+                  conditional_hlo->true_computation()->parameter_instruction(0),
+                  index, points_to_analysis, &true_set);
+              AddSetToColocatedBufferSets(true_set, colocated_buffer_sets);
+            });
+
+        // Add false_operand and conditional.false_computation.parameter(0) as a
+        // colocated buffer set. Note that this has to be done for each subshape
+        // in the false_operand of the conditional.
+        ShapeUtil::ForEachSubshape(
+            conditional_hlo->operand(2)->shape(),
+            [this, conditional_hlo, &points_to_analysis, colocated_buffer_sets](
+                const Shape& /*subshape*/, const ShapeIndex& index) {
+              std::vector<const LogicalBuffer*> false_set;
+              // Add conditional.false_operand.
+              AddBufferToColocatedSet(conditional_hlo->operand(2), index,
+                                      points_to_analysis, &false_set);
+              // Add conditional.false_computation.parameter_instruction(0).
+              AddBufferToColocatedSet(
+                  conditional_hlo->false_computation()->parameter_instruction(
+                      0),
+                  index, points_to_analysis, &false_set);
+              AddSetToColocatedBufferSets(false_set, colocated_buffer_sets);
+            });
       }
     }
   }
diff --git a/tensorflow/compiler/xla/service/compile_only_service.cc b/tensorflow/compiler/xla/service/compile_only_service.cc
index b9306a8bb09dc4541014716bb0c5e73e3c93ec85..dab73596e1639eed62151197048ee8d29570b20a 100644
--- a/tensorflow/compiler/xla/service/compile_only_service.cc
+++ b/tensorflow/compiler/xla/service/compile_only_service.cc
@@ -101,7 +101,7 @@ CompileOnlyService::CompileAheadOfTime(
     TF_ASSIGN_OR_RETURN(
         std::unique_ptr<HloModuleConfig> module_config,
         CreateModuleConfig(*program_shape, instance.argument_layouts,
-                           &execution_options));
+                           &execution_options, *user_computation));
 
     TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModule> hlo_module,
                         computation_tracker_.BuildHloModule(
diff --git a/tensorflow/compiler/xla/service/compiler.h b/tensorflow/compiler/xla/service/compiler.h
index fc67330f5cbdbcb0d1a259d284599916a908d1fe..74fd24edf88d44b2dfdc87556b0af43987e69e08 100644
--- a/tensorflow/compiler/xla/service/compiler.h
+++ b/tensorflow/compiler/xla/service/compiler.h
@@ -72,8 +72,18 @@ class AotCompilationOptions {
   // Returns the ID of the platform to which these options apply.
   virtual perftools::gputools::Platform::Id PlatformId() const = 0;
 
+  // Optional allocator that may be used for allocating temp space on the device
+  // during compilation.
+  DeviceMemoryAllocator* device_allocator() const { return device_allocator_; }
+  void set_device_allocator(DeviceMemoryAllocator* device_allocator) {
+    device_allocator_ = device_allocator;
+  }
+
  protected:
   AotCompilationOptions() = default;
+
+ private:
+  DeviceMemoryAllocator* device_allocator_ = nullptr;
 };
 
 // Abstract compiler interface that is subclassed for compilation on a
@@ -99,9 +109,16 @@ class Compiler {
 
   // Runs Hlo passes to optimize the given Hlo module, returns the optimized
   // module.
+  //
+  // If device_allocator is not null, the compiler may use it to allocate temp
+  // space on the device for use during compilation.  For example, the compiler
+  // may allocate buffers on the device and then run variants of a given
+  // algorithm over those buffers, to see which variant is fastest.  Any space
+  // allocated should be deallocated before this function returns.
   virtual StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
       std::unique_ptr<HloModule> module,
-      perftools::gputools::StreamExecutor* executor) = 0;
+      perftools::gputools::StreamExecutor* executor,
+      DeviceMemoryAllocator* device_allocator) = 0;
 
   // Compiles the HLO module for execution on a device given by the executor,
   // and returns an executable object or an error status. No HLO passes are
@@ -112,21 +129,27 @@ class Compiler {
   // The compiler may optionally specialize to the individual device
   // (not just type of device) indicated by the executor.
   //
+  // device_allocator is optional; see RunHloPasses.
+  //
   // Use the overload below to compile computations that run in parallel.
   virtual StatusOr<std::unique_ptr<Executable>> RunBackend(
       std::unique_ptr<HloModule> module,
-      perftools::gputools::StreamExecutor* executor) = 0;
+      perftools::gputools::StreamExecutor* executor,
+      DeviceMemoryAllocator* device_allocator) = 0;
 
   // Compiles a set of HLO modules that can run in parallel, potentially
   // communicating data between the modules, and returns a corresponding
   // sequence of executable objects.
   //
+  // device_allocator is optional; see RunHloPasses.
+  //
   // TODO(b/68666782): Remove this method after adding support for multiple
   // modules to RunHloPasses and RunBackends.
   virtual StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
       std::vector<std::unique_ptr<HloModule>> modules,
       std::vector<std::vector<perftools::gputools::StreamExecutor*>>
-          stream_exec) = 0;
+          stream_exec,
+      DeviceMemoryAllocator* device_allocator) = 0;
 
   // Compiles the HLO module for ahead-of-time execution.  This is intended for
   // use in static compilation.
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
index f0507982b3749b179dbd7d76c46d39a209640661..d13a97bcc9a84afb22556389b4cdcd985f58d445 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
@@ -437,7 +437,8 @@ Status VerifyLlvmModule(const llvm::Module& llvm_module) {
 
 StatusOr<std::unique_ptr<HloModule>> CpuCompiler::RunHloPasses(
     std::unique_ptr<HloModule> module,
-    perftools::gputools::StreamExecutor* /*stream_exec*/) {
+    perftools::gputools::StreamExecutor* /*stream_exec*/,
+    DeviceMemoryAllocator* /*device_allocator*/) {
   VLOG(2) << "Before optimization:";
   XLA_VLOG_LINES(2, module->ToString());
 
@@ -450,7 +451,8 @@ StatusOr<std::unique_ptr<HloModule>> CpuCompiler::RunHloPasses(
 
 StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
     std::unique_ptr<HloModule> module,
-    perftools::gputools::StreamExecutor* stream_exec) {
+    perftools::gputools::StreamExecutor* stream_exec,
+    DeviceMemoryAllocator* /*device_allocator*/) {
   const string timer_message =
       "Compiling [" + module->name() + "] for CPU using JIT";
   XLA_SCOPED_LOGGING_TIMER(timer_message);
@@ -485,7 +487,7 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
   std::unordered_map<const HloInstruction*, int64> instruction_to_profile_idx;
   std::unordered_map<const HloComputation*, int64> computation_to_profile_idx;
   std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map;
-  std::unique_ptr<HloProfilePrinter> hlo_profile_printer;
+  std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data;
   if (module->config().hlo_profiling_enabled()) {
     hlo_profile_index_map = MakeUnique<HloProfileIndexMap>(*module);
 
@@ -505,8 +507,8 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
 
     HloCostAnalysis cost_analysis(shape_size_bytes);
     TF_RETURN_IF_ERROR(entry_computation->Accept(&cost_analysis));
-    hlo_profile_printer =
-        CreateHloProfilePrinter(*hlo_profile_index_map, cost_analysis);
+    hlo_profile_printer_data =
+        CreateHloProfilePrinterData(*hlo_profile_index_map, cost_analysis);
     computation_to_profile_idx =
         hlo_profile_index_map->computation_to_profile_idx();
   }
@@ -517,8 +519,8 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
   // ownership is std::moved.
   const bool embed_ir_in_executable =
       module->config().debug_options().xla_embed_ir_in_executable();
-  const string xla_dump_hlo_proto_to =
-      module->config().debug_options().xla_dump_hlo_proto_to();
+  const string xla_dump_optimized_hlo_proto_to =
+      module->config().debug_options().xla_dump_optimized_hlo_proto_to();
 
   if (options::CpuParallelBackendRequested(module->config())) {
     VLOG(1) << "Using parallel cpu backend";
@@ -538,10 +540,10 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
     // print one ourselves.
     XLA_VLOG_LINES(2, assignment->ToString());
 
-    if (!xla_dump_hlo_proto_to.empty()) {
+    if (!xla_dump_optimized_hlo_proto_to.empty()) {
       HloProto proto = MakeHloProto(*module, *assignment);
       TF_RETURN_IF_ERROR(protobuf_util::DumpProtoToDirectory(
-          proto, xla_dump_hlo_proto_to, module->name()));
+          proto, xla_dump_optimized_hlo_proto_to, module->name()));
     }
 
     // If we are using the parallel CPU backend, we need to create map from
@@ -619,7 +621,7 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
     cpu_executable.reset(new ParallelCpuExecutable(
         std::move(jit), std::move(assignment), std::move(module),
         std::move(function_names), std::move(aligned_constants),
-        std::move(hlo_profile_printer), std::move(hlo_profile_index_map)));
+        std::move(hlo_profile_printer_data), std::move(hlo_profile_index_map)));
 
     if (embed_ir_in_executable) {
       static_cast<CpuExecutable&>(*cpu_executable)
@@ -647,10 +649,10 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
     // print one ourselves.
     XLA_VLOG_LINES(2, assignment->ToString());
 
-    if (!xla_dump_hlo_proto_to.empty()) {
+    if (!xla_dump_optimized_hlo_proto_to.empty()) {
       HloProto proto = MakeHloProto(*module, *assignment);
       TF_RETURN_IF_ERROR(protobuf_util::DumpProtoToDirectory(
-          proto, xla_dump_hlo_proto_to, module->name()));
+          proto, xla_dump_optimized_hlo_proto_to, module->name()));
     }
 
     // Each computation is a single function.  Emit all embedded computations
@@ -698,7 +700,7 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
     jit->AddModule(std::move(llvm_module));
     cpu_executable.reset(new CpuExecutable(
         std::move(jit), std::move(assignment), std::move(module), function_name,
-        std::move(hlo_profile_printer), std::move(hlo_profile_index_map)));
+        std::move(hlo_profile_printer_data), std::move(hlo_profile_index_map)));
 
     if (embed_ir_in_executable) {
       static_cast<CpuExecutable&>(*cpu_executable)
@@ -826,12 +828,12 @@ CpuCompiler::CompileAheadOfTime(std::vector<std::unique_ptr<HloModule>> modules,
     // print one ourselves.
     XLA_VLOG_LINES(2, assignment->ToString());
 
-    const string xla_dump_hlo_proto_to =
-        module->config().debug_options().xla_dump_hlo_proto_to();
-    if (!xla_dump_hlo_proto_to.empty()) {
+    const string xla_dump_optimized_hlo_proto_to =
+        module->config().debug_options().xla_dump_optimized_hlo_proto_to();
+    if (!xla_dump_optimized_hlo_proto_to.empty()) {
       HloProto proto = MakeHloProto(*module, *assignment);
       TF_RETURN_IF_ERROR(protobuf_util::DumpProtoToDirectory(
-          proto, xla_dump_hlo_proto_to, module->name()));
+          proto, xla_dump_optimized_hlo_proto_to, module->name()));
     }
 
     IrEmitter ir_emitter(*module, *assignment, &llvm_module,
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.h b/tensorflow/compiler/xla/service/cpu/cpu_compiler.h
index ebed7058d8f7968c6e03ef90d0da6b2325037eb0..3498139ab95d21383c6dc008ae5614b7bfe91148 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.h
@@ -118,11 +118,13 @@ class CpuCompiler : public LLVMCompiler {
 
   StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
       std::unique_ptr<HloModule> module,
-      perftools::gputools::StreamExecutor* stream_exec) override;
+      perftools::gputools::StreamExecutor* stream_exec,
+      DeviceMemoryAllocator* device_allocator) override;
 
   StatusOr<std::unique_ptr<Executable>> RunBackend(
       std::unique_ptr<HloModule> module,
-      perftools::gputools::StreamExecutor* stream_exec) override;
+      perftools::gputools::StreamExecutor* stream_exec,
+      DeviceMemoryAllocator* device_allocator) override;
 
   StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
   CompileAheadOfTime(std::vector<std::unique_ptr<HloModule>> modules,
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
index f335bd1bbc7376d1cccc0fa6aa1c0a6d6ad559ab..802d0a6fb46890b31d14b1fbf3b2e7d6520caccc 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
@@ -55,9 +55,9 @@ CpuExecutable::CpuExecutable(
     std::unique_ptr<const BufferAssignment> assignment,
     std::unique_ptr<const HloModule> hlo_module,
     const string& entry_function_name,
-    std::unique_ptr<HloProfilePrinter> hlo_profile_printer,
+    std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data,
     std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map)
-    : Executable(std::move(hlo_module), std::move(hlo_profile_printer),
+    : Executable(std::move(hlo_module), std::move(hlo_profile_printer_data),
                  std::move(hlo_profile_index_map)),
       jit_(std::move(jit)),
       assignment_(std::move(assignment)) {
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.h b/tensorflow/compiler/xla/service/cpu/cpu_executable.h
index 50443a59954e222f65fc935e83effdaf6d6c8bf0..267b89a10b3c038dc2048f0ad5b5b343c88ef0f9 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_executable.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.h
@@ -51,7 +51,7 @@ class CpuExecutable : public Executable {
                 std::unique_ptr<const BufferAssignment> assignment,
                 std::unique_ptr<const HloModule> hlo_module,
                 const string& entry_function_name,
-                std::unique_ptr<HloProfilePrinter> hlo_profile_printer,
+                std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data,
                 std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map);
   ~CpuExecutable() override {}
 
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
index b03a9f9aa52493a78d40f33319a315fd1763ef11..0b2d3d47463b745049807e9afa55360434ad522b 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
@@ -62,6 +62,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/bits.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/lib/gtl/flatmap.h"
 #include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
@@ -478,7 +479,7 @@ Status IrEmitter::HandleOutfeed(HloInstruction* outfeed) {
 
 Status IrEmitter::HandleSort(HloInstruction* sort) {
   // TODO(b/26783907): Implement sort on CPU.
-  return Unimplemented("Sort is not supported on CPU (b/26783907).");
+  return Unimplemented("Sort is not implemented on CPU.");
 }
 
 Status IrEmitter::HandleTuple(HloInstruction* tuple) {
@@ -521,7 +522,7 @@ Status IrEmitter::HandleReduceWindow(HloInstruction* reduce_window) {
   // TODO(b/31410564): Implement dilation for reduce-window.
   if (window_util::HasDilation(window)) {
     return Unimplemented(
-        "Dilation for reduce-window not implemented on CPU. See b/31410564.");
+        "Dilation for ReduceWindow is not implemented on CPU.");
   }
 
   // The called computation should have been emitted previously.
@@ -624,8 +625,7 @@ Status IrEmitter::HandleSelectAndScatter(HloInstruction* select_and_scatter) {
   // TODO(b/31410564): Implement dilation for select-and-scatter.
   if (window_util::HasDilation(window)) {
     return Unimplemented(
-        "Dilation for select-and-scatter not implemented on CPU. "
-        "See b/31410564.");
+        "Dilation for SelectAndScatter is not implemented on CPU. ");
   }
 
   // The select and scatter computations should have been emitted previously.
@@ -1195,8 +1195,7 @@ Status IrEmitter::HandleCrossReplicaSum(HloInstruction* crs) {
   }
 
   // TODO(b/33011107): Support cross replica sum on CPU.
-  return Unimplemented(
-      "Cross replica sum is not implemented on CPU. See b/33011107.");
+  return Unimplemented("CrossReplicaSum is not implemented on CPU.");
 }
 
 // Fills up the free variables in 'index_with_free_var' with values from
@@ -1271,6 +1270,52 @@ Status IrEmitter::HandleParameter(HloInstruction* parameter) {
   return Status::OK();
 }
 
+// Returns true if the relative order of the unreduced dimensions stays the same
+// through the reduce operation.
+static bool ReductionPreservesLayout(const HloInstruction& reduce) {
+  DCHECK_EQ(reduce.opcode(), HloOpcode::kReduce);
+
+  // Maps dimensions that were not reduced from their dimension numbers in the
+  // source shape to their dimensions numbers in the destination shape.
+  //
+  // So if we reduce f32[A,B,C,D] on dimensions 1 and 2, this map contains
+  // [0->0, 3->1].
+  gtl::FlatMap<int64, int64> unreduced_dim_map;
+
+  gtl::FlatSet<int64> reduced_dims(reduce.dimensions().begin(),
+                                   reduce.dimensions().end());
+
+  const Shape& operand_shape = reduce.operand(0)->shape();
+  const Shape& result_shape = reduce.shape();
+
+  int64 delta = 0;
+  for (int64 i = 0; i < operand_shape.dimensions_size(); i++) {
+    if (reduced_dims.count(i)) {
+      delta++;
+    } else {
+      InsertOrDie(&unreduced_dim_map, i, i - delta);
+    }
+  }
+
+  // Iterate dimensions minor to major and check that the corresponding
+  // dimensions in the source and target shapes are equivalent.
+  int64 result_dim_idx = 0;
+  for (int64 operand_dim_idx = 0;
+       operand_dim_idx < operand_shape.dimensions_size(); operand_dim_idx++) {
+    int64 operand_dim = operand_shape.layout().minor_to_major(operand_dim_idx);
+    if (!reduced_dims.count(operand_dim)) {
+      if (FindOrDie(unreduced_dim_map, operand_dim) !=
+          result_shape.layout().minor_to_major(result_dim_idx++)) {
+        return false;
+      }
+    }
+  }
+
+  CHECK_EQ(result_dim_idx, result_shape.dimensions_size());
+
+  return true;
+}
+
 IrEmitter::ReductionGenerator IrEmitter::MatchReductionGenerator(
     HloComputation* function, string* failure_reason) const {
   CHECK_EQ(function->num_parameters(), 2);
@@ -1540,6 +1585,10 @@ StatusOr<bool> IrEmitter::EmitVectorizedReduce(
     HloInstruction* reduce, HloInstruction* arg, HloInstruction* init_value,
     gtl::ArraySlice<int64> dimensions, HloComputation* function,
     string* failure_reason) {
+  if (!ReductionPreservesLayout(*reduce)) {
+    return false;
+  }
+
   ReductionGenerator reduction_generator =
       MatchReductionGenerator(function, failure_reason);
   if (!reduction_generator) {
@@ -1760,12 +1809,12 @@ Status IrEmitter::HandleReduce(HloInstruction* reduce) {
 
 Status IrEmitter::HandleSend(HloInstruction* send) {
   // TODO(b/33942983): Support Send/Recv on CPU.
-  return Unimplemented("Send is not implemented on CPU. See b/33942983.");
+  return Unimplemented("Send is not implemented on CPU.");
 }
 
 Status IrEmitter::HandleSendDone(HloInstruction* send_done) {
   // TODO(b/33942983): Support Send/Recv on CPU.
-  return Unimplemented("Send-done is not implemented on CPU. See b/33942983.");
+  return Unimplemented("Send-done is not implemented on CPU.");
 }
 
 Status IrEmitter::HandleSlice(HloInstruction* slice) {
@@ -1930,12 +1979,12 @@ Status IrEmitter::HandleDynamicUpdateSlice(
 
 Status IrEmitter::HandleRecv(HloInstruction* recv) {
   // TODO(b/33942983): Support Send/Recv on CPU.
-  return Unimplemented("Recv is not implemented on CPU. See b/33942983.");
+  return Unimplemented("Recv is not implemented on CPU.");
 }
 
 Status IrEmitter::HandleRecvDone(HloInstruction* recv_done) {
   // TODO(b/33942983): Support Send/Recv on CPU.
-  return Unimplemented("Recv-done is not implemented on CPU. See b/33942983.");
+  return Unimplemented("Recv-done is not implemented on CPU.");
 }
 
 Status IrEmitter::HandlePad(HloInstruction* pad) {
@@ -1944,10 +1993,10 @@ Status IrEmitter::HandlePad(HloInstruction* pad) {
   for (auto& padding_dimension : pad->padding_config().dimensions()) {
     if (padding_dimension.edge_padding_low() < 0 ||
         padding_dimension.edge_padding_high() < 0) {
-      return Unimplemented(
-          "Negative padding not supported in the CPU backend (b/34628603); "
-          "this should have been eliminated at the HLO level: %s",
-          pad->padding_config().ShortDebugString().c_str());
+      return InternalErrorStrCat(
+          "Encountered negative padding in IrEmitter on CPU. "
+          "This should have been eliminated at the HLO level. ",
+          pad->ToString());
     }
   }
 
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc
index d1b88b27f068962fb86477fcad3e4390b1636c2b..cd997f07890cdc1d9a546ede58cc1d992b6416ae 100644
--- a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc
+++ b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc
@@ -61,9 +61,9 @@ ParallelCpuExecutable::ParallelCpuExecutable(
     std::unique_ptr<const HloInstructionMap<string>> function_names,
     std::unordered_map<const HloInstruction*, std::unique_ptr<unsigned char[]>>
         aligned_constants,
-    std::unique_ptr<HloProfilePrinter> hlo_profile_printer,
+    std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data,
     std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map)
-    : Executable(std::move(hlo_module), std::move(hlo_profile_printer),
+    : Executable(std::move(hlo_module), std::move(hlo_profile_printer_data),
                  std::move(hlo_profile_index_map)),
       jit_(std::move(jit)),
       assignment_(std::move(assignment)),
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h
index 90ac94ef9288b2e860cb30c47ed44a7b96e4825d..c393e9b8ea39bfb4c605ebba8e2cd29726bc4af9 100644
--- a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h
+++ b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h
@@ -55,7 +55,7 @@ class ParallelCpuExecutable : public Executable {
       std::unordered_map<const HloInstruction*,
                          std::unique_ptr<unsigned char[]>>
           aligned_constants,
-      std::unique_ptr<HloProfilePrinter> hlo_profile_printer,
+      std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data,
       std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map);
   ~ParallelCpuExecutable() override {}
 
diff --git a/tensorflow/compiler/xla/service/device_memory_allocator.cc b/tensorflow/compiler/xla/service/device_memory_allocator.cc
index 2e4b0a5230516b5308aeed892de9a49565a09f2e..78e7aa48accdbb51a8477455f5f9c004828c068f 100644
--- a/tensorflow/compiler/xla/service/device_memory_allocator.cc
+++ b/tensorflow/compiler/xla/service/device_memory_allocator.cc
@@ -24,7 +24,7 @@ limitations under the License.
 namespace xla {
 
 StreamExecutorMemoryAllocator::StreamExecutorMemoryAllocator(
-    perftools::gputools::Platform* platform,
+    const perftools::gputools::Platform* platform,
     tensorflow::gtl::ArraySlice<perftools::gputools::StreamExecutor*>
         stream_executors)
     : DeviceMemoryAllocator(platform),
diff --git a/tensorflow/compiler/xla/service/device_memory_allocator.h b/tensorflow/compiler/xla/service/device_memory_allocator.h
index 00caefab667cba6abfef200050ca18f229fc0320..39dfad84c1c1c1c461c24de555ecd919cea47d83 100644
--- a/tensorflow/compiler/xla/service/device_memory_allocator.h
+++ b/tensorflow/compiler/xla/service/device_memory_allocator.h
@@ -33,7 +33,7 @@ class DeviceMemoryAllocator {
  public:
   // Parameter platform indicates which platform the allocator allocates memory
   // on. Must be non-null.
-  explicit DeviceMemoryAllocator(perftools::gputools::Platform* platform)
+  explicit DeviceMemoryAllocator(const perftools::gputools::Platform* platform)
       : platform_(platform) {}
   virtual ~DeviceMemoryAllocator() {}
 
@@ -49,14 +49,14 @@ class DeviceMemoryAllocator {
       int device_ordinal, perftools::gputools::DeviceMemoryBase* mem) = 0;
 
   // Return the platform that the allocator allocates memory on.
-  perftools::gputools::Platform* platform() const { return platform_; }
+  const perftools::gputools::Platform* platform() const { return platform_; }
 
   // Can we call Deallocate() as soon as a computation has been scheduled on
   // a stream, or do we have to wait for the computation to complete first?
   virtual bool AllowsAsynchronousDeallocation() const = 0;
 
  protected:
-  perftools::gputools::Platform* platform_;
+  const perftools::gputools::Platform* platform_;
 };
 
 // Default memory allocator for a platform which uses
@@ -64,7 +64,7 @@ class DeviceMemoryAllocator {
 class StreamExecutorMemoryAllocator : public DeviceMemoryAllocator {
  public:
   StreamExecutorMemoryAllocator(
-      perftools::gputools::Platform* platform,
+      const perftools::gputools::Platform* platform,
       tensorflow::gtl::ArraySlice<perftools::gputools::StreamExecutor*>
           stream_executors);
 
diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
index 9780bac16ec17eed2c1df64f01bcb753e26b46f0..4468adbadbf823f1420a8b665a26f66cb7d36b43 100644
--- a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
@@ -428,7 +428,7 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitFloatUnaryOp(
           llvm::Intrinsic::round, {operand_value}, {operand_value->getType()},
           ir_builder_);
     case HloOpcode::kSign: {
-      // TODO(b/32151903): Ensure consistent sign behavior for -0.0
+      // TODO(b/32151903): Ensure consistent sign behavior for -0.0.
       auto type = operand_value->getType();
       auto zero = llvm::ConstantFP::get(type, 0.0);
       auto oeq = ir_builder_->CreateFCmpOEQ(operand_value, zero);
@@ -870,7 +870,10 @@ llvm::Value* ElementalIrEmitter::EmitFloatMin(llvm::Value* lhs_value,
 StatusOr<llvm::Value*> ElementalIrEmitter::EmitErfInv(PrimitiveType prim_type,
                                                       llvm::Value* x) const {
   if (prim_type != F32) {
-    return Unimplemented("inverse erf only implemented for F32 (b/34339814)");
+    // TODO(b/34339814): Implement inverse erf for F64.
+    return Unimplemented(
+        "Inverse erf is only implemented for element "
+        "type F32.");
   }
   auto getFloat = [&](const float f) {
     return llvm::ConstantFP::get(ir_builder_->getFloatTy(), f);
@@ -1040,17 +1043,9 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitIntegerBinaryOp(
           is_signed ? llvm::CmpInst::ICMP_SGE : llvm::CmpInst::ICMP_UGE,
           lhs_value, rhs_value, ir_builder_);
     case HloOpcode::kMinimum:
-      return ir_builder_->CreateSelect(
-          ir_builder_->CreateICmp(
-              is_signed ? llvm::ICmpInst::ICMP_SLE : llvm::ICmpInst::ICMP_ULE,
-              lhs_value, rhs_value),
-          lhs_value, rhs_value);
+      return EmitIntegralMin(lhs_value, rhs_value, is_signed);
     case HloOpcode::kMaximum:
-      return ir_builder_->CreateSelect(
-          ir_builder_->CreateICmp(
-              is_signed ? llvm::ICmpInst::ICMP_SGE : llvm::ICmpInst::ICMP_UGE,
-              lhs_value, rhs_value),
-          lhs_value, rhs_value);
+      return EmitIntegralMax(lhs_value, rhs_value, is_signed);
     case HloOpcode::kAnd:
       return ir_builder_->CreateAnd(lhs_value, rhs_value);
     case HloOpcode::kOr:
@@ -1067,6 +1062,26 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitIntegerBinaryOp(
   }
 }
 
+llvm::Value* ElementalIrEmitter::EmitIntegralMax(llvm::Value* lhs_value,
+                                                 llvm::Value* rhs_value,
+                                                 bool is_signed) const {
+  return ir_builder_->CreateSelect(
+      ir_builder_->CreateICmp(
+          is_signed ? llvm::ICmpInst::ICMP_SGE : llvm::ICmpInst::ICMP_UGE,
+          lhs_value, rhs_value),
+      lhs_value, rhs_value);
+}
+
+llvm::Value* ElementalIrEmitter::EmitIntegralMin(llvm::Value* lhs_value,
+                                                 llvm::Value* rhs_value,
+                                                 bool is_signed) const {
+  return ir_builder_->CreateSelect(
+      ir_builder_->CreateICmp(
+          is_signed ? llvm::ICmpInst::ICMP_SLE : llvm::ICmpInst::ICMP_ULE,
+          lhs_value, rhs_value),
+      lhs_value, rhs_value);
+}
+
 llvm_ir::IrArray::Index ElementalIrEmitter::ElementwiseSourceIndex(
     const llvm_ir::IrArray::Index& target_index, const HloInstruction& hlo,
     int64 operand_no) const {
@@ -1363,7 +1378,18 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator(
         TF_ASSIGN_OR_RETURN(llvm::Value * max_value,
                             operand_to_generator.at(hlo->operand(2))(
                                 ElementwiseSourceIndex(index, *hlo, 2)));
-        return EmitFloatMin(max_value, EmitFloatMax(min_value, arg_value));
+        PrimitiveType prim_type = hlo->shape().element_type();
+        if (primitive_util::IsFloatingPointType(prim_type)) {
+          return EmitFloatMin(max_value, EmitFloatMax(min_value, arg_value));
+        } else if (primitive_util::IsIntegralType(prim_type)) {
+          bool is_signed = primitive_util::IsSignedIntegralType(prim_type);
+          return EmitIntegralMin(
+              max_value, EmitIntegralMax(min_value, arg_value, is_signed),
+              is_signed);
+        } else {
+          return Unimplemented("Clamp unimplemented for %s",
+                               PrimitiveType_Name(prim_type).c_str());
+        }
       };
     case HloOpcode::kReducePrecision:
       return [this, hlo, &operand_to_generator](
diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.h b/tensorflow/compiler/xla/service/elemental_ir_emitter.h
index 1a48eb5fcb960b60d524ea56a43e15269576db76..c516a826d9e382bc738e54635426db639d17108c 100644
--- a/tensorflow/compiler/xla/service/elemental_ir_emitter.h
+++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.h
@@ -86,6 +86,12 @@ class ElementalIrEmitter {
   virtual llvm::Value* EmitFloatMin(llvm::Value* lhs_value,
                                     llvm::Value* rhs_value) const;
 
+  llvm::Value* EmitIntegralMax(llvm::Value* lhs_value, llvm::Value* rhs_value,
+                               bool is_signed) const;
+
+  llvm::Value* EmitIntegralMin(llvm::Value* lhs_value, llvm::Value* rhs_value,
+                               bool is_signed) const;
+
   virtual StatusOr<llvm::Value*> EmitErfInv(PrimitiveType prim_type,
                                             llvm::Value* value) const;
 
diff --git a/tensorflow/compiler/xla/service/executable.cc b/tensorflow/compiler/xla/service/executable.cc
index 21e7fbea291721dfc446bae2a7002a8ec2520be4..90481c7a88f90edea5399ee44aee2d2c77fc115f 100644
--- a/tensorflow/compiler/xla/service/executable.cc
+++ b/tensorflow/compiler/xla/service/executable.cc
@@ -73,7 +73,7 @@ StatusOr<std::unique_ptr<ShapedBuffer>> Executable::ExecuteOnStreamWrapper(
   std::unique_ptr<HloExecutionProfile> profile_ptr =
       module_config().debug_options().xla_hlo_profile() &&
               hlo_profiling_enabled()
-          ? MakeUnique<HloExecutionProfile>(&hlo_profile_printer(),
+          ? MakeUnique<HloExecutionProfile>(&hlo_profile_printer_data(),
                                             &hlo_profile_index_map())
           : nullptr;
 
diff --git a/tensorflow/compiler/xla/service/executable.h b/tensorflow/compiler/xla/service/executable.h
index 5ecfdffe211c571b1bb2bc30ff2acd3021c735ae..0aee535ee780ef000bc5e9963ff48786b3a61eb2 100644
--- a/tensorflow/compiler/xla/service/executable.h
+++ b/tensorflow/compiler/xla/service/executable.h
@@ -44,13 +44,14 @@ namespace xla {
 // interface that is used for launching compiled programs across platforms.
 class Executable {
  public:
-  explicit Executable(std::unique_ptr<const HloModule> hlo_module,
-                      std::unique_ptr<HloProfilePrinter> hlo_profile_printer,
-                      std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map)
+  explicit Executable(
+      std::unique_ptr<const HloModule> hlo_module,
+      std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data,
+      std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map)
       : hlo_module_(std::move(hlo_module)),
-        hlo_profile_printer_(std::move(hlo_profile_printer)),
+        hlo_profile_printer_data_(std::move(hlo_profile_printer_data)),
         hlo_profile_index_map_(std::move(hlo_profile_index_map)) {
-    CHECK_EQ(hlo_profile_printer_.get() == nullptr,
+    CHECK_EQ(hlo_profile_printer_data_.get() == nullptr,
              hlo_profile_index_map_.get() == nullptr);
   }
   virtual ~Executable() {}
@@ -116,9 +117,9 @@ class Executable {
         "Equality test on this executable is not implemented.");
   }
 
-  const HloProfilePrinter& hlo_profile_printer() const {
+  const HloProfilePrinterData& hlo_profile_printer_data() const {
     CHECK(hlo_profiling_enabled());
-    return *hlo_profile_printer_;
+    return *hlo_profile_printer_data_;
   }
 
   const HloProfileIndexMap& hlo_profile_index_map() const {
@@ -129,7 +130,9 @@ class Executable {
   // Returns whether this executable was compiled with HLO profilings support
   // enabled. If not, the caller should not expect an hlo_execution_profile
   // passed to ExecuteOnStream above to be populated during execution.
-  bool hlo_profiling_enabled() const { return hlo_profile_printer_ != nullptr; }
+  bool hlo_profiling_enabled() const {
+    return hlo_profile_printer_data_ != nullptr;
+  }
 
   const HloModule& module() const { return *hlo_module_; }
 
@@ -179,7 +182,7 @@ class Executable {
   // execution.
   int64 execution_count_ = 0;
 
-  std::unique_ptr<HloProfilePrinter> hlo_profile_printer_;
+  std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data_;
   std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map_;
 };
 
diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD
index df5e2e35f802b476f4d9fef2cd4816089663686f..80c2eed1097e4a7dcbf29b9b1c02fb9964983368 100644
--- a/tensorflow/compiler/xla/service/gpu/BUILD
+++ b/tensorflow/compiler/xla/service/gpu/BUILD
@@ -228,6 +228,7 @@ cc_library(
 cc_library(
     name = "gpu_executable",
     srcs = [
+        "conditional_thunk.cc",
         "convolution_thunk.cc",
         "copy_thunk.cc",
         "cudnn_batchnorm_thunk.cc",
@@ -243,6 +244,7 @@ cc_library(
         "while_thunk.cc",
     ],
     hdrs = [
+        "conditional_thunk.h",
         "convolution_thunk.h",
         "copy_thunk.h",
         "cudnn_batchnorm_thunk.h",
@@ -512,7 +514,6 @@ cc_library(
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_pass",
-        "@llvm//:core",
     ],
 )
 
diff --git a/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc b/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc
new file mode 100644
index 0000000000000000000000000000000000000000..790ca535b11ee47724ef6227de40726d940d6153
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc
@@ -0,0 +1,72 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/gpu/conditional_thunk.h"
+
+#include "tensorflow/compiler/xla/ptr_util.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/core/lib/core/errors.h"
+
+namespace xla {
+namespace gpu {
+
+ConditionalThunk::ConditionalThunk(
+    const BufferAllocation::Slice& predicate_buffer_index,
+    const BufferAllocation::Slice& true_operand_buffer_index,
+    const BufferAllocation::Slice& false_operand_buffer_index,
+    ThunkSequence true_thunk_sequence, ThunkSequence false_thunk_sequence,
+    const HloInstruction* hlo)
+    : Thunk(Kind::kConditional, hlo),
+      predicate_buffer_index_(predicate_buffer_index),
+      true_operand_buffer_index_(true_operand_buffer_index),
+      false_operand_buffer_index_(false_operand_buffer_index),
+      true_thunk_(std::move(true_thunk_sequence), hlo),
+      false_thunk_(std::move(false_thunk_sequence), hlo) {}
+
+Status ConditionalThunk::Initialize(const GpuExecutable& executable) {
+  TF_RETURN_IF_ERROR(true_thunk_.Initialize(executable));
+  TF_RETURN_IF_ERROR(false_thunk_.Initialize(executable));
+  return Status::OK();
+}
+
+Status ConditionalThunk::ExecuteOnStream(
+    const BufferAllocations& buffer_allocations,
+    perftools::gputools::Stream* stream) {
+  // Copy the predicate value from device.
+  bool predicate;
+  perftools::gputools::DeviceMemoryBase predicate_address =
+      buffer_allocations.GetDeviceAddress(predicate_buffer_index_);
+  stream->ThenMemcpy(&predicate, predicate_address, sizeof(bool));
+
+  Status block_status = stream->BlockHostUntilDone();
+  if (!block_status.ok()) {
+    return InternalError("Failed to retrieve predicate value on stream %p: %s.",
+                         stream, block_status.error_message().c_str());
+  }
+
+  // Execute the true or the false computation depending on the value of the
+  // predicate.
+  if (predicate) {
+    TF_RETURN_IF_ERROR(true_thunk_.ExecuteOnStream(buffer_allocations, stream));
+  } else {
+    TF_RETURN_IF_ERROR(
+        false_thunk_.ExecuteOnStream(buffer_allocations, stream));
+  }
+
+  return Status::OK();
+}
+
+}  // namespace gpu
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/conditional_thunk.h b/tensorflow/compiler/xla/service/gpu/conditional_thunk.h
new file mode 100644
index 0000000000000000000000000000000000000000..7725c46a3b4b51af34a4dd977885353ff32c21f6
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/conditional_thunk.h
@@ -0,0 +1,65 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CONDITIONAL_THUNK_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CONDITIONAL_THUNK_H_
+
+#include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h"
+#include "tensorflow/compiler/xla/service/gpu/sequential_thunk.h"
+#include "tensorflow/compiler/xla/service/gpu/thunk.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/core/platform/stream_executor_no_cuda.h"
+
+namespace xla {
+namespace gpu {
+
+// ConditionalThunk implements the conditional instruction on GPU by reading the
+// predicate of the conditional and executing the true or the false computation
+// depending on the value of the predicate.
+//
+// ConditionalThunk assumes that the buffers of the conditional result and the
+// result of the true and false computations share the same allocation. Also,
+// the buffers of the true operand of the conditional and that of the parameter
+// instruction of the true computation share the same allocation. Similarly, the
+// buffers of the false operand and that of the parameter instruction of the
+// false computation share the same allocation.
+class ConditionalThunk : public Thunk {
+ public:
+  ConditionalThunk(const BufferAllocation::Slice& predicate_buffer_index,
+                   const BufferAllocation::Slice& true_operand_buffer_index,
+                   const BufferAllocation::Slice& false_operand_buffer_index,
+                   ThunkSequence true_thunk_sequence,
+                   ThunkSequence false_thunk_sequence,
+                   const HloInstruction* hlo);
+
+  ConditionalThunk(const ConditionalThunk&) = delete;
+  ConditionalThunk& operator=(const ConditionalThunk&) = delete;
+
+  Status Initialize(const GpuExecutable& executable) override;
+  Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
+                         perftools::gputools::Stream* stream) override;
+
+ private:
+  BufferAllocation::Slice predicate_buffer_index_;
+  BufferAllocation::Slice true_operand_buffer_index_;
+  BufferAllocation::Slice false_operand_buffer_index_;
+  SequentialThunk true_thunk_;
+  SequentialThunk false_thunk_;
+};
+
+}  // namespace gpu
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CONDITIONAL_THUNK_H_
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
index 21798ed606fdb12e88c0d48f2c36cd9ad0c8344a..07543d42e3e47c2acb6bf908399bb88c3ae353b3 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
@@ -212,7 +212,9 @@ tensorflow::Status OptimizeHloModule(HloModule* hlo_module) {
 
 // Modifies the given HLO module so that it will be accepted by IrEmitter.
 // Unlike optimization passes, the passes are necessary for correctness.
-tensorflow::Status PrepareHloModuleForIrEmitting(HloModule* hlo_module) {
+tensorflow::Status PrepareHloModuleForIrEmitting(
+    HloModule* hlo_module, se::StreamExecutor* stream_exec,
+    DeviceMemoryAllocator* /*device_allocator*/) {
   // In some cases, we have to place the result of an instruction in a temporary
   // buffer. For instance, the buffer that holds an external parameter is
   // assumed immutable at this point, and should not be reused for output
@@ -410,7 +412,8 @@ GpuCompiler::GpuCompiler()
                         .getPointerSize(0 /* default address space */)) {}
 
 StatusOr<std::unique_ptr<HloModule>> GpuCompiler::RunHloPasses(
-    std::unique_ptr<HloModule> module, se::StreamExecutor* /*stream_exec*/) {
+    std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
+    DeviceMemoryAllocator* device_allocator) {
   XLA_SCOPED_LOGGING_TIMER("GpuCompiler::RunHloPasses");
   Tracing::TraceMe annotation("HLO Transforms", module->name(),
                               /*is_expensive=*/true);
@@ -419,12 +422,14 @@ StatusOr<std::unique_ptr<HloModule>> GpuCompiler::RunHloPasses(
 }
 
 StatusOr<std::unique_ptr<Executable>> GpuCompiler::RunBackend(
-    std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec) {
+    std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
+    DeviceMemoryAllocator* device_allocator) {
   XLA_SCOPED_LOGGING_TIMER("GpuCompiler::RunBackend");
 
   TF_RET_CHECK(stream_exec != nullptr);
 
-  TF_RETURN_IF_ERROR(PrepareHloModuleForIrEmitting(module.get()));
+  TF_RETURN_IF_ERROR(PrepareHloModuleForIrEmitting(module.get(), stream_exec,
+                                                   device_allocator));
 
   llvm::LLVMContext llvm_context;
   std::string buffer;
@@ -463,12 +468,12 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::RunBackend(
   // print one ourselves.
   XLA_VLOG_LINES(2, buffer_assignment->ToString());
   XLA_VLOG_LINES(2, module->ToString());
-  const string xla_dump_hlo_proto_to =
-      module->config().debug_options().xla_dump_hlo_proto_to();
-  if (!xla_dump_hlo_proto_to.empty()) {
+  const string xla_dump_optimized_hlo_proto_to =
+      module->config().debug_options().xla_dump_optimized_hlo_proto_to();
+  if (!xla_dump_optimized_hlo_proto_to.empty()) {
     HloProto proto = MakeHloProto(*module, *buffer_assignment);
     TF_RETURN_IF_ERROR(protobuf_util::DumpProtoToDirectory(
-        proto, xla_dump_hlo_proto_to, module->name()));
+        proto, xla_dump_optimized_hlo_proto_to, module->name()));
   }
 
   IrEmitterContext ir_emitter_context(module.get(), buffer_assignment.get(),
@@ -593,14 +598,14 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::RunBackend(
   XLA_VLOG_LINES(2, thunk_schedule->ToString());
 
   std::unique_ptr<HloProfileIndexMap> profile_index_map;
-  std::unique_ptr<HloProfilePrinter> profile_printer;
+  std::unique_ptr<HloProfilePrinterData> profile_printer;
 
   if (module->config().hlo_profiling_enabled()) {
     HloCostAnalysis cost_analysis(ShapeSizeBytesFunction());
     TF_RETURN_IF_ERROR(module->entry_computation()->Accept(&cost_analysis));
     profile_index_map = MakeUnique<HloProfileIndexMap>(*module);
     profile_printer =
-        CreateHloProfilePrinter(*profile_index_map, cost_analysis);
+        CreateHloProfilePrinterData(*profile_index_map, cost_analysis);
   }
 
   auto* gpu_executable = new GpuExecutable(
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.h b/tensorflow/compiler/xla/service/gpu/gpu_compiler.h
index 18e34340205b6f51497e26c45520799d21c55a46..c352d4d8462fadb266c55ad437de998e86a6528e 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.h
@@ -51,11 +51,13 @@ class GpuCompiler : public LLVMCompiler {
 
   StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
       std::unique_ptr<HloModule> module,
-      perftools::gputools::StreamExecutor* stream_exec) override;
+      perftools::gputools::StreamExecutor* stream_exec,
+      DeviceMemoryAllocator* device_allocator) override;
 
   StatusOr<std::unique_ptr<Executable>> RunBackend(
       std::unique_ptr<HloModule> module,
-      perftools::gputools::StreamExecutor* stream_exec) override;
+      perftools::gputools::StreamExecutor* stream_exec,
+      DeviceMemoryAllocator* device_allocator) override;
 
   StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
   CompileAheadOfTime(std::vector<std::unique_ptr<HloModule>> module,
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.cc b/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.cc
index e67087d822e2f3367c48b08be66f5f60791be638..e3b493c6630d061c00dc6c67bdaecdb2e5d68533 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.cc
@@ -36,7 +36,7 @@ namespace gpu {
 
 StatusOr<HloInstruction*> GpuCopyInsertion::FindOrInsertCopy(
     HloInstruction* hlo) {
-  HloInstruction*& copy = inserted_copies_[hlo];
+  HloInstruction*& copy = hlo_to_copy_map_[hlo];
   if (copy == nullptr) {
     TF_ASSIGN_OR_RETURN(copy, hlo->parent()->DeepCopyInstruction(hlo));
   }
@@ -86,27 +86,34 @@ StatusOr<bool> GpuCopyInsertion::Run(HloModule* module) {
     }
   }
 
-  // Init values of a while node cannot be constants. Insert copies for any
-  // constants found at the operand of a while.
-  tensorflow::gtl::FlatSet<HloInstruction*> copied_constants;
+  // Init values of while and conditional nodes cannot be constants. Insert
+  // copies for any constants found at the operands of these nodes.
+  tensorflow::gtl::FlatSet<HloInstruction*> inserted_copies;
   for (HloComputation* computation : module->computations()) {
     for (HloInstruction* instruction : computation->instructions()) {
-      if (instruction->opcode() != HloOpcode::kWhile) {
+      if (instruction->opcode() != HloOpcode::kWhile &&
+          instruction->opcode() != HloOpcode::kConditional) {
         continue;
       }
-      for (auto& pair :
-               dataflow->GetInstructionValueSet(instruction->operand(0))) {
-        const HloValueSet& value_set = pair.second;
-        for (const HloValue* value : value_set.values()) {
-          if (value->defining_instruction()->opcode() ==
-              HloOpcode::kConstant &&
-              !ContainsKey(copied_constants, value->defining_instruction())) {
-            HloInstruction* constant = value->defining_instruction();
-            TF_ASSIGN_OR_RETURN(HloInstruction * copy,
-                                FindOrInsertCopy(constant));
-            TF_RETURN_IF_ERROR(constant->ReplaceAllUsesWith(copy));
-            copied_constants.insert(constant);
-            changed = true;
+      for (auto operand : instruction->operands()) {
+        // Skip the operands that have already been replaced with a copy in a
+        // previous iteration (which is possible when a constant is used as an
+        // operand in multiple places).
+        if (ContainsKey(inserted_copies, operand)) {
+          continue;
+        }
+        for (auto& pair : dataflow->GetInstructionValueSet(operand)) {
+          const HloValueSet& value_set = pair.second;
+          for (const HloValue* value : value_set.values()) {
+            if (value->defining_instruction()->IsConstant() &&
+                !ContainsKey(hlo_to_copy_map_, value->defining_instruction())) {
+              HloInstruction* constant = value->defining_instruction();
+              TF_ASSIGN_OR_RETURN(HloInstruction * copy,
+                                  FindOrInsertCopy(constant));
+              TF_RETURN_IF_ERROR(constant->ReplaceAllUsesWith(copy));
+              inserted_copies.insert(copy);
+              changed = true;
+            }
           }
         }
       }
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.h b/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.h
index 4d77f337e6eb20f7d79acc0829fde26bbe443f25..0c6f9b511f3aac5f62182273b827adcd068cd633 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.h
@@ -32,13 +32,13 @@ class GpuCopyInsertion : public HloPassInterface {
   StatusOr<bool> Run(HloModule* module) override;
 
  protected:
-  // Returns a copy of `hlo`. Looks in inserted_copies_ first to avoid making
+  // Returns a copy of `hlo`. Looks in hlo_to_copy_map_ first to avoid making
   // duplicate copies.
   StatusOr<HloInstruction*> FindOrInsertCopy(HloInstruction* hlo);
 
   // A map containing all copies inserted to materialize operands of library
   // calls. The key is the copied instruction and the value is the copy.
-  tensorflow::gtl::FlatMap<HloInstruction*, HloInstruction*> inserted_copies_;
+  tensorflow::gtl::FlatMap<HloInstruction*, HloInstruction*> hlo_to_copy_map_;
 };
 
 }  // namespace gpu
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
index 51d164cdf427f9513bc340e090832a9b064b999c..f5d67b9ea9498df3f023ea9a694a63b468c5be18 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
@@ -116,9 +116,9 @@ GpuExecutable::GpuExecutable(
     std::unique_ptr<const ThunkSchedule> thunk_schedule,
     std::unique_ptr<const HloModule> hlo_module,
     std::unique_ptr<const BufferAssignment> assignment,
-    std::unique_ptr<HloProfilePrinter> hlo_profile_printer,
+    std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data,
     std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map)
-    : Executable(std::move(hlo_module), std::move(hlo_profile_printer),
+    : Executable(std::move(hlo_module), std::move(hlo_profile_printer_data),
                  std::move(hlo_profile_index_map)),
       ptx_(ptx),
       cubin_(cubin),
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.h b/tensorflow/compiler/xla/service/gpu/gpu_executable.h
index 00da64dfade8ddb0694c0ee7ac158c9f2e15a508..b19cfd43debd0a5490495d176fa2f1fcd625da07 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.h
@@ -54,7 +54,7 @@ class GpuExecutable : public Executable {
                 std::unique_ptr<const ThunkSchedule> thunk_schedule,
                 std::unique_ptr<const HloModule> hlo_module,
                 std::unique_ptr<const BufferAssignment> assignment,
-                std::unique_ptr<HloProfilePrinter> hlo_profile_printer,
+                std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data,
                 std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map);
 
   // This should be called after set_ir_module_string.
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc
index 095c3df3bfc75cae999edc7fdd800f6e399546dd..affd2ffa8edf177867637e1bfde4c3ca41d5c6f0 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc
@@ -615,8 +615,7 @@ Status IrEmitter::HandleFft(HloInstruction* fft) {
 
 Status IrEmitter::HandleCrossReplicaSum(HloInstruction* crs) {
   // TODO(b/33011107): Support cross replica sum on GPU.
-  return Unimplemented(
-      "Cross replica sum not implemented on GPU. See b/33011107.");
+  return Unimplemented("CrossReplicaSum is not implemented on GPU.");
 }
 
 Status IrEmitter::HandleParameter(HloInstruction* parameter) {
@@ -710,11 +709,13 @@ Status IrEmitter::HandleCustomCall(HloInstruction*) {
 }
 
 Status IrEmitter::HandleInfeed(HloInstruction*) {
-  return Unimplemented("Infeed is not supported on GPU (b/30467474).");
+  // TODO(b/30467474): Implement infeed on GPU.
+  return Unimplemented("Infeed is not supported on GPU.");
 }
 
 Status IrEmitter::HandleOutfeed(HloInstruction*) {
-  return Unimplemented("Outfeed is not supported on GPU (b/34359662).");
+  // TODO(b/34359662): Implement outfeed on GPU.
+  return Unimplemented("Outfeed is not supported on GPU.");
 }
 
 Status IrEmitter::HandleRng(HloInstruction* random) {
@@ -758,37 +759,6 @@ Status IrEmitter::HandleBatchNormGrad(HloInstruction*) {
       "to a cudnn CustomCall using CudnnBatchNormRewriter.");
 }
 
-Status IrEmitter::HandleConditional(HloInstruction* conditional) {
-  auto pred = conditional->operand(0);
-  auto true_arg = conditional->operand(1);
-  auto false_arg = conditional->operand(2);
-
-  llvm::Value* conditional_result = GetBasePointer(*conditional);
-
-  llvm::LoadInst* pred_value = ir_builder_.CreateLoad(
-      GetBasePointer(*pred),
-      llvm_ir::AsStringRef(IrName(conditional, "load_predicate_value")));
-  llvm::Value* pred_cond = ir_builder_.CreateICmpNE(
-      pred_value,
-      llvm::ConstantInt::get(llvm_ir::PrimitiveTypeToIrType(PRED, module_), 0),
-      llvm_ir::AsStringRef(IrName(conditional, "boolean_predicate")));
-  llvm_ir::LlvmIfData if_data = llvm_ir::EmitIfThenElse(
-      pred_cond, IrName(conditional, "if_then_else"), &ir_builder_);
-
-  SetToFirstInsertPoint(if_data.true_block, &ir_builder_);
-  TF_RETURN_IF_ERROR(EmitCallToNestedComputation(
-      *conditional->true_computation(), {GetBasePointer(*true_arg)},
-      conditional_result));
-
-  SetToFirstInsertPoint(if_data.false_block, &ir_builder_);
-  TF_RETURN_IF_ERROR(EmitCallToNestedComputation(
-      *conditional->false_computation(), {GetBasePointer(*false_arg)},
-      conditional_result));
-
-  SetToFirstInsertPoint(if_data.after_block, &ir_builder_);
-  return Status::OK();
-}
-
 llvm_ir::IrArray::Index IrEmitter::EmitOperandArrayLoopNest(
     const llvm_ir::IrArray& operand_array, int64 reduction_dimension,
     tensorflow::StringPiece name_suffix, llvm_ir::ForLoopNest* loop_nest) {
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter.h b/tensorflow/compiler/xla/service/gpu/ir_emitter.h
index 39bafaa34656a35f24444dc7f3665c1250833921..3aa178410f05aef3630a4bd83b9651f6c1aac79b 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter.h
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter.h
@@ -96,7 +96,6 @@ class IrEmitter : public DfsHloVisitorWithDefault {
   Status HandleCall(HloInstruction* call) override;
   Status HandleCustomCall(HloInstruction* custom_call) override;
   Status HandleRng(HloInstruction* random) override;
-  Status HandleConditional(HloInstruction* conditional) override;
   Status HandleBatchNormInference(HloInstruction* batch_norm) override;
   Status HandleBatchNormTraining(HloInstruction* batch_norm) override;
   Status HandleBatchNormGrad(HloInstruction* batch_norm) override;
@@ -367,6 +366,11 @@ class IrEmitterUnnested : public IrEmitter {
   std::unique_ptr<Thunk> BuildForThunk(const HloInstruction* hlo,
                                        const int64 loop_limit);
 
+  // Returns a ConditionalThunk that executes the thunk sequence for
+  // 'true_computation' or 'false_computation' depending on the value of the
+  // predicate in the given conditional instruction.
+  std::unique_ptr<Thunk> BuildConditionalThunk(const HloInstruction* hlo);
+
   Status Postprocess(HloInstruction* hlo) override;
 
   // Returns the last generated thunk.
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
index be35351e8727ce15998460e41f21a53ebe427c3b..bd428f802846b2306635c513211ec647ad73c89d 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/buffer_assignment.h"
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor.h"
+#include "tensorflow/compiler/xla/service/gpu/conditional_thunk.h"
 #include "tensorflow/compiler/xla/service/gpu/convolution_thunk.h"
 #include "tensorflow/compiler/xla/service/gpu/copy_thunk.h"
 #include "tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.h"
@@ -272,8 +273,8 @@ Status IrEmitterUnnested::HandleDot(HloInstruction* dot) {
 }
 
 Status IrEmitterUnnested::HandleConditional(HloInstruction* conditional) {
-  thunk_sequence_->push_back(BuildKernelThunk(conditional));
-  return IrEmitter::HandleConditional(conditional);
+  thunk_sequence_->emplace_back(BuildConditionalThunk(conditional));
+  return Status::OK();
 }
 
 Status IrEmitterUnnested::HandleConvolution(HloInstruction* convolution) {
@@ -1657,8 +1658,7 @@ Status IrEmitterUnnested::HandleSelectAndScatter(
   // TODO(b/31410564): Implement dilation rate for select-and-scatter.
   if (window_util::HasDilation(window)) {
     return Unimplemented(
-        "Dilation for select-and-scatter not implemented on GPU. "
-        "See b/31410564.");
+        "Dilation for SelectAndScatter not implemented on GPU.");
   }
 
   // kSelectAndScatter is implemented as two kernel launches: the first launch
@@ -2102,6 +2102,24 @@ Status IrEmitterUnnested::EmitInitializer(const HloInstruction* hlo,
 
 namespace {
 
+// Checks that the buffers corresponding to the given two HLOs share the same
+// allocation.
+Status CheckHloBuffersShareAllocation(
+    const HloInstruction* a, const HloInstruction* b, const ShapeIndex& index,
+    const BufferAssignment& buffer_assignment) {
+  const BufferAllocation::Slice slice_a =
+      buffer_assignment.GetUniqueSlice(a, index).ConsumeValueOrDie();
+  const BufferAllocation::Slice slice_b =
+      buffer_assignment.GetUniqueSlice(b, index).ConsumeValueOrDie();
+  if (slice_a != slice_b) {
+    return InternalError(
+        "instruction %s %s does not share allocation with instruction %s %s",
+        a->ToString().c_str(), slice_a.ToString().c_str(),
+        b->ToString().c_str(), slice_b.ToString().c_str());
+  }
+  return Status::OK();
+}
+
 // Checks that all buffers used during while loop iteration share the same
 // buffer allocation. This includes buffers for while result, while init
 // operand, condition parameter, body parameter and body result.
@@ -2111,37 +2129,65 @@ Status CheckWhileBuffersShareAllocation(
     const BufferAssignment& buffer_assignment) {
   return ShapeUtil::ForEachSubshapeWithStatus(
       xla_while->shape(),
-      [&buffer_assignment, &xla_while](const Shape& /*subshape*/,
-                                       const ShapeIndex& index) -> Status {
-        auto check = [&buffer_assignment](const HloInstruction* a,
-                                          const HloInstruction* b,
-                                          const ShapeIndex& index) -> Status {
-          const BufferAllocation::Slice slice_a =
-              buffer_assignment.GetUniqueSlice(a, index).ConsumeValueOrDie();
-          const BufferAllocation::Slice slice_b =
-              buffer_assignment.GetUniqueSlice(b, index).ConsumeValueOrDie();
-          if (slice_a != slice_b) {
-            return InternalError(
-                "instruction %s %s does not share allocation with "
-                "instruction %s %s",
-                a->ToString().c_str(), slice_a.ToString().c_str(),
-                b->ToString().c_str(), slice_b.ToString().c_str());
-          }
-          return Status::OK();
-        };
+      [&](const Shape& /*subshape*/, const ShapeIndex& index) -> Status {
         const HloInstruction* condition_parameter =
             xla_while->while_condition()->parameter_instruction(0);
         const HloComputation* body = xla_while->while_body();
         const HloInstruction* body_parameter = body->parameter_instruction(0);
         const HloInstruction* body_result = body->root_instruction();
-        TF_RETURN_IF_ERROR(check(xla_while, xla_while->operand(0), index));
-        TF_RETURN_IF_ERROR(check(xla_while, condition_parameter, index));
-        TF_RETURN_IF_ERROR(check(xla_while, body_parameter, index));
-        TF_RETURN_IF_ERROR(check(xla_while, body_result, index));
+        TF_RETURN_IF_ERROR(CheckHloBuffersShareAllocation(
+            xla_while, xla_while->operand(0), index, buffer_assignment));
+        TF_RETURN_IF_ERROR(CheckHloBuffersShareAllocation(
+            xla_while, condition_parameter, index, buffer_assignment));
+        TF_RETURN_IF_ERROR(CheckHloBuffersShareAllocation(
+            xla_while, body_parameter, index, buffer_assignment));
+        TF_RETURN_IF_ERROR(CheckHloBuffersShareAllocation(
+            xla_while, body_result, index, buffer_assignment));
         return Status::OK();
       });
 }
 
+// Checks that the buffers used in a conditional instruction are shared with the
+// operands and result as follows:
+//   * The result buffer of the conditional should share the allocation with the
+//     result buffers of the true and false computations.
+//   * The buffer of operand 1 should share the allocation with the buffer of
+//     the parameter 0 instruction of the true computation.
+//   * The buffer of operand 2 should share the allocation with the buffer of
+//     the parameter 0 instruction of the false computation.
+Status CheckConditionalBuffersShareAllocation(
+    const HloInstruction* conditional,
+    const BufferAssignment& buffer_assignment) {
+  TF_RETURN_IF_ERROR(ShapeUtil::ForEachSubshapeWithStatus(
+      conditional->shape(),
+      [&](const Shape& /*subshape*/, const ShapeIndex& index) -> Status {
+        TF_RETURN_IF_ERROR(CheckHloBuffersShareAllocation(
+            conditional, conditional->true_computation()->root_instruction(),
+            index, buffer_assignment));
+        TF_RETURN_IF_ERROR(CheckHloBuffersShareAllocation(
+            conditional, conditional->false_computation()->root_instruction(),
+            index, buffer_assignment));
+        return Status::OK();
+      }));
+  TF_RETURN_IF_ERROR(ShapeUtil::ForEachSubshapeWithStatus(
+      conditional->operand(1)->shape(),
+      [&](const Shape& /*subshape*/, const ShapeIndex& index) -> Status {
+        return CheckHloBuffersShareAllocation(
+            conditional->operand(1),
+            conditional->true_computation()->parameter_instruction(0), index,
+            buffer_assignment);
+      }));
+  TF_RETURN_IF_ERROR(ShapeUtil::ForEachSubshapeWithStatus(
+      conditional->operand(2)->shape(),
+      [&](const Shape& /*subshape*/, const ShapeIndex& index) -> Status {
+        return CheckHloBuffersShareAllocation(
+            conditional->operand(2),
+            conditional->false_computation()->parameter_instruction(0), index,
+            buffer_assignment);
+      }));
+  return Status::OK();
+}
+
 }  // namespace
 
 std::unique_ptr<Thunk> IrEmitterUnnested::BuildWhileThunk(
@@ -2184,6 +2230,31 @@ std::unique_ptr<Thunk> IrEmitterUnnested::BuildForThunk(
                               ir_emitter_body.ConsumeThunkSequence(), hlo);
 }
 
+std::unique_ptr<Thunk> IrEmitterUnnested::BuildConditionalThunk(
+    const HloInstruction* hlo) {
+  // Check that the buffers used in conditional are shared with the operands and
+  // result appropriately.
+  TF_CHECK_OK(CheckConditionalBuffersShareAllocation(
+      hlo, ir_emitter_context_->buffer_assignment()));
+
+  HloComputation* true_computation = hlo->true_computation();
+  IrEmitterUnnested ir_emitter_true(hlo_module_config_, true_computation,
+                                    ir_emitter_context_);
+  TF_CHECK_OK(true_computation->root_instruction()->Accept(&ir_emitter_true));
+
+  HloComputation* false_computation = hlo->false_computation();
+  IrEmitterUnnested ir_emitter_false(hlo_module_config_, false_computation,
+                                     ir_emitter_context_);
+  TF_CHECK_OK(false_computation->root_instruction()->Accept(&ir_emitter_false));
+
+  return MakeUnique<ConditionalThunk>(
+      GetAllocationSlice(*hlo->operand(0)),
+      GetAllocationSlice(*hlo->operand(1)),
+      GetAllocationSlice(*hlo->operand(2)),
+      std::move(*ir_emitter_true.ConsumeThunkSequence()),
+      std::move(*ir_emitter_false.ConsumeThunkSequence()), hlo);
+}
+
 Status IrEmitterUnnested::EmitTargetElementLoopInThunk(
     const HloInstruction& hlo,
     const llvm_ir::ElementGenerator& element_generator, KernelThunk* thunk) {
diff --git a/tensorflow/compiler/xla/service/gpu/thunk.h b/tensorflow/compiler/xla/service/gpu/thunk.h
index 625c3f8bea418b7942145a05ba42b9ea9b14543b..2c3032d79be221e8cacb178ffb1817459b603cc0 100644
--- a/tensorflow/compiler/xla/service/gpu/thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/thunk.h
@@ -41,6 +41,7 @@ class GpuExecutable;
 class Thunk {
  public:
   enum class Kind {
+    kConditional,
     kConvolution,
     kCopy,
     kCudnnBatchNormBackward,
diff --git a/tensorflow/compiler/xla/service/heap_simulator.cc b/tensorflow/compiler/xla/service/heap_simulator.cc
index 34e2f7ee206c6a74073d8f4e867e862feb4aff49..cde5877e29f36abc61c5417ce960e2c7699e2749 100644
--- a/tensorflow/compiler/xla/service/heap_simulator.cc
+++ b/tensorflow/compiler/xla/service/heap_simulator.cc
@@ -64,10 +64,8 @@ StatusOr<HeapSimulator::Result> HeapSimulator::Run(
     std::unique_ptr<HeapAlgorithm> algorithm, const HloModule& module,
     const SequentialHloOrdering::HloModuleSequence& module_sequence,
     const TuplePointsToAnalysis& points_to_analysis,
-    const LogicalBuffer::SizeFunction& size_fn,
-    const FlatSet<const LogicalBuffer*>* buffers_to_assign) {
-  HeapSimulator heap(std::move(algorithm), size_fn, buffers_to_assign,
-                     &module_sequence);
+    const LogicalBuffer::SizeFunction& size_fn, const Options& options) {
+  HeapSimulator heap(std::move(algorithm), size_fn, options, &module_sequence);
   const HloComputation* entry_computation = module.entry_computation();
   const std::vector<const HloInstruction*>& instruction_sequence =
       FindOrDie(module_sequence, entry_computation);
@@ -81,9 +79,8 @@ StatusOr<HeapSimulator::Result> HeapSimulator::Run(
     std::unique_ptr<HeapAlgorithm> algorithm, const HloComputation& computation,
     const std::vector<const HloInstruction*>& instruction_sequence,
     const TuplePointsToAnalysis& points_to_analysis,
-    const LogicalBuffer::SizeFunction& size_fn,
-    const FlatSet<const LogicalBuffer*>* buffers_to_assign) {
-  HeapSimulator heap(std::move(algorithm), size_fn, buffers_to_assign,
+    const LogicalBuffer::SizeFunction& size_fn, const Options& options) {
+  HeapSimulator heap(std::move(algorithm), size_fn, options,
                      /*module_sequence=*/nullptr);
   TF_RETURN_IF_ERROR(heap.RunComputation(computation, instruction_sequence,
                                          points_to_analysis));
@@ -199,15 +196,17 @@ Status HeapSimulator::RunComputation(
       // We can only share with the operand buffer if it is about to be freed;
       // we must be the last user of the buffer.
       bool shared = false;
-      for (const LogicalBuffer* operand_buffer : operand_buffers_to_free) {
-        if (buffer->instruction()->IsUserOf(operand_buffer->instruction()) &&
-            buffer->instruction()->opcode() != HloOpcode::kCopy &&
-            CanShareOperandBufferWithUser(
-                operand_buffer->instruction(), operand_buffer->index(),
-                buffer->instruction(), buffer->index(), points_to_analysis)) {
-          ShareBuffer(buffer, operand_buffer, instruction);
-          shared = true;
-          break;
+      if (options_.may_reuse_operand_buffers) {
+        for (const LogicalBuffer* operand_buffer : operand_buffers_to_free) {
+          if (buffer->instruction()->IsUserOf(operand_buffer->instruction()) &&
+              buffer->instruction()->opcode() != HloOpcode::kCopy &&
+              CanShareOperandBufferWithUser(
+                  operand_buffer->instruction(), operand_buffer->index(),
+                  buffer->instruction(), buffer->index(), points_to_analysis)) {
+            ShareBuffer(buffer, operand_buffer, instruction);
+            shared = true;
+            break;
+          }
         }
       }
 
@@ -266,13 +265,12 @@ Status HeapSimulator::RunComputation(
 
 HeapSimulator::HeapSimulator(
     std::unique_ptr<HeapAlgorithm> algorithm,
-    const LogicalBuffer::SizeFunction& size_fn,
-    const FlatSet<const LogicalBuffer*>* buffers_to_assign,
+    const LogicalBuffer::SizeFunction& size_fn, const Options& options,
     const SequentialHloOrdering::HloModuleSequence* module_sequence)
     : no_fragmentation_stats_(MakeUnique<NoFragmentationStatsHeap>()),
       algorithm_(std::move(algorithm)),
       size_fn_(size_fn),
-      buffers_to_assign_(buffers_to_assign),
+      options_(options),
       module_sequence_(module_sequence) {
   debug_trace_.set_whole_module_simulation(module_sequence_ != nullptr);
 }
@@ -280,13 +278,16 @@ HeapSimulator::HeapSimulator(
 HeapSimulator::~HeapSimulator() {}
 
 bool HeapSimulator::IgnoreBuffer(const LogicalBuffer* buffer) const {
-  // Buffers for constants are ignored, as with BufferAssigner.  Also ignore
-  // buffers that we're not meant to assign.
+  // Buffers for constants are ignored unless the alloc_constants option is
+  // set. Also ignore buffers that we're not meant to assign.
   //
   // TODO(b/32248867): For consistency, constants should get allocations.
-  return buffer->instruction()->opcode() == HloOpcode::kConstant ||
-         (buffers_to_assign_ != nullptr &&
-          buffers_to_assign_->count(buffer) == 0);
+  if (!options_.alloc_constants &&
+      buffer->instruction()->opcode() == HloOpcode::kConstant) {
+    return true;
+  }
+  return options_.buffers_to_assign != nullptr &&
+         options_.buffers_to_assign->count(buffer) == 0;
 }
 
 // Alloc always calls the underlying heap algorithm.
@@ -400,8 +401,8 @@ HeapSimulator::Result HeapSimulator::Finish() {
     }
     // If we were told to assign specific buffers, make sure we've assigned
     // exactly that many buffers.
-    if (buffers_to_assign_ != nullptr) {
-      CHECK_EQ(buffers_to_assign_->size(), result.chunk_map.size());
+    if (options_.buffers_to_assign != nullptr) {
+      CHECK_EQ(options_.buffers_to_assign->size(), result.chunk_map.size());
     }
   }
 
diff --git a/tensorflow/compiler/xla/service/heap_simulator.h b/tensorflow/compiler/xla/service/heap_simulator.h
index 88a8698d16132372fc8f4e87eba3b99125aab876..636f19dd39f09721bd82fc4b44785f196f281ad7 100644
--- a/tensorflow/compiler/xla/service/heap_simulator.h
+++ b/tensorflow/compiler/xla/service/heap_simulator.h
@@ -67,6 +67,23 @@ class HeapSimulator {
     HeapSimulatorTrace debug_trace;
   };
 
+  // The different options to be passed to the Run() APIs.
+  struct Options {
+    Options()
+        : may_reuse_operand_buffers(true),
+          alloc_constants(false),
+          buffers_to_assign(nullptr) {}
+
+    // Whether a buffer about to be Free()-ed, can be recycled for a new born
+    // one, hence collapsing Free()+Alloc() calls (default true).
+    bool may_reuse_operand_buffers;
+    // Whether to issue Alloc() and Free() calls for constants (default false).
+    bool alloc_constants;
+    // If 'buffers_to_assign' is provided, only those buffers are assigned
+    // offsets, otherwise all buffers defined by the instructions are assigned.
+    const tensorflow::gtl::FlatSet<const LogicalBuffer*>* buffers_to_assign;
+  };
+
   // Run the heap simulation with the given algorithm, assuming the given
   // module_sequence, which must contain a topologically-consistent total
   // ordering of all instructions within each computation. The result is invalid
@@ -76,15 +93,12 @@ class HeapSimulator {
   // to running on a per-computation basis, since we can re-use buffer space for
   // called sub-computations.
   //
-  // If 'buffers_to_assign' is provided, only those buffers are assigned
-  // offsets, otherwise all buffers defined by the instructions are assigned.
   static StatusOr<Result> Run(
       std::unique_ptr<HeapAlgorithm> algorithm, const HloModule& module,
       const SequentialHloOrdering::HloModuleSequence& module_sequence,
       const TuplePointsToAnalysis& points_to_analysis,
       const LogicalBuffer::SizeFunction& size_fn,
-      const tensorflow::gtl::FlatSet<const LogicalBuffer*>* buffers_to_assign =
-          nullptr);
+      const Options& options = Options());
 
   // Same as above, but runs on a single computation. The 'instruction_sequence'
   // must contain a topologically-consistent total ordering of all instructions
@@ -96,8 +110,7 @@ class HeapSimulator {
       const std::vector<const HloInstruction*>& instruction_sequence,
       const TuplePointsToAnalysis& points_to_analysis,
       const LogicalBuffer::SizeFunction& size_fn,
-      const tensorflow::gtl::FlatSet<const LogicalBuffer*>* buffers_to_assign =
-          nullptr);
+      const Options& options = Options());
 
  private:
   // If 'module_sequence' is non-null, it is used to find kCall and kWhile
@@ -105,8 +118,7 @@ class HeapSimulator {
   // be run recursively. I.e. the simulation is run over the whole module.
   HeapSimulator(
       std::unique_ptr<HeapAlgorithm> algorithm,
-      const LogicalBuffer::SizeFunction& size_fn,
-      const tensorflow::gtl::FlatSet<const LogicalBuffer*>* buffers_to_assign,
+      const LogicalBuffer::SizeFunction& size_fn, const Options& options,
       const SequentialHloOrdering::HloModuleSequence* module_sequence);
   ~HeapSimulator();
 
@@ -130,7 +142,7 @@ class HeapSimulator {
   const std::unique_ptr<HeapAlgorithm> no_fragmentation_stats_;
   const std::unique_ptr<HeapAlgorithm> algorithm_;
   const LogicalBuffer::SizeFunction size_fn_;
-  const tensorflow::gtl::FlatSet<const LogicalBuffer*>* buffers_to_assign_;
+  const Options options_;
   const SequentialHloOrdering::HloModuleSequence* module_sequence_;
 
   // In addition to Alloc and Free, the heap simulator exposes a concept of
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.cc b/tensorflow/compiler/xla/service/hlo_evaluator.cc
index 2112cf57c727c085164b104c2217a035471d398b..ab604064d5e34a1e21f3209cc4d27fc05a9ca057 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator.cc
+++ b/tensorflow/compiler/xla/service/hlo_evaluator.cc
@@ -40,9 +40,11 @@ limitations under the License.
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/window_util.h"
 #include "tensorflow/core/lib/core/bitmap.h"
+#include "tensorflow/core/lib/core/casts.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/gtl/optional.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/types.h"
@@ -166,6 +168,34 @@ StatusOr<std::unique_ptr<Literal>> ElementWiseUnaryOpImpl(
   return std::move(result);
 }
 
+// For one particular placement of a window in a base shape (the placement is
+// represented as `window_count_index`), iterates inside the window. Translates
+// the window index into base index. If the base index is within bound, call `f`
+// with the base index.
+void IterateThroughWindow(
+    const Shape& window_shape, const Window& window, const Shape& base_shape,
+    const tensorflow::gtl::ArraySlice<int64>& window_count_index,
+    const std::function<void(const std::vector<int64>&)>& f) {
+  const int64 rank = ShapeUtil::Rank(base_shape);
+  DimensionVector window_index(rank);
+  std::fill(window_index.begin(), window_index.end(), 0);
+  do {
+    std::vector<int64> base_index(rank);
+    bool out_of_bound = false;
+    for (int64 i = 0; i < rank; ++i) {
+      base_index[i] = window_count_index[i] * window.dimensions(i).stride() +
+                      window_index[i] - window.dimensions(i).padding_low();
+      if (base_index[i] < 0 || base_index[i] >= base_shape.dimensions(i)) {
+        out_of_bound = true;
+        break;
+      }
+    }
+    if (!out_of_bound) {
+      f(base_index);
+    }
+  } while (IndexUtil::BumpIndices(window_shape, &window_index));
+}
+
 }  // namespace
 
 template <typename ReturnT, typename ElementwiseT>
@@ -1420,6 +1450,111 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
     return Status::OK();
   }
 
+  Status HandleSelectAndScatter(HloInstruction* select_and_scatter) override {
+    auto operand = select_and_scatter->operand(0);
+    auto source = select_and_scatter->operand(1);
+    const Window& window = select_and_scatter->window();
+
+    const Literal& init_literal =
+        parent_->GetEvaluatedLiteralFor(select_and_scatter->operand(2));
+    TF_RET_CHECK(ShapeUtil::IsScalar(init_literal.shape()));
+    auto init_scalar = init_literal.Get<ReturnT>({});
+
+    auto result = Literal::CreateFromShape(select_and_scatter->shape());
+
+    // Initialize result array with the init value.
+    TF_RETURN_IF_ERROR(result->Populate<ReturnT>(
+        [&](tensorflow::gtl::ArraySlice<int64> output_index) {
+          return init_scalar;
+        }));
+
+    std::vector<int64> window_dimension_sizes;
+    for (const auto& window_dimension : window.dimensions()) {
+      window_dimension_sizes.push_back(window_dimension.size());
+    }
+    const Shape window_shape = ShapeUtil::MakeShape(
+        operand->shape().element_type(), window_dimension_sizes);
+
+    HloComputation* select = select_and_scatter->select();
+    HloComputation* scatter = select_and_scatter->scatter();
+
+    const Literal& operand_literal = parent_->GetEvaluatedLiteralFor(operand);
+    const Literal& source_literal = parent_->GetEvaluatedLiteralFor(source);
+
+    int64 rank = ShapeUtil::Rank(operand_literal.shape());
+
+    HloEvaluator embedded_evaluator;
+    DimensionVector source_index(rank);
+
+    std::fill(source_index.begin(), source_index.end(), 0);
+    do {
+      // For each element in `source`, we place a window in `operand`. For each
+      // window placement, we iterate inside the window twice:
+      //
+      // 1. Find the selected index by applying `select` function to all
+      // elements. E.g., If the `select` function is GreaterEqual, the first
+      // iteration through the window finds the biggest value and returns its
+      // index.
+      //
+      // 2. Using the selected index, scatter value from `source` to result. We
+      // do this by iterating through the window, and compare each index with
+      // the selected index.
+      tensorflow::gtl::optional<ReturnT> selected_val;
+      tensorflow::gtl::optional<std::vector<int64>> selected_index;
+
+      IterateThroughWindow(
+          window_shape, window, operand_literal.shape(), source_index,
+          [&](const std::vector<int64>& operand_index) {
+            auto curr_val = operand_literal.Get<ReturnT>(operand_index);
+            if (!selected_val) {
+              selected_val = curr_val;
+              selected_index = operand_index;
+            }
+            const auto curr_val_literal = Literal::CreateR0<ReturnT>(curr_val);
+            const auto selected_val_literal =
+                Literal::CreateR0<ReturnT>(*selected_val);
+
+            const std::vector<const Literal*> args = {
+                curr_val_literal.get(), selected_val_literal.get()};
+            std::unique_ptr<Literal> computed_result =
+                embedded_evaluator.Evaluate<const Literal*>(*select, args)
+                    .ConsumeValueOrDie();
+            bool selected = computed_result->Get<bool>({});
+            if (selected) {
+              selected_val = curr_val;
+              selected_index = operand_index;
+            }
+            embedded_evaluator.ResetVisitStates();
+          });
+
+      IterateThroughWindow(
+          window_shape, window, operand_literal.shape(), source_index,
+          [&](const std::vector<int64>& operand_index) {
+            if (std::equal(operand_index.begin(), operand_index.end(),
+                           selected_index->begin())) {
+              auto source = source_literal.Get<ReturnT>(source_index);
+              auto scattered = result->Get<ReturnT>(operand_index);
+              const auto source_literal = Literal::CreateR0<ReturnT>(source);
+              const auto scattered_literal =
+                  Literal::CreateR0<ReturnT>(scattered);
+
+              const std::vector<const Literal*> args = {
+                  source_literal.get(), scattered_literal.get()};
+              std::unique_ptr<Literal> computed_result =
+                  embedded_evaluator.Evaluate<const Literal*>(*scatter, args)
+                      .ConsumeValueOrDie();
+              result->Set(operand_index, computed_result->Get<ReturnT>({}));
+              // Clear visit states so that the we can use the evaluator again
+              // on the same computation.
+              embedded_evaluator.ResetVisitStates();
+            }
+          });
+    } while (IndexUtil::BumpIndices(source->shape(), &source_index));
+
+    parent_->evaluated_[select_and_scatter] = std::move(result);
+    return Status::OK();
+  }
+
   Status HandleReduceWindow(HloInstruction* reduce_window) override {
     auto operand = reduce_window->operand(0);
     const Window& window = reduce_window->window();
@@ -1468,39 +1603,28 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
           std::fill(window_index.begin(), window_index.end(), 0);
           std::fill(operand_index.begin(), operand_index.end(), 0);
 
-          do {
-            bool out_of_bound = false;
-            for (int i = 0; i < operand_index.size(); ++i) {
-              operand_index[i] =
-                  output_index[i] * window.dimensions(i).stride() +
-                  window_index[i] - window.dimensions(i).padding_low();
-              if (operand_index[i] < 0 ||
-                  operand_index[i] >= operand_literal.shape().dimensions(i)) {
-                out_of_bound = true;
-                break;
-              }
-            }
-            if (!out_of_bound) {
-              auto curr_val = operand_literal.Get<ReturnT>(operand_index);
-
-              // Evaluate computation with specified literal operands.
-              const auto curr_val_literal =
-                  Literal::CreateR0<ReturnT>(curr_val);
-              const auto result_val_literal =
-                  Literal::CreateR0<ReturnT>(result_val);
-              const std::vector<const Literal*> args = {
-                  curr_val_literal.get(), result_val_literal.get()};
-              std::unique_ptr<Literal> computed_result =
-                  embedded_evaluator.Evaluate<const Literal*>(*function, args)
-                      .ConsumeValueOrDie();
-
-              // Clear visit states so that the we can use the evaluate again on
-              // the same computation.
-              embedded_evaluator.ResetVisitStates();
-
-              result_val = computed_result->Get<ReturnT>({});
-            }
-          } while (IndexUtil::BumpIndices(window_shape, &window_index));
+          IterateThroughWindow(
+              window_shape, window, operand_literal.shape(), output_index,
+              [&](const std::vector<int64>& operand_index) {
+                auto curr_val = operand_literal.Get<ReturnT>(operand_index);
+
+                // Evaluate computation with specified literal operands.
+                const auto curr_val_literal =
+                    Literal::CreateR0<ReturnT>(curr_val);
+                const auto result_val_literal =
+                    Literal::CreateR0<ReturnT>(result_val);
+                const std::vector<const Literal*> args = {
+                    curr_val_literal.get(), result_val_literal.get()};
+                std::unique_ptr<Literal> computed_result =
+                    embedded_evaluator.Evaluate<const Literal*>(*function, args)
+                        .ConsumeValueOrDie();
+
+                // Clear visit states so that the we can use the evaluate again
+                // on the same computation.
+                embedded_evaluator.ResetVisitStates();
+
+                result_val = computed_result->Get<ReturnT>({});
+              });
 
           return result_val;
         }));
@@ -1583,6 +1707,115 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
     return HandleCos<ElementwiseT>(cos);
   }
 
+  template <typename NativeT, typename std::enable_if<std::is_same<
+                                  float, NativeT>::value>::type* = nullptr>
+  Status HandleReducePrecision(HloInstruction* reduce_precision) {
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[reduce_precision],
+        ElementWiseUnaryOp(reduce_precision, [reduce_precision](
+                                                 ElementwiseT elem) {
+          uint32_t value_as_int = tensorflow::bit_cast<uint32_t>(elem);
+          const uint32_t mantissa_bits = reduce_precision->mantissa_bits();
+          const uint32_t exponent_bits = reduce_precision->exponent_bits();
+
+          // Code is based on the CPU/GPU implementation in LLVM-emitting code.
+          //
+          // Bits in float type:
+          //   mantissa : bits [0:22]
+          //   exponent : bits [23:30]
+          //   sign     : bits [31]
+          if (mantissa_bits < 23) {
+            const uint32_t last_mantissa_bit_mask = 1u << (23 - mantissa_bits);
+
+            // Compute rounding bias for round-to-nearest with ties to even.
+            // This is equal to a base value of 0111... plus one bit if the last
+            // remaining mantissa bit is 1.
+            const uint32_t base_rounding_bias =
+                (last_mantissa_bit_mask >> 1) - 1;
+            const uint32_t x_last_mantissa_bit =
+                (value_as_int & last_mantissa_bit_mask) >> (23 - mantissa_bits);
+            const uint32_t x_rounding_bias =
+                x_last_mantissa_bit + base_rounding_bias;
+
+            // Add rounding bias, and mask out truncated bits.  Note that the
+            // case where adding the rounding bias overflows into the exponent
+            // bits is correct; the non-masked mantissa bits will all be zero,
+            // and the exponent will be incremented by one.
+            const uint32_t truncation_mask = ~(last_mantissa_bit_mask - 1);
+            value_as_int = value_as_int + x_rounding_bias;
+            value_as_int = value_as_int & truncation_mask;
+          }
+          if (exponent_bits < 8) {
+            // Masks for f32 values.
+            const uint32_t f32_sign_bit_mask = 1u << 31;
+            const uint32_t f32_exp_bits_mask = 0xffu << 23;
+
+            // An exponent of 2^(n-1)-1 -- that is, 0111... with the zero in the
+            // most- significant bit -- is equal to 1.0f for all exponent sizes.
+            // Adding 2^(n-1)-1 to this gives us the highest non-infinite
+            // exponent for a bit- size of n, and subtracting 2^(n-1)-1 from
+            // this gives us the lowest' exponent (corresponding to 0.0f).
+            //
+            // Thus, the f32 exponent corresponding to the highest non-infinite
+            // exponent for a bit size of n is (2^7-1) + 2^(n-1)-1, and the f32
+            // exponent corresponding to the lowest exponent for a bit size of n
+            // is (2^7-1) - 2^(n-1)-1.
+            //
+            // Note that we have already checked that exponents_bits >= 1.
+            const uint32_t f32_exponent_bias = (1 << 7) - 1;
+            const uint32_t reduced_exponent_bias =
+                (1 << (exponent_bits - 1)) - 1;
+            const uint32_t reduced_max_exponent =
+                f32_exponent_bias + reduced_exponent_bias;
+            const uint32_t reduced_min_exponent =
+                f32_exponent_bias - reduced_exponent_bias;
+
+            // Do we overflow or underflow?
+            const uint32_t x_exponent = value_as_int & f32_exp_bits_mask;
+            const bool x_overflows = x_exponent > (reduced_max_exponent << 23);
+            const bool x_underflows =
+                x_exponent <= (reduced_min_exponent << 23);
+
+            // Compute appropriately-signed values of zero and infinity.
+            const uint32_t x_signed_zero = value_as_int & f32_sign_bit_mask;
+            const uint32_t x_signed_inf = x_signed_zero | f32_exp_bits_mask;
+
+            // Force to zero or infinity if overflow or underflow.  (Note that
+            // this truncates all denormal values to zero, rather than rounding
+            // them.)
+            value_as_int = x_overflows ? x_signed_inf : value_as_int;
+            value_as_int = x_underflows ? x_signed_zero : value_as_int;
+          }
+
+          float reduced_result = tensorflow::bit_cast<float>(value_as_int);
+          if (std::isnan(elem)) {
+            reduced_result = mantissa_bits > 0
+                                 ? elem
+                                 : std::numeric_limits<float>::infinity();
+          }
+          return reduced_result;
+        }));
+    return Status::OK();
+  }
+
+  template <typename NativeT, typename std::enable_if<std::is_same<
+                                  double, NativeT>::value>::type* = nullptr>
+  Status HandleReducePrecision(HloInstruction* reduce_precision) {
+    return InvalidArgument("Double not supported for reduce precision");
+  }
+
+  template <
+      typename NativeT,
+      typename std::enable_if<std::is_integral<NativeT>::value ||
+                              is_complex_t<NativeT>::value>::type* = nullptr>
+  Status HandleReducePrecision(HloInstruction* reduce_precision) {
+    return InvalidArgument("Unsupported type for reduce precision");
+  }
+
+  Status HandleReducePrecision(HloInstruction* reduce_precision) override {
+    return HandleReducePrecision<ElementwiseT>(reduce_precision);
+  }
+
  private:
   template <typename IndexT>
   StatusOr<std::unique_ptr<Literal>> DynamicSlice(
diff --git a/tensorflow/compiler/xla/service/hlo_execution_profile.cc b/tensorflow/compiler/xla/service/hlo_execution_profile.cc
index 849aac0b12b096e5f7c4a5c441fc019c48a27060..f0df93b61d29c1535d8a89fbd65e669de5b43729 100644
--- a/tensorflow/compiler/xla/service/hlo_execution_profile.cc
+++ b/tensorflow/compiler/xla/service/hlo_execution_profile.cc
@@ -40,83 +40,75 @@ HloProfileIndexMap::HloProfileIndexMap(const HloModule& module) {
   }
 }
 
-std::unique_ptr<HloProfilePrinter> CreateHloProfilePrinter(
+std::unique_ptr<HloProfilePrinterData> CreateHloProfilePrinterData(
     const HloProfileIndexMap& hlo_profile_index_map,
     const HloCostAnalysis& cost_analysis) {
-  using HloComputationInfo = HloProfilePrinter::HloComputationInfo;
-  using HloInstructionInfo = HloProfilePrinter::HloInstructionInfo;
-
-  HloComputationInfo* computation_infos =
-      new HloComputationInfo[hlo_profile_index_map.computation_count()];
-
-  // There are two "indices" in play here.  The first one is the index of the
-  // HloComputationInfo or HloInstructionInfo in the array that contains said
-  // HloComputationInfo or HloInstructionInfo.  The second index is the index of
-  // the HloComputationInfo or HloInstructionInfo in the profile counters array,
-  // as decided by hlo_profile_index_map.  The latter index is always referred
-  // to as "profile_index".
-
-  size_t computation_index_in_static_data = 0;
-  size_t max_profile_index = hlo_profile_index_map.total_count();
-  for (const auto& pair : hlo_profile_index_map.computation_to_profile_idx()) {
-    CHECK_LT(pair.second, max_profile_index);
+  using HloComputationInfo = HloProfilePrinterData::HloComputationInfo;
+  using HloInstructionInfo = HloProfilePrinterData::HloInstructionInfo;
+
+  size_t profile_counters_size = hlo_profile_index_map.total_count();
+
+  std::unique_ptr<HloProfilePrinterData> profile_printer_data =
+      MakeUnique<HloProfilePrinterData>();
+  profile_printer_data->set_profile_counters_size(profile_counters_size);
+  profile_printer_data->mutable_computation_infos()->Reserve(
+      hlo_profile_index_map.computation_count());
+
+  const auto& computation_to_profile_idx_map =
+      hlo_profile_index_map.computation_to_profile_idx();
+
+  // computation_to_profile_idx_map's order is not deterministic so create a
+  // deterministic computation_and_profile_idx_list so that we end up with a
+  // deterministic HloProfilePrinterData protobuf.
+
+  std::vector<std::pair<const HloComputation*, int64>>
+      computation_and_profile_idx_list(computation_to_profile_idx_map.begin(),
+                                       computation_to_profile_idx_map.end());
+
+  // The profile indices were computed deterministically in
+  // HloProfileIndexMap::HloProfileIndexMap.
+  c_sort(computation_and_profile_idx_list,
+         [](const std::pair<const HloComputation*, int64>& left,
+            const std::pair<const HloComputation*, int64>& right) {
+           return left.second < right.second;
+         });
+
+  for (const auto& pair : computation_and_profile_idx_list) {
+    CHECK_LT(pair.second, profile_counters_size);
     const HloComputation* computation = pair.first;
-    size_t current_computation_index = computation_index_in_static_data++;
     HloComputationInfo* computation_info =
-        &computation_infos[current_computation_index];
+        profile_printer_data->add_computation_infos();
 
-    computation_info->name = strdup(computation->name().c_str());
-    computation_info->profile_index = pair.second;
-    computation_info->instructions =
-        new HloInstructionInfo[computation->instruction_count()];
-    computation_info->instructions_size = computation->instruction_count();
+    computation_info->set_name(computation->name());
+    computation_info->set_profile_index(pair.second);
+    computation_info->mutable_instruction_infos()->Reserve(
+        computation->instruction_count());
 
-    size_t instruction_index_in_static_data = 0;
     for (const HloInstruction* hlo : computation->instructions()) {
-      HloProfilePrinter::HloInstructionInfo* instruction_info =
-          &computation_info->instructions[instruction_index_in_static_data++];
-      instruction_info->long_name = strdup(hlo->ToString().c_str());
-      instruction_info->short_name = strdup(
-          hlo->ToString(HloPrintOptions().set_compact_operands(true)).c_str());
-      instruction_info->category = strdup(hlo->ToCategory().c_str());
-      instruction_info->flop_count = cost_analysis.flop_count(*hlo);
-      instruction_info->transcendental_count =
-          cost_analysis.transcendental_count(*hlo);
-      instruction_info->bytes_accessed = cost_analysis.bytes_accessed(*hlo);
-      instruction_info->optimal_seconds = cost_analysis.optimal_seconds(*hlo);
-      instruction_info->profile_index =
-          hlo_profile_index_map.GetProfileIndexFor(*hlo);
-      CHECK_LT(instruction_info->profile_index, max_profile_index);
+      HloInstructionInfo* instruction_info =
+          computation_info->add_instruction_infos();
+      instruction_info->set_long_name(hlo->ToString());
+      instruction_info->set_short_name(
+          hlo->ToString(HloPrintOptions().set_compact_operands(true)));
+      instruction_info->set_category(hlo->ToCategory());
+      instruction_info->set_flop_count(cost_analysis.flop_count(*hlo));
+      instruction_info->set_transcendental_count(
+          cost_analysis.transcendental_count(*hlo));
+      instruction_info->set_bytes_accessed(cost_analysis.bytes_accessed(*hlo));
+      instruction_info->set_optimal_seconds(
+          cost_analysis.optimal_seconds(*hlo));
+      instruction_info->set_profile_index(
+          hlo_profile_index_map.GetProfileIndexFor(*hlo));
     }
   }
 
-  auto deleter = [](HloProfilePrinter::HloComputationInfo* computation_infos,
-                    int64 computation_infos_size) {
-    for (int64 i = 0; i < computation_infos_size; i++) {
-      HloInstructionInfo* instruction_infos = computation_infos[i].instructions;
-      for (int64 j = 0; j < computation_infos[i].instructions_size; j++) {
-        // We can't make instruction_infos[j].long_name etc. non-const pointers
-        // since they may point into static storage, so we have a const_cast
-        // here.
-        free(const_cast<char*>(instruction_infos[j].long_name));
-        free(const_cast<char*>(instruction_infos[j].short_name));
-        free(const_cast<char*>(instruction_infos[j].category));
-      }
-      delete[] instruction_infos;
-      free(const_cast<char*>(computation_infos[i].name));
-    }
-    delete[] computation_infos;
-  };
-
-  return MakeUnique<HloProfilePrinter>(
-      computation_infos, hlo_profile_index_map.computation_count(),
-      /*profile_counters_size=*/max_profile_index, deleter);
+  return profile_printer_data;
 }
 
 HloExecutionProfile::HloExecutionProfile(
-    const HloProfilePrinter* hlo_profile_printer,
+    const HloProfilePrinterData* hlo_profile_printer_data,
     const HloProfileIndexMap* hlo_profile_index_map)
-    : hlo_profile_printer_(*hlo_profile_printer),
+    : hlo_profile_printer_data_(*hlo_profile_printer_data),
       hlo_profile_index_map_(*hlo_profile_index_map),
       profile_counters_(
           /*count*/ hlo_profile_index_map_.total_count(),
diff --git a/tensorflow/compiler/xla/service/hlo_execution_profile.h b/tensorflow/compiler/xla/service/hlo_execution_profile.h
index 1a6b069609cb58bcc9659b4457453758a277bc0e..6fb91b9bef9d1df82b8806ce79cc147823edeb3d 100644
--- a/tensorflow/compiler/xla/service/hlo_execution_profile.h
+++ b/tensorflow/compiler/xla/service/hlo_execution_profile.h
@@ -77,8 +77,8 @@ class HloProfileIndexMap {
   std::unordered_map<const HloComputation*, int64> computation_to_profile_idx_;
 };
 
-// Create an instance of `HloProfilePrinter` that owns its memory.
-std::unique_ptr<HloProfilePrinter> CreateHloProfilePrinter(
+// Create an instance of `HloProfilePrinterData`.
+std::unique_ptr<HloProfilePrinterData> CreateHloProfilePrinterData(
     const HloProfileIndexMap& hlo_profile_index_map,
     const HloCostAnalysis& cost_analysis);
 
@@ -90,7 +90,7 @@ class HloExecutionProfile {
  public:
   using DeviceDescription = perftools::gputools::DeviceDescription;
 
-  HloExecutionProfile(const HloProfilePrinter* hlo_profile_printer,
+  HloExecutionProfile(const HloProfilePrinterData* hlo_profile_printer_data,
                       const HloProfileIndexMap* hlo_profile_index_map);
 
   // Record how many cycles this HLO took to execute.
@@ -117,11 +117,10 @@ class HloExecutionProfile {
   // debugging; e.g. emits cycle counts, execution time at the nominal device
   // frequency, and the effective throughput given the provided cost_analysis
   // for the operations in a given computation. Returns an empty string if it
-  // wasn't possible to generate a printable version. cost_analysis should be a
-  // clean analysis that can be used to visit the computation.
+  // wasn't possible to generate a printable version.
   string ToString(const DeviceDescription& device_description) const {
-    return hlo_profile_printer_.ToString(profile_counters_.data(),
-                                         device_description.clock_rate_ghz());
+    return PrintHloProfile(hlo_profile_printer_data_, profile_counters_.data(),
+                           device_description.clock_rate_ghz());
   }
 
   std::vector<int64>* mutable_profile_counters() { return &profile_counters_; }
@@ -130,7 +129,7 @@ class HloExecutionProfile {
   }
 
  private:
-  const HloProfilePrinter& hlo_profile_printer_;
+  const HloProfilePrinterData& hlo_profile_printer_data_;
   const HloProfileIndexMap& hlo_profile_index_map_;
 
   // Stores per-Hlo profile counters.  This is the only thing that changes when
diff --git a/tensorflow/compiler/xla/service/hlo_execution_profile_test.cc b/tensorflow/compiler/xla/service/hlo_execution_profile_test.cc
index b1e6729e2bccad4bdbe075a635d8a9b1ede6fecb..a0cb28246d3be541e798e85552436f64a3521f22 100644
--- a/tensorflow/compiler/xla/service/hlo_execution_profile_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_execution_profile_test.cc
@@ -73,8 +73,8 @@ TEST_F(HloExecutionProfileTest, Basic) {
 
   HloCostAnalysis cost_analysis(shape_size_function);
   HloProfileIndexMap profile_index_map(*hlo_module);
-  std::unique_ptr<HloProfilePrinter> profile_printer =
-      CreateHloProfilePrinter(profile_index_map, cost_analysis);
+  std::unique_ptr<HloProfilePrinterData> profile_printer =
+      CreateHloProfilePrinterData(profile_index_map, cost_analysis);
   HloExecutionProfile execution_profile(profile_printer.get(),
                                         &profile_index_map);
 
diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
index f7c6435002d278d93cc0814041a7e055e5573e3e..44fcd36370dcd0cf77601aa1cd2b92810947bd5f 100644
--- a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
+++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
@@ -1063,14 +1063,19 @@ string HloDotDumper::GetInstructionNodeExtraInfo(const HloInstruction* instr) {
   // node -- there the shape and layout is present in the output node.
   if (instr->opcode() != HloOpcode::kFusion ||
       !ShouldShowFusionSubcomputation(instr)) {
-    string instr_shape = ShapeUtil::HumanString(instr->shape());
-
-    // Show layout of non-tuple shapes with more than one dimension.
-    if (LayoutUtil::HasLayout(instr->shape()) &&
-        instr->shape().dimensions_size() > 1 &&
-        !ShapeUtil::IsTuple(instr->shape())) {
-      StrAppend(&instr_shape, "{",
-                Join(LayoutUtil::MinorToMajor(instr->shape()), ","), "}");
+    // Show layout of instructions with more than one dimension.  Don't show
+    // layout on tuples or tensors with just one dimension (which only have one
+    // possible layout) to avoid visual noise.
+    bool shape_is_multidim = false;
+    ShapeUtil::ForEachSubshape(instr->shape(),
+                               [&](const Shape& s, const ShapeIndex&) {
+                                 shape_is_multidim |= s.dimensions_size() > 1;
+                               });
+    string instr_shape;
+    if (instr->opcode() != HloOpcode::kTuple && shape_is_multidim) {
+      instr_shape = ShapeUtil::HumanStringWithLayout(instr->shape());
+    } else {
+      instr_shape = ShapeUtil::HumanString(instr->shape());
     }
 
     // Some instructions have giant tuples as their shapes, so truncate the
@@ -1421,9 +1426,11 @@ void DumpText(const HloModule& module, const string& label,
 
 string MaybeDumpHloModule(const HloModule& module, const string& label,
                           const HloExecutionProfile* profile) {
-  VLOG(2) << "MaybeDumpHloModule called on module " << module.name();
-  string graph_url;
   const DebugOptions& debug_options = module.config().debug_options();
+  VLOG(2) << "MaybeDumpHloModule called on module " << module.name()
+          << " with generate_hlo_graph regex \""
+          << debug_options.xla_generate_hlo_graph() << "\"";
+  string graph_url;
   if (!debug_options.xla_generate_hlo_graph().empty() &&
       RE2::PartialMatch(module.name(),
                         debug_options.xla_generate_hlo_graph())) {
diff --git a/tensorflow/compiler/xla/service/hlo_matchers.cc b/tensorflow/compiler/xla/service/hlo_matchers.cc
index 4255d6086625dfb9a045e4431e968a5ee0106ac7..bc74c4bc10cad20eab20b5caf8550b17048a5276 100644
--- a/tensorflow/compiler/xla/service/hlo_matchers.cc
+++ b/tensorflow/compiler/xla/service/hlo_matchers.cc
@@ -102,6 +102,36 @@ bool HloGetTupleElementMatcher::MatchAndExplain(
   return true;
 }
 
+void HloCustomCallMatcher::DescribeTo(std::ostream* os) const {
+  HloMatcher::DescribeTo(os);
+  *os << " with call target that ";
+  call_target_matcher_.DescribeTo(os);
+}
+
+bool HloCustomCallMatcher::MatchAndExplain(
+    const HloInstruction* instruction,
+    ::testing::MatchResultListener* listener) const {
+  if (!HloMatcher::MatchAndExplain(instruction, listener)) {
+    return false;
+  }
+  ::testing::StringMatchResultListener sub_listener;
+  bool result = ExplainMatchResult(
+      call_target_matcher_, instruction->custom_call_target(), &sub_listener);
+  if (sub_listener.str().empty()) {
+    sub_listener << " that ";
+
+    std::stringstream desc_stream;
+    if (result) {
+      call_target_matcher_.DescribeTo(&desc_stream);
+    } else {
+      call_target_matcher_.DescribeNegationTo(&desc_stream);
+    }
+    sub_listener << desc_stream.str();
+  }
+  *listener << "custom-call with call target" << sub_listener.str();
+  return result;
+}
+
 }  // namespace testing
 
 void PrintTo(const HloInstruction* inst, ::std::ostream* os) {
diff --git a/tensorflow/compiler/xla/service/hlo_matchers.h b/tensorflow/compiler/xla/service/hlo_matchers.h
index 9206cdac05fbc1d6051617ab4b0f3016f19e3c90..103f04a2cb7a1a5ae877d8bf259692f7cbed3408 100644
--- a/tensorflow/compiler/xla/service/hlo_matchers.h
+++ b/tensorflow/compiler/xla/service/hlo_matchers.h
@@ -56,8 +56,8 @@ class HloParameterMatcher : public HloMatcher {
 // index to match.
 class HloGetTupleElementMatcher : public HloMatcher {
  public:
-  explicit HloGetTupleElementMatcher(
-      ::testing::Matcher<const HloInstruction*> operand, int64 tuple_index)
+  HloGetTupleElementMatcher(::testing::Matcher<const HloInstruction*> operand,
+                            int64 tuple_index)
       : HloMatcher(HloOpcode::kGetTupleElement, /*operands=*/{operand}),
         tuple_index_(tuple_index) {}
 
@@ -68,6 +68,24 @@ class HloGetTupleElementMatcher : public HloMatcher {
   int64 tuple_index_;
 };
 
+// Custom matcher for custom-call instructions, which accepts a matcher for its
+// call target.
+class HloCustomCallMatcher : public HloMatcher {
+ public:
+  HloCustomCallMatcher(
+      ::testing::Matcher<string> call_target_matcher,
+      std::vector<::testing::Matcher<const HloInstruction*>> operands)
+      : HloMatcher(HloOpcode::kCustomCall, operands),
+        call_target_matcher_(call_target_matcher) {}
+
+  bool MatchAndExplain(const HloInstruction* instruction,
+                       ::testing::MatchResultListener* listener) const override;
+  void DescribeTo(std::ostream* os) const override;
+
+ private:
+  ::testing::Matcher<string> call_target_matcher_;
+};
+
 // HloInstruction* matchers for opcode and operands. Example:
 //   namespace op = xla::opcode_matchers;
 //   EXPECT_THAT(instruction,
@@ -94,7 +112,6 @@ HLO_MATCHER(Convert);
 HLO_MATCHER(Convolution);
 HLO_MATCHER(Copy);
 HLO_MATCHER(CrossReplicaSum);
-HLO_MATCHER(CustomCall);
 HLO_MATCHER(Divide);
 HLO_MATCHER(Dot);
 HLO_MATCHER(DynamicSlice);
@@ -184,6 +201,36 @@ inline ::testing::Matcher<const ::xla::HloInstruction*> GetTupleElement() {
       new ::xla::testing::HloMatcher(HloOpcode::kGetTupleElement, {}));
 }
 
+// - CustomCall(T, operand1, ..., operandN) matches a CustomCall with call
+//   target T and the given operands.
+//
+// - CustomCall(operand1, ..., operandN) matches any CustomCall HLO with the
+//   given operands.
+//
+// - CustomCall() matches any CustomCall HLO at all.
+template <typename... M>
+inline ::testing::Matcher<const ::xla::HloInstruction*> CustomCall(
+    ::testing::Matcher<string> call_target_matcher, M... operands) {
+  return ::testing::MakeMatcher(new ::xla::testing::HloCustomCallMatcher(
+      call_target_matcher, {operands...}));
+}
+// This overload of CustomCall(A, B, C, ...) exists iff A is not convertible to
+// ::testing::Matcher<string>.  In that case, we want to prefer the overload
+// above.
+template <typename FirstM, typename... M,
+          typename Dummy = typename std::enable_if<
+              !std::is_convertible<FirstM, ::testing::Matcher<string>>::value,
+              void>::type*>
+inline ::testing::Matcher<const ::xla::HloInstruction*> CustomCall(
+    FirstM operands_first, M... operands_rest) {
+  return ::testing::MakeMatcher(new ::xla::testing::HloMatcher(
+      HloOpcode::kCustomCall, {operands_first, operands_rest...}));
+}
+inline ::testing::Matcher<const ::xla::HloInstruction*> CustomCall() {
+  return ::testing::MakeMatcher(
+      new ::xla::testing::HloMatcher(HloOpcode::kCustomCall, {}));
+}
+
 #undef HLO_MATCHER
 }  // namespace opcode_matchers
 
diff --git a/tensorflow/compiler/xla/service/hlo_matchers_test.cc b/tensorflow/compiler/xla/service/hlo_matchers_test.cc
index 1465d1cacdc971a04c620bc48bed33239a67a955..1c21703a45e11914854153bc14fabd85e9ea57f2 100644
--- a/tensorflow/compiler/xla/service/hlo_matchers_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_matchers_test.cc
@@ -23,6 +23,12 @@ using ::testing::Eq;
 namespace xla {
 namespace {
 
+string DescribeHloMatcher(const ::testing::Matcher<const HloInstruction*>& m) {
+  std::stringstream ss;
+  m.DescribeTo(&ss);
+  return ss.str();
+}
+
 template <typename M, typename T>
 string Explain(const T& t, const M& m) {
   ::testing::StringMatchResultListener listener;
@@ -67,5 +73,32 @@ TEST(HloMatchersTest, Test) {
          "add"));
 }
 
+TEST(HloMatchersTest, CustomCallMatcher) {
+  auto c1 = HloInstruction::CreateConstant(Literal::CreateR1<float>({1, 2, 3}));
+  auto c2 = HloInstruction::CreateConstant(Literal::CreateR1<int32>({1, 2, 3}));
+  auto call = HloInstruction::CreateCustomCall(
+      ShapeUtil::MakeShape(F32, {1}), {c1.get(), c2.get()}, "foo_target");
+
+  EXPECT_THAT(call.get(), op::CustomCall());
+  EXPECT_THAT(call.get(), op::CustomCall(c1.get(), c2.get()));
+  EXPECT_THAT(call.get(), op::CustomCall("foo_target"));
+  EXPECT_THAT(call.get(), op::CustomCall("foo_target", c1.get(), c2.get()));
+  EXPECT_THAT(call.get(), op::CustomCall(::testing::StartsWith("foo")));
+  EXPECT_THAT(call.get(),
+              op::CustomCall(::testing::Not(::testing::StartsWith("bar"))));
+
+  // Wrong number of operands.
+  EXPECT_THAT(call.get(), ::testing::Not(op::CustomCall(c1.get())));
+
+  // Call target does not match.
+  EXPECT_THAT(call.get(),
+              ::testing::Not(op::CustomCall(::testing::StartsWith("bar"))));
+
+  EXPECT_THAT(Explain(call.get(), op::CustomCall("bar")),
+              R"(custom-call with call target that isn't equal to "bar")");
+  EXPECT_THAT(DescribeHloMatcher(op::CustomCall("foo_target")),
+              R"(custom-call with call target that is equal to "foo_target")");
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_module.cc b/tensorflow/compiler/xla/service/hlo_module.cc
index 58bb94221149c9a8b550add900dff52a53565985..60270b0595dcfca8f1fcea5ab0914428880f35b5 100644
--- a/tensorflow/compiler/xla/service/hlo_module.cc
+++ b/tensorflow/compiler/xla/service/hlo_module.cc
@@ -38,12 +38,16 @@ HloModule::HloModule(const string& name,
     : name_(NameUniquer::GetSanitizedName(name)),
       config_(config),
       has_entry_computation_handle_(true),
-      entry_computation_handle_(entry_computation_handle) {}
+      entry_computation_handle_(entry_computation_handle),
+      unique_id_(next_unique_module_id_++) {}
 
 HloModule::HloModule(const string& name)
-    : name_(NameUniquer::GetSanitizedName(name)) {}
+    : name_(NameUniquer::GetSanitizedName(name)),
+      unique_id_(next_unique_module_id_++) {}
 HloModule::HloModule(const string& name, const HloModuleConfig& config)
-    : name_(NameUniquer::GetSanitizedName(name)), config_(config) {}
+    : name_(NameUniquer::GetSanitizedName(name)),
+      config_(config),
+      unique_id_(next_unique_module_id_++) {}
 
 HloComputation* HloModule::AddComputationInternal(
     std::unique_ptr<HloComputation> computation, bool is_entry,
@@ -523,7 +527,15 @@ std::unique_ptr<HloModule> HloModule::Clone(const string& suffix) const {
 
   std::unordered_map<HloComputation*, HloComputation*> clone_map;
   for (auto& computation : computations_) {
-    auto cloned_computation = computation->Clone(suffix);
+    if (computation->IsFusionComputation()) {
+      // Cloning of a fused computation is handled by its fusion instruction.
+      continue;
+    }
+
+    // When cloning a computation, pass in the new module, so that for any
+    // fusion instruction in this computation, the fused computation will be
+    // deep cloned to the new module.
+    auto cloned_computation = computation->Clone(suffix, module.get());
     InsertOrDie(&clone_map, computation.get(), cloned_computation.get());
 
     if (entry_computation_ == computation.get()) {
@@ -537,8 +549,15 @@ std::unique_ptr<HloModule> HloModule::Clone(const string& suffix) const {
     for (auto* instruction : cloned_computation->instructions()) {
       // Rewrite instruction's called_computation to point to the cloned
       // computations.
-      instruction->ReplaceCalledComputations(
-          [&](HloComputation* hlo) { return FindOrDie(clone_map, hlo); });
+      instruction->ReplaceCalledComputations([&](HloComputation* hlo) {
+        if (hlo->IsFusionComputation()) {
+          // Cloning of a fused computation has already been handled when its
+          // fusion instruction is cloned. So this hlo computation is already
+          // the cloned one.
+          return hlo;
+        }
+        return FindOrDie(clone_map, hlo);
+      });
     }
   }
   return module;
@@ -549,4 +568,6 @@ uint64 HloModule::RandomNew64() const {
   return rng_();
 }
 
+/* static */ std::atomic<int> HloModule::next_unique_module_id_(0);
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_module.h b/tensorflow/compiler/xla/service/hlo_module.h
index e377654d024819d00f73f43a70d363bd902dc981..4bfe8d89ce0a285de6d05d4867aaa6b266d78d12 100644
--- a/tensorflow/compiler/xla/service/hlo_module.h
+++ b/tensorflow/compiler/xla/service/hlo_module.h
@@ -16,6 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_MODULE_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_MODULE_H_
 
+#include <atomic>
 #include <list>
 #include <memory>
 #include <random>
@@ -201,6 +202,10 @@ class HloModule {
   // this point are guaranteed to be in the range [0..NumUniqueInstructionIds())
   int NumUniqueInstructionIds() const { return next_unique_id_; }
 
+  // Returns an id that is unique to this module across all modules created over
+  // the lifetime of this process.
+  int unique_id() const { return unique_id_; }
+
  private:
   HloComputation* AddComputationInternal(
       std::unique_ptr<HloComputation> computation, bool is_entry,
@@ -227,6 +232,11 @@ class HloModule {
   NameUniquer computation_name_uniquer_{/*separator=*/"."};
   NameUniquer instruction_name_uniquer_{/*separator=*/"."};
   int next_unique_id_ = 0;
+
+  // Used to keep track of the next unique module id that should be assigned.
+  static std::atomic<int> next_unique_module_id_;
+  // A unique id to label modules with.
+  int unique_id_;
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_module_test.cc b/tensorflow/compiler/xla/service/hlo_module_test.cc
index 0f5d3dccb74e6e3c88e51685392171f940c03596..7f28a804bfec9c2f1bbb5fa08f7dd4e68be14d35 100644
--- a/tensorflow/compiler/xla/service/hlo_module_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_module_test.cc
@@ -105,6 +105,48 @@ TEST_F(HloModuleTest, CloneTest) {
   }
 }
 
+TEST_F(HloModuleTest, CloneHasFusion) {
+  auto module = CreateNewModule();
+
+  // Create the fused computation.
+  HloComputation* fused_computation;
+  {
+    auto b = HloComputation::Builder("Fused");
+    auto x = b.AddInstruction(HloInstruction::CreateParameter(0, r0f32_, "x"));
+    b.AddInstruction(
+        HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd, x, x));
+    fused_computation = module->AddEmbeddedComputation(b.Build());
+  }
+
+  // Create the entry computation.
+  {
+    auto b = HloComputation::Builder("Entry");
+    auto input = b.AddInstruction(
+        HloInstruction::CreateConstant(Literal::CreateR0<float>(42.0f)));
+    b.AddInstruction(
+        HloInstruction::CreateFusion(r0f32_, HloInstruction::FusionKind::kInput,
+                                     /*operands=*/{input}, fused_computation));
+    module->AddEntryComputation(b.Build());
+  }
+
+  auto post_order = module->MakeComputationPostOrder();
+  auto cloned_module = module->Clone("copy");
+  auto post_order_copied = cloned_module->MakeComputationPostOrder();
+
+  EXPECT_EQ(post_order.size(), post_order_copied.size());
+  for (auto origin = post_order.begin(), copied = post_order_copied.begin();
+       origin != post_order.end() && copied != post_order_copied.end();
+       ++origin, ++copied) {
+    if ((*origin)->name() == "Fused") {
+      // Clone of the fused computation is handled when its fusion instruction
+      // is cloned, which always use suffix ".clone".
+      EXPECT_EQ((*origin)->name() + ".clone", (*copied)->name());
+    } else {
+      EXPECT_EQ((*origin)->name() + ".copy", (*copied)->name());
+    }
+  }
+}
+
 TEST_F(HloModuleTest, DiamondComputationsPostOrder) {
   // Create a module with a diamond call graph of computations.
   auto module = CreateNewModule();
@@ -146,6 +188,12 @@ TEST_F(HloModuleTest, LargeConstantToString) {
       module->ToString(HloPrintOptions().set_print_large_constants(true)));
 }
 
+TEST_F(HloModuleTest, UniqueModuleId) {
+  auto module_a = CreateNewModule();
+  auto module_b = CreateNewModule();
+  EXPECT_NE(module_a->unique_id(), module_b->unique_id());
+}
+
 }  // namespace
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc b/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc
index 53bd46a641afcba1b9551895955742e74a9f374b..5120775737bfa32bbb656421216f2b3fbef590ea 100644
--- a/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc
+++ b/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <functional>
 
 #include "tensorflow/compiler/xla/service/hlo_graph_dumper.h"
+#include "tensorflow/compiler/xla/service/hlo_proto_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
@@ -32,12 +33,28 @@ using ::tensorflow::strings::StrCat;
 namespace xla {
 
 namespace {
-void DumpModule(const HloModule& module,
-                const string& message) {
+void DumpModuleGraph(const HloModule& module, const string& message) {
   hlo_graph_dumper::MaybeDumpHloModule(module, message);
   VLOG(3) << "HLO " << message << ":";
   XLA_VLOG_LINES(3, module.ToString());
 }
+
+void DumpModuleProto(const HloModule& module, const string& dump_to,
+                     const string& pipeline_name, const string& pass_name) {
+  static tensorflow::mutex mu(tensorflow::LINKER_INITIALIZED);
+  static auto* const module_id_to_pass_number =
+      new tensorflow::gtl::FlatMap<int64, int64>();
+
+  tensorflow::mutex_lock lock(mu);
+  const int64 pass_number = (*module_id_to_pass_number)[module.unique_id()]++;
+
+  const string mod_name = SanitizeFileName(tensorflow::strings::Printf(
+      "module_%04d.%04lld.%s.after_%s", module.unique_id(), pass_number,
+      pipeline_name.c_str(), pass_name.c_str()));
+
+  TF_QCHECK_OK(protobuf_util::DumpProtoToDirectory(MakeHloProto(module),
+                                                   dump_to, mod_name));
+}
 }  // namespace
 
 StatusOr<bool> HloPassPipeline::Run(HloModule* module) {
@@ -78,6 +95,13 @@ StatusOr<bool> HloPassPipeline::Run(HloModule* module) {
   string message;
   TF_RETURN_IF_ERROR(
       run_invariant_checkers(StrCat("before running pipeline: ", name())));
+  const string xla_dump_per_pass_hlo_proto_to =
+      module->config().debug_options().xla_dump_per_pass_hlo_proto_to();
+  if (!xla_dump_per_pass_hlo_proto_to.empty()) {
+    DumpModuleProto(*module, xla_dump_per_pass_hlo_proto_to, name().ToString(),
+                    "pipeline_start");
+  }
+
   for (auto& pass : passes_) {
     if (disabled_passes.count(pass->name().ToString()) > 0) {
       VLOG(1) << "  Skipping HLO pass " << pass->name()
@@ -90,17 +114,21 @@ StatusOr<bool> HloPassPipeline::Run(HloModule* module) {
     // Emit label containing: "after foo-pass, before bar-pass".
     message.clear();
     StrAppend(&message, prefix, ", before ", pass->name());
-    DumpModule(*module, message);
+    DumpModuleGraph(*module, message);
 
     TF_ASSIGN_OR_RETURN(bool changed_this_pass, pass->Run(module));
     TF_RETURN_IF_ERROR(
         run_invariant_checkers(StrCat("after running pass: ", pass->name())));
+    if (!xla_dump_per_pass_hlo_proto_to.empty()) {
+      DumpModuleProto(*module, xla_dump_per_pass_hlo_proto_to,
+                      name().ToString(), pass->name().ToString());
+    }
 
     changed |= changed_this_pass;
     prefix.clear();
     StrAppend(&prefix, name(), ": after ", pass->name());
   }
-  DumpModule(*module, prefix + ", pipeline end");
+  DumpModuleGraph(*module, prefix + ", pipeline end");
   return changed;
 }
 
diff --git a/tensorflow/compiler/xla/service/hlo_profile_printer.cc b/tensorflow/compiler/xla/service/hlo_profile_printer.cc
index e944ad15139af0d2f98e8e68d3d48303f47ecf1c..dcc22793015147aaf3229875078b2989e4ef7559 100644
--- a/tensorflow/compiler/xla/service/hlo_profile_printer.cc
+++ b/tensorflow/compiler/xla/service/hlo_profile_printer.cc
@@ -18,20 +18,20 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/human_readable_profile_builder.h"
 
 namespace xla {
-string HloProfilePrinter::ToString(const int64* counters,
-                                   double clock_rate_ghz) const {
+string PrintHloProfile(const HloProfilePrinterData& hlo_profile_printer_data,
+                       const int64* counters, double clock_rate_ghz) {
+  using HloComputationInfo = HloProfilePrinterData::HloComputationInfo;
+  using HloInstructionInfo = HloProfilePrinterData::HloInstructionInfo;
+
   string result;
 
-  for (int computation_idx = 0; computation_idx < computation_infos_size_;
-       computation_idx++) {
-    const HloComputationInfo& computation = computation_infos_[computation_idx];
-    const HloInstructionInfo* instructions_begin = computation.instructions;
-    const HloInstructionInfo* instructions_end =
-        computation.instructions + computation.instructions_size;
+  for (const HloComputationInfo& computation_info :
+       hlo_profile_printer_data.computation_infos()) {
+    const auto& instruction_infos = computation_info.instruction_infos();
     bool any_instruction_profiled =
-        std::any_of(instructions_begin, instructions_end,
+        std::any_of(instruction_infos.begin(), instruction_infos.end(),
                     [&](const HloInstructionInfo& instruction_info) {
-                      return counters[instruction_info.profile_index] != 0;
+                      return counters[instruction_info.profile_index()] != 0;
                     });
 
     if (!any_instruction_profiled) {
@@ -41,16 +41,19 @@ string HloProfilePrinter::ToString(const int64* counters,
     // Once we start using this in AOT for real, we will probably need a more
     // minimal version of HumanReadableProfileBuilder.
     HumanReadableProfileBuilder builder(
-        computation.name, counters[computation.profile_index], clock_rate_ghz);
+        computation_info.name(), counters[computation_info.profile_index()],
+        clock_rate_ghz);
 
-    for (const auto* instruction = instructions_begin;
-         instruction != instructions_end; instruction++) {
+    for (const auto& instruction_info : instruction_infos) {
       builder.AddOp(
-          /*op_name=*/instruction->long_name,
-          /*short_name=*/instruction->short_name, instruction->category,
-          counters[instruction->profile_index], instruction->flop_count,
-          instruction->transcendental_count, instruction->bytes_accessed,
-          instruction->optimal_seconds);
+          /*op_name=*/instruction_info.long_name(),
+          /*short_name=*/instruction_info.short_name(),
+          instruction_info.category(),
+          counters[instruction_info.profile_index()],
+          instruction_info.flop_count(),
+          instruction_info.transcendental_count(),
+          instruction_info.bytes_accessed(),
+          instruction_info.optimal_seconds());
     }
 
     result += builder.ToString();
@@ -58,10 +61,4 @@ string HloProfilePrinter::ToString(const int64* counters,
 
   return result;
 }
-
-HloProfilePrinter::~HloProfilePrinter() {
-  if (deleter_) {
-    deleter_(computation_infos_, computation_infos_size_);
-  }
-}
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_profile_printer.h b/tensorflow/compiler/xla/service/hlo_profile_printer.h
index 35152e744dbdae3c6701d38f99dc6c2575737b95..b72325c7554acad258c2da55a18e5e18ec1b06a6 100644
--- a/tensorflow/compiler/xla/service/hlo_profile_printer.h
+++ b/tensorflow/compiler/xla/service/hlo_profile_printer.h
@@ -20,84 +20,13 @@ limitations under the License.
 #include <string>
 #include <vector>
 
+#include "tensorflow/compiler/xla/service/hlo_profile_printer_data.pb.h"
 #include "tensorflow/compiler/xla/types.h"
 
 namespace xla {
-// Instances of this class can pretty-print profile counters gathered from
-// running an XLA computation without having access to the backing module.
-class HloProfilePrinter {
- public:
-  // Holds meta information about an HloInstruction.
-  //
-  // The pointer-typed fields can be owning or non-owning -- this decision is
-  // manifested as the deleter_ function in the containing HloProfilePrinter.
-  struct HloInstructionInfo {
-    // Textual information for pretty printing.
-    const char* long_name;
-    const char* short_name;
-    const char* category;
-
-    // Metrics computed by HloCostAnalysis.
-    float flop_count;
-    float transcendental_count;
-    float bytes_accessed;
-    float optimal_seconds;
-
-    // The index into the profile counters array for the HloInstruction
-    // corresponding to this HloInstructionInfo.
-    int64 profile_index;
-  };
-
-  // Holds meta information about an HloComputation.
-  //
-  // The pointer-typed fields can be owning or non-owning -- this decision is
-  // manifested as the deleter_ function in the containing HloProfilePrinter.
-  struct HloComputationInfo {
-    const char* name;
-
-    // The index into the profile counters array for the HloInstruction
-    // corresponding to this HloComputationInfo.
-    int64 profile_index;
-
-    HloInstructionInfo* instructions;
-    int64 instructions_size;
-  };
-
-  HloProfilePrinter(
-      HloComputationInfo* computation_infos, int64 computation_infos_size,
-      int64 profile_counters_size,
-      std::function<void(HloComputationInfo*, int64)> deleter = nullptr)
-      : computation_infos_(computation_infos),
-        computation_infos_size_(computation_infos_size),
-        profile_counters_size_(profile_counters_size),
-        deleter_(std::move(deleter)) {}
-
-  HloProfilePrinter(HloProfilePrinter&& other) {
-    std::swap(other.computation_infos_, computation_infos_);
-    std::swap(other.computation_infos_size_, computation_infos_size_);
-    std::swap(other.deleter_, deleter_);
-  }
-
-  HloProfilePrinter(const HloProfilePrinter&) = delete;
-  HloProfilePrinter& operator=(const HloProfilePrinter&) = delete;
-
-  // Converts the profile counter sequence `counters` to a human readable string
-  // representation.
-  string ToString(const int64* counters, double clock_rate_ghz) const;
-
-  // Returns the size of the profile buffer expected by this printer.
-  int64 profile_counters_size() const { return profile_counters_size_; }
-
-  ~HloProfilePrinter();
-
- private:
-  // The `computation_infos_` field can be owning or non-owning -- this decision
-  // is manifested as the deleter_ function.
-  HloComputationInfo* computation_infos_ = nullptr;
-  int64 computation_infos_size_ = 0;
-  int64 profile_counters_size_ = 0;
-  std::function<void(HloComputationInfo*, int64)> deleter_;
-};
+// Pretty-print an array of profile counters using hlo_profile_printer_data.
+string PrintHloProfile(const HloProfilePrinterData& hlo_profile_printer_data,
+                       const int64* counters, double clock_rate_ghz);
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_PROFILE_PRINTER_H_
diff --git a/tensorflow/compiler/xla/service/hlo_profile_printer_data.proto b/tensorflow/compiler/xla/service/hlo_profile_printer_data.proto
new file mode 100644
index 0000000000000000000000000000000000000000..9f22b733fe1d676b177039a9d7a3064b8638d7bc
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_profile_printer_data.proto
@@ -0,0 +1,60 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+syntax = "proto3";
+
+package xla;
+
+option cc_enable_arenas = true;
+
+// Describes how to pretty-print a profile counter array gathered for a specific
+// HloModule.
+message HloProfilePrinterData {
+  // Pretty-printer information about an HloInstruction.
+  message HloInstructionInfo {
+    string long_name = 1;
+    string short_name = 2;
+    string category = 3;
+
+    // Metrics computed by HloCostAnalysis.
+    float flop_count = 4;
+    float transcendental_count = 5;
+    float bytes_accessed = 6;
+    float optimal_seconds = 7;
+
+    // The index into the profile counters array for the HloInstruction
+    // corresponding to this HloInstructionInfo.
+    int64 profile_index = 8;
+  }
+
+  // Pretty-printer information about an HloComputation.
+  message HloComputationInfo {
+    string name = 1;
+
+    // The index into the profile counters array for the HloComputation
+    // corresponding to this HloComputationInfo.
+    int64 profile_index = 2;
+
+    // HloInstructionInfos for every HloInstruction in the HloComputation for
+    // corresponding to this HloComputattionInfo.
+    repeated HloInstructionInfo instruction_infos = 3;
+  }
+
+  // HloComputationInfos for every HloComputation in the HloModule.
+  repeated HloComputationInfo computation_infos = 1;
+
+  // The size of the profile counters array we will pretty-print.
+  int64 profile_counters_size = 2;
+}
diff --git a/tensorflow/compiler/xla/service/hlo_runner.cc b/tensorflow/compiler/xla/service/hlo_runner.cc
index 204a8bf748685af71ac82be0d102cf7f76c7b38f..41b079eb799d06321a31f7d7ae0630dc8d58c46b 100644
--- a/tensorflow/compiler/xla/service/hlo_runner.cc
+++ b/tensorflow/compiler/xla/service/hlo_runner.cc
@@ -47,22 +47,11 @@ HloRunner::CreateModuleFromString(const tensorflow::StringPiece hlo_string,
   return tools::Parse(hlo_string, config);
 }
 
-/*static*/ StatusOr<std::unique_ptr<HloModule>>
-HloRunner::ReadModuleFromHloProtoFile(const std::string& filename,
-                                      const DebugOptions& debug_options) {
-  HloProto proto;
-
-  const Status s =
-      tensorflow::ReadBinaryProto(tensorflow::Env::Default(), filename, &proto);
-
-  if (!s.ok()) {
-    const Status s2 =
-        tensorflow::ReadTextProto(tensorflow::Env::Default(), filename, &proto);
-    if (!s2.ok()) {
-      return Status(s2.code(), s.error_message() + "\n" + s2.error_message());
-    }
-  }
+namespace {
 
+// Creates an HloModule from the given proto.
+StatusOr<std::unique_ptr<HloModule>> HloProtoToModule(
+    const HloProto& proto, const DebugOptions& debug_options) {
   TF_ASSIGN_OR_RETURN(
       HloModuleConfig config,
       HloModule::CreateModuleConfigFromProto(proto.hlo_module()));
@@ -72,9 +61,29 @@ HloRunner::ReadModuleFromHloProtoFile(const std::string& filename,
   return std::move(module);
 }
 
+}  // namespace
+
 /*static*/ StatusOr<std::unique_ptr<HloModule>>
-HloRunner::ReadModuleFromHloTextDumpFile(const std::string& filename,
+HloRunner::ReadModuleFromBinaryProtoFile(const std::string& filename,
                                          const DebugOptions& debug_options) {
+  HloProto proto;
+  TF_RETURN_IF_ERROR(tensorflow::ReadBinaryProto(tensorflow::Env::Default(),
+                                                 filename, &proto));
+  return HloProtoToModule(proto, debug_options);
+}
+
+/*static*/ StatusOr<std::unique_ptr<HloModule>>
+HloRunner::ReadModuleFromTextProtoFile(const std::string& filename,
+                                       const DebugOptions& debug_options) {
+  HloProto proto;
+  TF_RETURN_IF_ERROR(
+      tensorflow::ReadTextProto(tensorflow::Env::Default(), filename, &proto));
+  return HloProtoToModule(proto, debug_options);
+}
+
+/*static*/ StatusOr<std::unique_ptr<HloModule>>
+HloRunner::ReadModuleFromHloTextFile(const std::string& filename,
+                                     const DebugOptions& debug_options) {
   string hlo_string;
   TF_RETURN_IF_ERROR(tensorflow::ReadFileToString(tensorflow::Env::Default(),
                                                   filename, &hlo_string));
@@ -83,19 +92,6 @@ HloRunner::ReadModuleFromHloTextDumpFile(const std::string& filename,
   return tools::Parse(hlo_string, config);
 }
 
-/*static*/ StatusOr<std::unique_ptr<HloModule>> HloRunner::ReadModule(
-    const std::string& filename, const DebugOptions& debug_options) {
-  auto module = HloRunner::ReadModuleFromHloProtoFile(filename, debug_options);
-  if (module.ok()) {
-    return module;
-  }
-  const std::string e = module.status().error_message();
-  module = HloRunner::ReadModuleFromHloTextDumpFile(filename, debug_options);
-  return module.ok() ? std::move(module)
-                     : Status(module.status().code(),
-                              e + "\n" + module.status().error_message());
-}
-
 // Define this in .cc file to avoid having to include eigen or forward declare
 // these types in the header.
 struct HloRunner::EigenThreadPoolWrapper {
@@ -121,12 +117,14 @@ StatusOr<std::unique_ptr<Literal>> HloRunner::ExecuteInternal(
   if (run_hlo_passes) {
     TF_ASSIGN_OR_RETURN(
         module, backend().compiler()->RunHloPasses(
-                    std::move(module), backend().default_stream_executor()));
+                    std::move(module), backend().default_stream_executor(),
+                    /*device_allocator=*/nullptr));
   }
   TF_ASSIGN_OR_RETURN(
       std::unique_ptr<Executable> executable,
       backend().compiler()->RunBackend(std::move(module),
-                                       backend().default_stream_executor()));
+                                       backend().default_stream_executor(),
+                                       /*device_allocator=*/nullptr));
 
   se::Stream stream(backend().default_stream_executor());
   stream.Init();
diff --git a/tensorflow/compiler/xla/service/hlo_runner.h b/tensorflow/compiler/xla/service/hlo_runner.h
index d4b221fb52dff64dda264a931df6fd19b86e5260..cbaebc68bee708090b8ccb2eae19b556c4d6d453 100644
--- a/tensorflow/compiler/xla/service/hlo_runner.h
+++ b/tensorflow/compiler/xla/service/hlo_runner.h
@@ -52,21 +52,15 @@ class HloRunner {
       const DebugOptions& debug_options);
 
   // Reads the proto file in xla.HloProto format, creates and returns the
-  // HloModule. Will try to parse the filename as binary proto, then try as
-  // text proto if that fails.
-  static StatusOr<std::unique_ptr<HloModule>> ReadModuleFromHloProtoFile(
+  // HloModule.
+  static StatusOr<std::unique_ptr<HloModule>> ReadModuleFromBinaryProtoFile(
+      const std::string& filename, const DebugOptions& debug_options);
+  static StatusOr<std::unique_ptr<HloModule>> ReadModuleFromTextProtoFile(
       const std::string& filename, const DebugOptions& debug_options);
 
   // Reads the hlo text dump file in HloModule::ToString format, creates and
   // returns the HloModule.
-  static StatusOr<std::unique_ptr<HloModule>> ReadModuleFromHloTextDumpFile(
-      const std::string& filename, const DebugOptions& debug_options);
-
-  // Tries to parse the filename specified first as binary proto format, then
-  // as a textual proto format, then textual IR, then gives up if both fail.
-  // ReadModuleFromHloProtoFile or ReadModuleFromHloTextDumpFile should be used
-  // explicitly when you know the format, this if you don't.
-  static StatusOr<std::unique_ptr<HloModule>> ReadModule(
+  static StatusOr<std::unique_ptr<HloModule>> ReadModuleFromHloTextFile(
       const std::string& filename, const DebugOptions& debug_options);
 
   // Executes the given module with given literals as input and returns the
diff --git a/tensorflow/compiler/xla/service/hlo_verifier.cc b/tensorflow/compiler/xla/service/hlo_verifier.cc
index 6e46f945e0a2d776ab557c10fedf9b5eb393f3c2..04d4656546684063d3d6532e443ad7995c6ea8db 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier.cc
+++ b/tensorflow/compiler/xla/service/hlo_verifier.cc
@@ -687,7 +687,8 @@ StatusOr<bool> HloVerifier::Run(HloModule* module) {
       instructions[instruction->name()] = instruction;
     }
 
-    TF_RETURN_IF_ERROR(computation->Accept(shape_verifier_.get()));
+    std::unique_ptr<ShapeVerifier> shape_verifier = shape_verifier_factory_();
+    TF_RETURN_IF_ERROR(computation->Accept(shape_verifier.get()));
   }
 
   return false;
diff --git a/tensorflow/compiler/xla/service/hlo_verifier.h b/tensorflow/compiler/xla/service/hlo_verifier.h
index 5a1d864e03d436bb29f7c98b9a373a19abc28a7e..26d53dec1e52f0bf19d6a8af998c56db8a850518 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier.h
+++ b/tensorflow/compiler/xla/service/hlo_verifier.h
@@ -106,10 +106,14 @@ class ShapeVerifier : public DfsHloVisitor {
 class HloVerifier : public HloPassInterface {
  public:
   // Uses standard shape inference.
-  explicit HloVerifier() : shape_verifier_(MakeUnique<ShapeVerifier>()) {}
+  explicit HloVerifier()
+      : shape_verifier_factory_([] { return MakeUnique<ShapeVerifier>(); }) {}
+
   // Uses custom shape verification.
-  explicit HloVerifier(std::unique_ptr<ShapeVerifier> shape_verifier)
-      : shape_verifier_(std::move(shape_verifier)) {}
+  explicit HloVerifier(
+      std::function<std::unique_ptr<ShapeVerifier>()> shape_verifier_factory)
+      : shape_verifier_factory_(std::move(shape_verifier_factory)) {}
+
   ~HloVerifier() override = default;
   tensorflow::StringPiece name() const override { return "verifier"; }
 
@@ -121,8 +125,11 @@ class HloVerifier : public HloPassInterface {
   // CHECKs various invariants of a fusion instruction.
   Status CheckFusionInstruction(HloInstruction* fusion) const;
 
-  // Verifies shapes match inferred expectations.
-  std::unique_ptr<ShapeVerifier> shape_verifier_;
+  // Creates a ShapeVerifier that checks that shapes match inferred
+  // expectations.  This is a factory function because ShapeVerifier,  Note that
+  // ShapeVerifier, being a DfsHloVisitor, is stateful.  We want a clean object
+  // for each run of the verifier.
+  std::function<std::unique_ptr<ShapeVerifier>()> shape_verifier_factory_;
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_verifier_test.cc b/tensorflow/compiler/xla/service/hlo_verifier_test.cc
index 2a3b55decc5289e7e576d3c5897b333c0b1bc922..c92db0be14dceb32ea86521dcc99b8f63738e4a5 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_verifier_test.cc
@@ -97,5 +97,31 @@ TEST_F(HloVerifierTest, DifferentOperandParents) {
               HasSubstr("is in a different computation"));
 }
 
+TEST_F(HloVerifierTest, ResetsShapeVerifierState) {
+  HloComputation::Builder builder(TestName());
+  Shape s1 = ShapeUtil::MakeShape(F32, {1});
+  Shape s2 = ShapeUtil::MakeShape(F32, {2});
+
+  HloInstruction* param =
+      builder.AddInstruction(HloInstruction::CreateParameter(0, s1, "param"));
+
+  // Create an add instruction with the incorrect shape.
+  HloInstruction* add = builder.AddInstruction(
+      HloInstruction::CreateBinary(s2, HloOpcode::kAdd, param, param));
+
+  // In order to trigger the bug we're checking for, the instruction with the
+  // bad shape can't be the root of the computation.
+  builder.AddInstruction(
+      HloInstruction::CreateBinary(s2, HloOpcode::kMultiply, add, add));
+
+  auto module = CreateNewModule();
+  module->AddEntryComputation(builder.Build());
+
+  // Run the verifier twice.  It should fail both times, because it shouldn't
+  // carry state in its DFS visitor between runs.
+  EXPECT_FALSE(verifier().Run(module.get()).status().ok());
+  EXPECT_FALSE(verifier().Run(module.get()).status().ok());
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/implicit_broadcast_remover.cc b/tensorflow/compiler/xla/service/implicit_broadcast_remover.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ada21345014dac70d61129aaf7bbc7466a7db914
--- /dev/null
+++ b/tensorflow/compiler/xla/service/implicit_broadcast_remover.cc
@@ -0,0 +1,124 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/implicit_broadcast_remover.h"
+
+#include <algorithm>
+#include <memory>
+#include <numeric>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_dce.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace xla {
+
+namespace {
+
+// Visitor for removing implicit broadcasts.
+class ImplicitBroadcastVisitor : public DfsHloVisitorWithDefault {
+ public:
+  Status DefaultAction(HloInstruction* hlo_instruction) override {
+    return Status::OK();
+  }
+
+  Status HandleElementwiseBinary(HloInstruction* hlo) override {
+    return ReplaceImplicitBroadcastOperands(hlo);
+  }
+
+  Status HandleClamp(HloInstruction* hlo) override {
+    // Clamp is the only element-wise ternary operation.
+    return ReplaceImplicitBroadcastOperands(hlo);
+  }
+
+  // Returns whether any modification has been made to any visited instruction.
+  bool changed() const { return changed_; }
+
+ private:
+  // Iterates through the operands of 'hlo' and replace any operands which are
+  // implicitly broadcast with the equivalent sequence of broadcast and reshape
+  // instructions. An operand is considered to be implicitly broadcast if the
+  // operand shape does have the same dimensions as the shape of 'hlo'.
+  Status ReplaceImplicitBroadcastOperands(HloInstruction* hlo) {
+    auto fadd = [hlo](std::unique_ptr<HloInstruction> x) {
+      return hlo->parent()->AddInstruction(std::move(x));
+    };
+    std::vector<HloInstruction*> operands;
+    bool operands_changed = false;
+    for (int i = 0; i < hlo->operand_count(); ++i) {
+      HloInstruction* operand = hlo->mutable_operand(i);
+      if (!ShapeUtil::SameDimensions(hlo->shape(), operand->shape())) {
+        HloInstruction* new_operand = hlo->parent()->AddInstruction(
+            HloInstruction::CreateBroadcastSequence(hlo->shape(), operand,
+                                                    fadd));
+        operands.push_back(new_operand);
+        operands_changed = true;
+      } else {
+        operands.push_back(operand);
+      }
+    }
+    if (operands_changed) {
+      // Create a new HLO instruction because the HloInstruction::Replace*
+      // methods check that the shape does not change with the replacement.
+      HloInstruction* new_hlo = hlo->parent()->AddInstruction(
+          hlo->CloneWithNewOperands(hlo->shape(), operands));
+      TF_RETURN_IF_ERROR(hlo->ReplaceAllUsesWith(new_hlo));
+      changed_ = true;
+    }
+    return Status::OK();
+  }
+
+  bool changed_ = false;
+};
+
+}  // namespace
+
+StatusOr<bool> ImplicitBroadcastRemover::Run(HloModule* module) {
+  VLOG(1) << "Removing implicit broadcast from module " << module->name();
+  XLA_VLOG_LINES(2,
+                 "Before removing implicit broadcasts:\n" + module->ToString());
+
+  ImplicitBroadcastVisitor visitor;
+  for (HloComputation* computation : module->computations()) {
+    TF_RETURN_IF_ERROR(computation->Accept(&visitor));
+  }
+
+  if (visitor.changed()) {
+    // HLO instructions with implicitly broadcast operands are cloned and left
+    // for dead. Remove them.
+    HloDCE dce;
+    TF_RETURN_IF_ERROR(dce.Run(module).status());
+  }
+
+  XLA_VLOG_LINES(2,
+                 "After removing implicit broadcasts:\n" + module->ToString());
+
+  return visitor.changed();
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/implicit_broadcast_remover.h b/tensorflow/compiler/xla/service/implicit_broadcast_remover.h
new file mode 100644
index 0000000000000000000000000000000000000000..aa325dc8a353c5bfbfded0c2774c66bfcc71c9cb
--- /dev/null
+++ b/tensorflow/compiler/xla/service/implicit_broadcast_remover.h
@@ -0,0 +1,42 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_IMPLICIT_BROADCAST_REMOVER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_IMPLICIT_BROADCAST_REMOVER_H_
+
+#include <utility>
+
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+
+namespace xla {
+
+// Pass which replaces all implicit broadcasts with their equivalent sequence of
+// explicit broadcast and reshape instructions.
+class ImplicitBroadcastRemover : public HloPassInterface {
+ public:
+  ImplicitBroadcastRemover() {}
+  ~ImplicitBroadcastRemover() override {}
+
+  tensorflow::StringPiece name() const override {
+    return "implicit-broadcast-remover";
+  }
+
+  StatusOr<bool> Run(HloModule* module) override;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_IMPLICIT_BROADCAST_REMOVER_H_
diff --git a/tensorflow/compiler/xla/service/implicit_broadcast_remover_test.cc b/tensorflow/compiler/xla/service/implicit_broadcast_remover_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8c7b38dd1bf73e0be7b669d7215812aaef1cee17
--- /dev/null
+++ b/tensorflow/compiler/xla/service/implicit_broadcast_remover_test.cc
@@ -0,0 +1,176 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/implicit_broadcast_remover.h"
+
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/hlo_matchers.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+
+namespace op = xla::testing::opcode_matchers;
+
+namespace xla {
+namespace {
+
+class ImplicitBroadcastRemoverTest : public HloVerifiedTestBase {
+ protected:
+  ImplicitBroadcastRemover remover_;
+};
+
+TEST_F(ImplicitBroadcastRemoverTest, NoImplicitBroadcast) {
+  auto builder = HloComputation::Builder(TestName());
+
+  const Shape shape = ShapeUtil::MakeShape(F32, {2, 4});
+  auto param0 =
+      builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "p0"));
+  auto param1 =
+      builder.AddInstruction(HloInstruction::CreateParameter(1, shape, "p1"));
+  builder.AddInstruction(
+      HloInstruction::CreateBinary(shape, HloOpcode::kAdd, param0, param1));
+
+  HloComputation* computation = module().AddEntryComputation(builder.Build());
+
+  EXPECT_FALSE(remover_.Run(&module()).ValueOrDie());
+
+  EXPECT_THAT(computation->root_instruction(),
+              op::Add(op::Parameter(), op::Parameter()));
+}
+
+TEST_F(ImplicitBroadcastRemoverTest, ScalarBroadcast) {
+  auto builder = HloComputation::Builder(TestName());
+
+  const Shape shape = ShapeUtil::MakeShape(F32, {2, 4});
+  auto param0 = builder.AddInstruction(HloInstruction::CreateParameter(
+      0, ShapeUtil::MakeShape(F32, {}), "scalar_param"));
+  auto param1 =
+      builder.AddInstruction(HloInstruction::CreateParameter(1, shape, "p1"));
+  builder.AddInstruction(
+      HloInstruction::CreateBinary(shape, HloOpcode::kPower, param0, param1));
+
+  HloComputation* computation = module().AddEntryComputation(builder.Build());
+  HloInstruction* root = computation->root_instruction();
+
+  EXPECT_FALSE(ShapeUtil::Compatible(root->shape(), root->operand(0)->shape()));
+  EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), root->operand(1)->shape()));
+
+  EXPECT_TRUE(remover_.Run(&module()).ValueOrDie());
+  root = computation->root_instruction();
+
+  EXPECT_THAT(root, op::Power(op::Broadcast(op::Parameter()), op::Parameter()));
+
+  EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), root->operand(0)->shape()));
+  EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), root->operand(1)->shape()));
+}
+
+TEST_F(ImplicitBroadcastRemoverTest, DegenerateDimensionBroadcast) {
+  auto builder = HloComputation::Builder(TestName());
+
+  const Shape shape = ShapeUtil::MakeShape(F32, {2, 4, 6});
+  auto param0 =
+      builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "p0"));
+  auto param1 = builder.AddInstruction(HloInstruction::CreateParameter(
+      1, ShapeUtil::MakeShape(F32, {1, 4, 1}), "p1"));
+  builder.AddInstruction(HloInstruction::CreateBinary(
+      shape, HloOpcode::kSubtract, param0, param1));
+
+  HloComputation* computation = module().AddEntryComputation(builder.Build());
+
+  EXPECT_TRUE(remover_.Run(&module()).ValueOrDie());
+
+  HloInstruction* root = computation->root_instruction();
+  EXPECT_THAT(root, op::Subtract(op::Parameter(),
+                                 op::Broadcast(op::Reshape(op::Parameter()))));
+  EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), root->operand(0)->shape()));
+  EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), root->operand(1)->shape()));
+}
+
+TEST_F(ImplicitBroadcastRemoverTest, ScalarBroadcastToDegenerateDimensions) {
+  auto builder = HloComputation::Builder(TestName());
+
+  const Shape shape = ShapeUtil::MakeShape(F32, {1, 4, 1});
+  auto param0 = builder.AddInstruction(HloInstruction::CreateParameter(
+      0, ShapeUtil::MakeShape(F32, {}), "scalar_param"));
+  auto param1 =
+      builder.AddInstruction(HloInstruction::CreateParameter(1, shape, "p1"));
+  builder.AddInstruction(HloInstruction::CreateBinary(
+      shape, HloOpcode::kSubtract, param0, param1));
+
+  HloComputation* computation = module().AddEntryComputation(builder.Build());
+
+  EXPECT_TRUE(remover_.Run(&module()).ValueOrDie());
+
+  HloInstruction* root = computation->root_instruction();
+  EXPECT_THAT(root,
+              op::Subtract(op::Broadcast(op::Parameter()), op::Parameter()));
+  EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), root->operand(0)->shape()));
+  EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), root->operand(1)->shape()));
+}
+
+TEST_F(ImplicitBroadcastRemoverTest, TernaryDegenerateDimensionBroadcast) {
+  auto builder = HloComputation::Builder(TestName());
+
+  const Shape shape = ShapeUtil::MakeShape(F32, {2, 4, 6, 8});
+  auto param0 = builder.AddInstruction(HloInstruction::CreateParameter(
+      0, ShapeUtil::MakeShape(F32, {1, 4, 1, 8}), "p0"));
+  auto param1 = builder.AddInstruction(HloInstruction::CreateParameter(
+      1, ShapeUtil::MakeShape(F32, {1, 1, 6, 8}), "p1"));
+  auto param2 = builder.AddInstruction(HloInstruction::CreateParameter(
+      2, ShapeUtil::MakeShape(F32, {2, 1, 6, 8}), "p2"));
+  builder.AddInstruction(HloInstruction::CreateTernary(shape, HloOpcode::kClamp,
+                                                       param0, param1, param2));
+
+  HloComputation* computation = module().AddEntryComputation(builder.Build());
+
+  EXPECT_TRUE(remover_.Run(&module()).ValueOrDie());
+
+  HloInstruction* root = computation->root_instruction();
+  EXPECT_THAT(root, op::Clamp(op::Broadcast(op::Reshape(op::Parameter())),
+                              op::Broadcast(op::Reshape(op::Parameter())),
+                              op::Broadcast(op::Reshape(op::Parameter()))));
+  EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), root->operand(0)->shape()));
+  EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), root->operand(1)->shape()));
+  EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), root->operand(2)->shape()));
+}
+
+TEST_F(ImplicitBroadcastRemoverTest,
+       TernaryScalarAndDegenerateDimensionBroadcast) {
+  auto builder = HloComputation::Builder(TestName());
+
+  const Shape shape = ShapeUtil::MakeShape(F32, {2, 4, 6});
+  auto param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, ShapeUtil::MakeShape(F32, {}), "p0"));
+  auto param1 = builder.AddInstruction(HloInstruction::CreateParameter(
+      1, ShapeUtil::MakeShape(F32, {1, 4, 6}), "p1"));
+  auto param2 =
+      builder.AddInstruction(HloInstruction::CreateParameter(2, shape, "p2"));
+  builder.AddInstruction(HloInstruction::CreateTernary(shape, HloOpcode::kClamp,
+                                                       param0, param1, param2));
+
+  HloComputation* computation = module().AddEntryComputation(builder.Build());
+
+  EXPECT_TRUE(remover_.Run(&module()).ValueOrDie());
+
+  HloInstruction* root = computation->root_instruction();
+  EXPECT_THAT(root, op::Clamp(op::Broadcast(op::Parameter()),
+                              op::Broadcast(op::Reshape(op::Parameter())),
+                              op::Parameter()));
+  EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), root->operand(0)->shape()));
+  EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), root->operand(1)->shape()));
+  EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), root->operand(2)->shape()));
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/interpreter/compiler.cc b/tensorflow/compiler/xla/service/interpreter/compiler.cc
index dc63a2224d659fa427d4d1a30c5dc0f94d643b36..c83880e03090bc316b029ae9c902e27f3d8d826e 100644
--- a/tensorflow/compiler/xla/service/interpreter/compiler.cc
+++ b/tensorflow/compiler/xla/service/interpreter/compiler.cc
@@ -70,15 +70,16 @@ Status InterpreterCompiler::RunHloOptimization(HloModule* hlo_module) {
 }
 
 StatusOr<std::unique_ptr<HloModule>> InterpreterCompiler::RunHloPasses(
-    std::unique_ptr<HloModule> hlo_module,
-    se::StreamExecutor* /*stream_exec*/) {
+    std::unique_ptr<HloModule> hlo_module, se::StreamExecutor* /*stream_exec*/,
+    DeviceMemoryAllocator* /*device_allocator*/) {
   VLOG(1) << "Run hlo passes on graph " << hlo_module->name();
   TF_RETURN_IF_ERROR(RunHloOptimization(hlo_module.get()));
   return std::move(hlo_module);
 }
 
 StatusOr<std::unique_ptr<Executable>> InterpreterCompiler::RunBackend(
-    std::unique_ptr<HloModule> hlo_module, se::StreamExecutor* stream_exec) {
+    std::unique_ptr<HloModule> hlo_module, se::StreamExecutor* stream_exec,
+    DeviceMemoryAllocator* /*device_allocator*/) {
   TF_RET_CHECK(stream_exec != nullptr);
 
   VLOG(1) << "Run backend " << hlo_module->name();
@@ -96,7 +97,8 @@ StatusOr<std::unique_ptr<Executable>> InterpreterCompiler::RunBackend(
 
 StatusOr<std::vector<std::unique_ptr<Executable>>> InterpreterCompiler::Compile(
     std::vector<std::unique_ptr<HloModule>> /*hlo_modules*/,
-    std::vector<std::vector<se::StreamExecutor*>> /*stream_execs*/) {
+    std::vector<std::vector<se::StreamExecutor*>> /*stream_execs*/,
+    DeviceMemoryAllocator* /*device_allocator*/) {
   return tensorflow::errors::Unimplemented(
       "Compilation of multiple HLO modules is not supported on Interpreter.");
 }
diff --git a/tensorflow/compiler/xla/service/interpreter/compiler.h b/tensorflow/compiler/xla/service/interpreter/compiler.h
index 278cf5184227ae25518b1d46c0e16e4cce7bd1a8..c8660c04d86a82e7dfcfd1658310c2a0e4fa0083 100644
--- a/tensorflow/compiler/xla/service/interpreter/compiler.h
+++ b/tensorflow/compiler/xla/service/interpreter/compiler.h
@@ -45,16 +45,19 @@ class InterpreterCompiler : public Compiler {
 
   StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
       std::unique_ptr<HloModule> hlo_module,
-      perftools::gputools::StreamExecutor* stream_exec) override;
+      perftools::gputools::StreamExecutor* stream_exec,
+      DeviceMemoryAllocator* device_allocator) override;
 
   StatusOr<std::unique_ptr<Executable>> RunBackend(
       std::unique_ptr<HloModule> hlo_module,
-      perftools::gputools::StreamExecutor* stream_exec) override;
+      perftools::gputools::StreamExecutor* stream_exec,
+      DeviceMemoryAllocator* device_allocator) override;
 
   StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
       std::vector<std::unique_ptr<HloModule>> hlo_modules,
       std::vector<std::vector<perftools::gputools::StreamExecutor*>>
-          stream_exec) override;
+          stream_exec,
+      DeviceMemoryAllocator* device_allocator) override;
 
   StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
   CompileAheadOfTime(std::vector<std::unique_ptr<HloModule>> hlo_modules,
diff --git a/tensorflow/compiler/xla/service/layout_assignment.cc b/tensorflow/compiler/xla/service/layout_assignment.cc
index bbea6bee5659c73cc71f45ed5e6bbd51df26c050..5413b95cfb6aad464da27c7b4aeaed5011e16393 100644
--- a/tensorflow/compiler/xla/service/layout_assignment.cc
+++ b/tensorflow/compiler/xla/service/layout_assignment.cc
@@ -1236,7 +1236,8 @@ Status CopyOperandIfLayoutsDiffer(const ShapeLayout& operand_layout,
 // instruction itself.
 Status SetFusionLayouts(HloInstruction* fusion) {
   TF_RET_CHECK(fusion->opcode() == HloOpcode::kFusion);
-  for (auto* fused_instruction : fusion->fused_instructions()) {
+  for (auto* fused_instruction :
+       fusion->fused_instructions_computation()->MakeInstructionPostOrder()) {
     if (fused_instruction->opcode() == HloOpcode::kParameter) {
       const HloInstruction* fusion_operand =
           fusion->operand(fused_instruction->parameter_number());
@@ -1251,11 +1252,22 @@ Status SetFusionLayouts(HloInstruction* fusion) {
           ShapeUtil::Compatible(fusion->shape(), fused_instruction->shape()));
       TF_RETURN_IF_ERROR(LayoutUtil::CopyLayoutBetweenShapes(
           fusion->shape(), fused_instruction->mutable_shape()));
-    } else if (fused_instruction->opcode() != HloOpcode::kConstant &&
-               fused_instruction->opcode() != HloOpcode::kGetTupleElement &&
-               fused_instruction->opcode() != HloOpcode::kInfeed) {
-      // Internal fused instructions with the exception of constants
-      // and infeed need no layout.
+    } else if (fused_instruction->opcode() == HloOpcode::kGetTupleElement) {
+      // A GTE inherits its layout from its operand (which should ultimately be
+      // a parameter).
+      TF_RETURN_IF_ERROR(LayoutUtil::CopyLayoutBetweenShapes(
+          fused_instruction->operand(0)->shape().tuple_shapes(
+              fused_instruction->tuple_index()),
+          fused_instruction->mutable_shape()));
+    } else if (fused_instruction->opcode() == HloOpcode::kConstant) {
+      // Give constants the layout of their literal.
+      TF_RETURN_IF_ERROR(LayoutUtil::CopyLayoutBetweenShapes(
+          fused_instruction->literal().shape(),
+          fused_instruction->mutable_shape()));
+    } else if (fused_instruction->opcode() == HloOpcode::kInfeed) {
+      // Nop; leave the infeed layout alone.
+    } else {
+      // Other instructions don't have layouts inside of fusion nodes.
       LayoutUtil::ClearLayout(fused_instruction->mutable_shape());
     }
   }
@@ -1367,20 +1379,6 @@ Status LayoutAssignment::RunOnComputation(
           << ")";
   VLOG(2) << "  ComputationLayout = " << computation_layout.ToString();
 
-  // Clear existing layouts of the instructions. All layouts must be assigned by
-  // the LayoutAssignment pass, except for Infeed, Outfeed, Parameters and the
-  // computation result. The latter two are specified in computation_layout, so
-  // we only need to keep the existing layouts for Infeed and Outfeed. Clearing
-  // the layouts here avoids hiding potential bugs in the layout assignment pass
-  // that may accidently use the existing layout.
-  for (HloInstruction* instruction : computation->instructions()) {
-    if (instruction->opcode() == HloOpcode::kInfeed ||
-        instruction->opcode() == HloOpcode::kOutfeed) {
-      continue;
-    }
-    LayoutUtil::ClearLayout(instruction->mutable_shape());
-  }
-
   // Construct LayoutConstraints with all layout constraints of the computation.
   LayoutConstraints constraints(points_to_analysis, computation);
 
@@ -1458,6 +1456,18 @@ StatusOr<bool> LayoutAssignment::Run(HloModule* module) {
   // is handled before its caller computation. This ensures that the layout of
   // all callers of a computation will agree.
   for (auto* computation : module->MakeComputationPostOrder()) {
+    // Clear existing layouts of the instructions.  All layouts must be assigned
+    // by the LayoutAssignment pass, except for those on infeeds, parameters,
+    // and the computation result. The latter two are specified in
+    // computation_layout, so we only need to keep the existing layouts for
+    // infeeds.  Clearing the layouts here avoids hiding potential bugs in the
+    // layout assignment pass that may accidently use the existing layout.
+    for (HloInstruction* instruction : computation->instructions()) {
+      if (instruction->opcode() != HloOpcode::kInfeed) {
+        LayoutUtil::ClearLayout(instruction->mutable_shape());
+      }
+    }
+
     if (computation == module->entry_computation()) {
       TF_RETURN_IF_ERROR(RunOnComputation(
           *entry_computation_layout_, *points_to_analysis,
diff --git a/tensorflow/compiler/xla/service/layout_assignment_test.cc b/tensorflow/compiler/xla/service/layout_assignment_test.cc
index d51c0d1dfb727801d6d2a8328eba60838373479f..e269a13459f1146f1d2952870399827d9e705e38 100644
--- a/tensorflow/compiler/xla/service/layout_assignment_test.cc
+++ b/tensorflow/compiler/xla/service/layout_assignment_test.cc
@@ -35,9 +35,11 @@ limitations under the License.
 #include "tensorflow/compiler/xla/test_helpers.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/tests/test_utils.h"
+#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 
 namespace op = xla::testing::opcode_matchers;
@@ -587,5 +589,74 @@ TEST_F(LayoutAssignmentTest, TransposeToBitcastToUser) {
   EXPECT_TRUE(ShapeUtil::TransposeIsBitcast(transpose->operand(0)->shape(),
                                             transpose->shape(), {2, 3, 0, 1}));
 }
+
+// A GTE inside of a fusion node inherits the layout of its operand (which
+// should, if we keep following operands, eventually be a parameter).
+TEST_F(LayoutAssignmentTest, GTEInheritsLayoutFromOperand) {
+  const char* module_str = R"(
+    HloModule test_module
+
+    fused_computation {
+      fparam = (f32[2,2,2], (f32[2,2,2], f32[2,2,2])) parameter(0)
+      gte0 = f32[2,2,2] get-tuple-element(fparam), index=0
+      gte1 = (f32[2,2,2], f32[2,2,2]) get-tuple-element(fparam), index=1
+      gte1a = f32[2,2,2] get-tuple-element(gte1), index=0
+      gte1b = f32[2,2,2] get-tuple-element(gte1), index=1
+      add = f32[2,2,2] add(gte1a, gte1b)
+      ROOT fresult = f32[2,2,2] add(gte0, add)
+    }
+
+    ENTRY entry_computation {
+      param = (f32[2,2,2], (f32[2,2,2], f32[2,2,2])) parameter(0)
+      ROOT fusion =
+        f32[2,2,2] fusion(param), kind=kLoop, calls=fused_computation
+    }
+  )";
+
+  auto module = tools::Parse(module_str).ValueOrDie();
+  ComputationLayout computation_layout(
+      module->entry_computation()->ComputeProgramShape());
+  Shape param_shape = ShapeUtil::MakeTupleShape(
+      {ShapeUtil::MakeShapeWithLayout(F32, {2, 2, 2}, {0, 1, 2}),
+       ShapeUtil::MakeTupleShape({
+           ShapeUtil::MakeShapeWithLayout(F32, {2, 2, 2}, {1, 2, 0}),
+           ShapeUtil::MakeShapeWithLayout(F32, {2, 2, 2}, {2, 0, 1}),
+       })});
+  TF_ASSERT_OK(
+      computation_layout.mutable_parameter_layout(0)->CopyLayoutFromShape(
+          param_shape));
+  computation_layout.mutable_result_layout()->ResetLayout(
+      LayoutUtil::MakeLayout({2, 1, 0}));
+  AssignLayouts(module.get(), &computation_layout);
+
+  HloComputation* fused_computation = *std::find_if(
+      module->computations().begin(), module->computations().end(),
+      [](const HloComputation* c) { return c->name() == "fused_computation"; });
+
+  auto fused_instr = [&](const string& name) {
+    auto it = std::find_if(
+        fused_computation->instructions().begin(),
+        fused_computation->instructions().end(),
+        [&](const HloInstruction* i) { return i->name() == name; });
+    CHECK(it != fused_computation->instructions().end());
+    return *it;
+  };
+
+  EXPECT_THAT(fused_instr("gte0")->shape().layout().minor_to_major(),
+              ElementsAre(0, 1, 2));
+  EXPECT_THAT(
+      fused_instr("gte1")->shape().tuple_shapes(0).layout().minor_to_major(),
+      ElementsAre(1, 2, 0));
+  EXPECT_THAT(
+      fused_instr("gte1")->shape().tuple_shapes(1).layout().minor_to_major(),
+      ElementsAre(2, 0, 1));
+  EXPECT_THAT(fused_instr("gte1a")->shape().layout().minor_to_major(),
+              ElementsAre(1, 2, 0));
+  EXPECT_THAT(fused_instr("gte1b")->shape().layout().minor_to_major(),
+              ElementsAre(2, 0, 1));
+  EXPECT_THAT(fused_instr("fresult")->shape().layout().minor_to_major(),
+              ElementsAre(2, 1, 0));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/llvm_compiler.cc b/tensorflow/compiler/xla/service/llvm_compiler.cc
index 34f3419269abbc73cd0ddb13c723a8da38ab19ff..f98fc0400a7d827a29dcddc5eecf9a4a01e76590 100644
--- a/tensorflow/compiler/xla/service/llvm_compiler.cc
+++ b/tensorflow/compiler/xla/service/llvm_compiler.cc
@@ -18,8 +18,8 @@ limitations under the License.
 namespace xla {
 StatusOr<std::vector<std::unique_ptr<Executable>>> LLVMCompiler::Compile(
     std::vector<std::unique_ptr<HloModule>> modules,
-    std::vector<std::vector<perftools::gputools::StreamExecutor*>>
-        stream_execs) {
+    std::vector<std::vector<perftools::gputools::StreamExecutor*>> stream_execs,
+    DeviceMemoryAllocator* device_allocator) {
   std::vector<std::unique_ptr<Executable>> result;
   for (size_t i = 0; i < modules.size(); i++) {
     if (stream_execs[i].size() != 1) {
@@ -27,10 +27,12 @@ StatusOr<std::vector<std::unique_ptr<Executable>>> LLVMCompiler::Compile(
           "Model partitioning not implemented for the CPU/GPU compilers!");
     }
 
-    TF_ASSIGN_OR_RETURN(
-        modules[i], RunHloPasses(std::move(modules[i]), stream_execs[i][0]));
+    TF_ASSIGN_OR_RETURN(modules[i],
+                        RunHloPasses(std::move(modules[i]), stream_execs[i][0],
+                                     device_allocator));
     TF_ASSIGN_OR_RETURN(std::unique_ptr<Executable> executable,
-                        RunBackend(std::move(modules[i]), stream_execs[i][0]));
+                        RunBackend(std::move(modules[i]), stream_execs[i][0],
+                                   device_allocator));
     result.push_back(std::move(executable));
   }
 
diff --git a/tensorflow/compiler/xla/service/llvm_compiler.h b/tensorflow/compiler/xla/service/llvm_compiler.h
index c5393cef4f961c5d04c32d0d4291732b8ec702f1..d74e81bb7f622ac5e89203a3d02ca5ad839da07e 100644
--- a/tensorflow/compiler/xla/service/llvm_compiler.h
+++ b/tensorflow/compiler/xla/service/llvm_compiler.h
@@ -60,17 +60,20 @@ class LLVMCompiler : public Compiler {
   // Bring in
   //   StatusOr<std::unique_ptr<Executable>> RunBackend(
   //       std::unique_ptr<HloModule> module,
-  //       perftools::gputools::StreamExecutor* stream_exec)
+  //       perftools::gputools::StreamExecutor* stream_exec,
+  //       DeviceMemoryAllocator* device_allocator)
   //   StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
   //       std::unique_ptr<HloModule> module,
-  //       perftools::gputools::StreamExecutor* stream_exec)
+  //       perftools::gputools::StreamExecutor* stream_exec,
+  //       DeviceMemoryAllocator* device_allocator)
   using Compiler::RunBackend;
   using Compiler::RunHloPasses;
 
   StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
       std::vector<std::unique_ptr<HloModule>> modules,
       std::vector<std::vector<perftools::gputools::StreamExecutor*>>
-          stream_execs) override;
+          stream_execs,
+      DeviceMemoryAllocator* device_allocator) override;
 
  protected:
   ModuleHook user_pre_optimization_hook_;
diff --git a/tensorflow/compiler/xla/service/local_service.cc b/tensorflow/compiler/xla/service/local_service.cc
index 2194d24257d0ccd04f3c9625412116eba01acd8c..07f989d4faea199e812e54d2ae74d3ff9e7fa19a 100644
--- a/tensorflow/compiler/xla/service/local_service.cc
+++ b/tensorflow/compiler/xla/service/local_service.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "tensorflow/compiler/xla/client/executable_build_options.h"
 #include "tensorflow/compiler/xla/execution_options_util.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/backend.h"
@@ -71,7 +72,7 @@ LocalService::LocalService(const ServiceOptions& options,
 StatusOr<std::unique_ptr<Executable>> LocalService::CompileExecutable(
     const ComputationHandle& computation,
     const tensorflow::gtl::ArraySlice<const Shape*> argument_layouts,
-    const Shape* result_layout, int device_ordinal) {
+    const ExecutableBuildOptions& build_options) {
   TF_ASSIGN_OR_RETURN(UserComputation * user_computation,
                       computation_tracker_.Resolve(computation));
   VersionedComputationHandle versioned_handle =
@@ -112,14 +113,19 @@ StatusOr<std::unique_ptr<Executable>> LocalService::CompileExecutable(
           ShapeUtil::HumanString(argument_shape).c_str());
     }
   }
-  if (result_layout != nullptr) {
-    TF_RETURN_IF_ERROR(
-        ValidateResultShapeWithLayout(*result_layout, program_shape->result()));
+  if (build_options.result_layout() != nullptr) {
+    TF_RETURN_IF_ERROR(ValidateResultShapeWithLayout(
+        *build_options.result_layout(), program_shape->result()));
   }
 
   ExecutionOptions execution_options = CreateDefaultExecutionOptions();
-  if (result_layout != nullptr) {
-    *execution_options.mutable_shape_with_output_layout() = *result_layout;
+  if (build_options.generate_hlo_graph().has_value()) {
+    execution_options.mutable_debug_options()->set_xla_generate_hlo_graph(
+        build_options.generate_hlo_graph().value());
+  }
+  if (build_options.result_layout() != nullptr) {
+    *execution_options.mutable_shape_with_output_layout() =
+        *build_options.result_layout();
   } else {
     *execution_options.mutable_shape_with_output_layout() =
         program_shape->result();
@@ -128,13 +134,16 @@ StatusOr<std::unique_ptr<Executable>> LocalService::CompileExecutable(
   }
   TF_ASSIGN_OR_RETURN(
       std::unique_ptr<HloModuleConfig> module_config,
-      CreateModuleConfig(*program_shape, argument_layouts, &execution_options));
+      CreateModuleConfig(*program_shape, argument_layouts, &execution_options,
+                         *user_computation));
 
-  TF_ASSIGN_OR_RETURN(se::StreamExecutor * executor,
-                      execute_backend_->stream_executor(device_ordinal));
+  TF_ASSIGN_OR_RETURN(
+      se::StreamExecutor * executor,
+      execute_backend_->stream_executor(build_options.device_ordinal()));
 
   return BuildExecutable(versioned_handle, std::move(module_config),
-                         execute_backend_.get(), executor);
+                         execute_backend_.get(), executor,
+                         build_options.device_allocator());
 }
 
 StatusOr<int> LocalService::ReplicaNumberToDeviceOrdinal(int replica_number) {
diff --git a/tensorflow/compiler/xla/service/local_service.h b/tensorflow/compiler/xla/service/local_service.h
index acbc7268252881958190f416ab936d64430166e1..15e120685e1be9190d49fdaf5ed6706bdf991a6c 100644
--- a/tensorflow/compiler/xla/service/local_service.h
+++ b/tensorflow/compiler/xla/service/local_service.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include <memory>
 
+#include "tensorflow/compiler/xla/client/executable_build_options.h"
 #include "tensorflow/compiler/xla/service/backend.h"
 #include "tensorflow/compiler/xla/service/compiler.h"
 #include "tensorflow/compiler/xla/service/device_memory_allocator.h"
@@ -41,11 +42,13 @@ class LocalService : public Service {
 
   // Builds an Executable with the given argument layouts and options. If
   // result_layout is non-null, then the executable is compiled to produce a
-  // result of the given layout.
+  // result of the given layout.  If device_allocator is non-null, then the
+  // compiler may use it to allocate temp space on the device.  The compiler is
+  // responsible for freeing any memory it allocates this way.
   StatusOr<std::unique_ptr<Executable>> CompileExecutable(
       const ComputationHandle& computation,
       const tensorflow::gtl::ArraySlice<const Shape*> argument_layouts,
-      const Shape* result_layout, int device_ordinal);
+      const ExecutableBuildOptions& options);
 
   // Returns the device ordinal that corresponds to the given replica number.
   //
diff --git a/tensorflow/compiler/xla/service/service.cc b/tensorflow/compiler/xla/service/service.cc
index e230d25f1e34d39f8bb61221311cb1b8f62647d9..a57b7e571788f0d3e3ffda31eb94aa81ee85b574 100644
--- a/tensorflow/compiler/xla/service/service.cc
+++ b/tensorflow/compiler/xla/service/service.cc
@@ -37,6 +37,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_proto_util.h"
 #include "tensorflow/compiler/xla/service/platform_util.h"
 #include "tensorflow/compiler/xla/service/session.pb.h"
+#include "tensorflow/compiler/xla/service/source_map_util.h"
 #include "tensorflow/compiler/xla/service/transfer_manager.h"
 #include "tensorflow/compiler/xla/shape_layout.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -56,6 +57,7 @@ namespace se = ::perftools::gputools;
 
 using ::tensorflow::strings::Printf;
 using ::tensorflow::strings::StrCat;
+using ::xla::source_map_util::InvalidParameterArgument;
 
 namespace xla {
 
@@ -261,7 +263,8 @@ StatusOr<std::vector<const ShapedBuffer*>> Service::ResolveAndValidateArguments(
 StatusOr<std::unique_ptr<HloModuleConfig>> Service::CreateModuleConfig(
     const ProgramShape& program_shape,
     tensorflow::gtl::ArraySlice<const Shape*> argument_shapes,
-    const ExecutionOptions* execution_options) {
+    const ExecutionOptions* execution_options,
+    const UserComputation& user_computation) {
   auto config = MakeUnique<HloModuleConfig>(program_shape);
   auto* computation_layout = config->mutable_entry_computation_layout();
 
@@ -275,8 +278,10 @@ StatusOr<std::unique_ptr<HloModuleConfig>> Service::CreateModuleConfig(
     // ProgramShape.
     if (!ShapeUtil::Compatible(*argument_shapes[i],
                                program_shape.parameters(i))) {
-      return InvalidArgument(
-          "computation expects parameter %d to have shape %s, given shape %s",
+      return InvalidParameterArgument(
+          *user_computation.ParameterMetadata(i).value(),
+          "Argument does not match shape of computation parameter %d: want %s, "
+          "got %s",
           i, ShapeUtil::HumanString(program_shape.parameters(i)).c_str(),
           ShapeUtil::HumanString(*argument_shapes[i]).c_str());
     }
@@ -318,19 +323,22 @@ StatusOr<std::unique_ptr<HloModuleConfig>> Service::CreateModuleConfig(
 StatusOr<std::unique_ptr<HloModuleConfig>> Service::CreateModuleConfig(
     const ProgramShape& program_shape,
     tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
-    const ExecutionOptions& execution_options) {
+    const ExecutionOptions& execution_options,
+    const UserComputation& user_computation) {
   std::vector<const Shape*> argument_shapes;
   for (const auto* arg : arguments) {
     argument_shapes.push_back(&arg->on_host_shape());
   }
-  return CreateModuleConfig(program_shape, argument_shapes, &execution_options);
+  return CreateModuleConfig(program_shape, argument_shapes, &execution_options,
+                            user_computation);
 }
 
 StatusOr<std::vector<std::unique_ptr<Executable>>> Service::BuildExecutables(
     std::vector<VersionedComputationHandle> versioned_handles,
     std::vector<std::unique_ptr<HloModuleConfig>> module_configs,
     Backend* backend,
-    std::vector<std::vector<perftools::gputools::StreamExecutor*>> executors) {
+    std::vector<std::vector<perftools::gputools::StreamExecutor*>> executors,
+    DeviceMemoryAllocator* device_allocator) {
   VLOG(1) << Printf("BuildExecutable on service %p", this);
 
   // Dump computation proto state if flag is set.
@@ -376,7 +384,8 @@ StatusOr<std::vector<std::unique_ptr<Executable>>> Service::BuildExecutables(
 
   TF_ASSIGN_OR_RETURN(
       std::vector<std::unique_ptr<Executable>> executables,
-      backend->compiler()->Compile(std::move(modules), std::move(executors)));
+      backend->compiler()->Compile(std::move(modules), std::move(executors),
+                                   device_allocator));
 
   for (size_t i = 0; i < versioned_handles.size(); ++i) {
     if (!module_configs[i]->debug_options().xla_dump_executions_to().empty()) {
@@ -389,8 +398,8 @@ StatusOr<std::vector<std::unique_ptr<Executable>>> Service::BuildExecutables(
 
 StatusOr<std::unique_ptr<Executable>> Service::BuildExecutable(
     const VersionedComputationHandle& versioned_handle,
-    std::unique_ptr<HloModuleConfig> module_config,
-    Backend* backend, se::StreamExecutor* executor) {
+    std::unique_ptr<HloModuleConfig> module_config, Backend* backend,
+    se::StreamExecutor* executor, DeviceMemoryAllocator* device_allocator) {
   VLOG(1) << Printf("BuildExecutable on service %p with handle %s", this,
                     versioned_handle.ToString().c_str());
 
@@ -423,11 +432,12 @@ StatusOr<std::unique_ptr<Executable>> Service::BuildExecutable(
   TF_RETURN_IF_ERROR(MaybeDumpHloModule(*module));
 
   TF_ASSIGN_OR_RETURN(
-      module, backend->compiler()->RunHloPasses(std::move(module), executor));
+      module, backend->compiler()->RunHloPasses(std::move(module), executor,
+                                                device_allocator));
 
-  TF_ASSIGN_OR_RETURN(
-      std::unique_ptr<Executable> executable,
-      backend->compiler()->RunBackend(std::move(module), executor));
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<Executable> executable,
+                      backend->compiler()->RunBackend(
+                          std::move(module), executor, device_allocator));
 
   if (!other_directory_path.empty()) {
     executable->set_session_module(std::move(session_module));
@@ -438,9 +448,9 @@ StatusOr<std::unique_ptr<Executable>> Service::BuildExecutable(
 
 StatusOr<std::shared_ptr<Executable>> Service::BuildAndCacheExecutable(
     const VersionedComputationHandle& versioned_handle,
-    std::unique_ptr<HloModuleConfig> module_config,
-    Backend* backend, perftools::gputools::StreamExecutor* executor,
-    ExecutionProfile* profile) {
+    std::unique_ptr<HloModuleConfig> module_config, Backend* backend,
+    perftools::gputools::StreamExecutor* executor, ExecutionProfile* profile,
+    DeviceMemoryAllocator* device_allocator) {
   std::shared_ptr<Executable> executable =
       compilation_cache_.LookUp(versioned_handle, *module_config);
 
@@ -462,7 +472,7 @@ StatusOr<std::shared_ptr<Executable>> Service::BuildAndCacheExecutable(
   TF_ASSIGN_OR_RETURN(
       std::unique_ptr<Executable> executable_unique_ptr,
       BuildExecutable(versioned_handle, std::move(module_config), backend,
-                      executor));
+                      executor, device_allocator));
 
   if (profile != nullptr) {
     uint64 end_micros = tensorflow::Env::Default()->NowMicros();
@@ -569,7 +579,7 @@ Service::ExecuteParallelAndRegisterResult(
     se::Stream* stream = index_to_profiled_stream.second;
     Executable* executable = executables[device];
     const HloModule& module = executable->module();
-    HloExecutionProfile hlo_profile(&executable->hlo_profile_printer(),
+    HloExecutionProfile hlo_profile(&executable->hlo_profile_printer_data(),
                                     &executable->hlo_profile_index_map());
     TF_RETURN_IF_ERROR(
         executable->PopulateExecutionProfile(&hlo_profile, stream->parent()));
@@ -742,9 +752,10 @@ tensorflow::Status Service::ExecuteParallel(const ExecuteParallelRequest* arg,
 
     // Create an HloModuleConfig object for the computation, given the shape of
     // the program and the argument allocations.
-    TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModuleConfig> module_config,
-                        CreateModuleConfig(*program_shape, arguments,
-                                           request.execution_options()));
+    TF_ASSIGN_OR_RETURN(
+        std::unique_ptr<HloModuleConfig> module_config,
+        CreateModuleConfig(*program_shape, arguments,
+                           request.execution_options(), *user_computation));
     VLOG(3) << "ExecuteParallel created HloModuleConfig computation layout: "
             << module_config->entry_computation_layout().ToString();
 
@@ -763,10 +774,14 @@ tensorflow::Status Service::ExecuteParallel(const ExecuteParallelRequest* arg,
 
   // Build the user computations into HloModules and compile to generate the
   // executables.
+  //
+  // TODO(jlebar): There's currently no way to pass a device allocator to
+  // ExecuteParallel, so we have to pass a null device_allocator below.
   TF_ASSIGN_OR_RETURN(
       std::vector<std::unique_ptr<Executable>> executables,
       BuildExecutables(versioned_handles, std::move(module_configs),
-                       execute_backend_.get(), all_executors));
+                       execute_backend_.get(), all_executors,
+                       /*device_allocator=*/nullptr));
   std::vector<Executable*> executable_ptrs;
   executable_ptrs.reserve(executables.size());
   for (const auto& executable : executables) {
@@ -852,7 +867,8 @@ tensorflow::Status Service::Execute(const ExecuteRequest* arg,
 
   TF_ASSIGN_OR_RETURN(
       std::unique_ptr<HloModuleConfig> module_config,
-      CreateModuleConfig(*program_shape, arguments, arg->execution_options()));
+      CreateModuleConfig(*program_shape, arguments, arg->execution_options(),
+                         *user_computation));
 
   VLOG(3) << "Execute created HloModuleConfig computation layout: "
           << module_config->entry_computation_layout().ToString();
@@ -916,7 +932,8 @@ tensorflow::Status Service::ExecuteAsync(const ExecuteAsyncRequest* arg,
 
   TF_ASSIGN_OR_RETURN(
       std::unique_ptr<HloModuleConfig> module_config,
-      CreateModuleConfig(*program_shape, arguments, arg->execution_options()));
+      CreateModuleConfig(*program_shape, arguments, arg->execution_options(),
+                         *user_computation));
 
   VLOG(3) << "ExecuteAsync created HloModuleConfig computation layout: "
           << module_config->entry_computation_layout().ToString();
@@ -1236,7 +1253,8 @@ tensorflow::Status Service::ComputeConstant(const ComputeConstantRequest* arg,
   }
 
   TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModuleConfig> module_config,
-                      CreateModuleConfig(program_shape, {}, execution_options));
+                      CreateModuleConfig(program_shape, {}, execution_options,
+                                         *user_computation));
 
   // Exclude dead parameter instructions for the purpose of computing constants.
   TF_ASSIGN_OR_RETURN(
@@ -1601,14 +1619,14 @@ StatusOr<std::vector<perftools::gputools::StreamExecutor*>> Service::Replicas(
 }
 
 Status Service::MaybeDumpHloModule(const HloModule& module) const {
-  const string xla_dump_prepass_hlo_proto_to =
-      module.config().debug_options().xla_dump_prepass_hlo_proto_to();
-  if (xla_dump_prepass_hlo_proto_to.empty()) {
+  const string xla_dump_unoptimized_hlo_proto_to =
+      module.config().debug_options().xla_dump_unoptimized_hlo_proto_to();
+  if (xla_dump_unoptimized_hlo_proto_to.empty()) {
     return Status::OK();
   }
   HloProto proto = MakeHloProto(module);
   return protobuf_util::DumpProtoToDirectory(
-      proto, xla_dump_prepass_hlo_proto_to, module.name());
+      proto, xla_dump_unoptimized_hlo_proto_to, module.name());
 }
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/service.h b/tensorflow/compiler/xla/service/service.h
index 0a7d0b3a7d25a1b046852c87d8463d0169080a5e..6ce241971156599aaa25aea1b0caac0e1bd5379c 100644
--- a/tensorflow/compiler/xla/service/service.h
+++ b/tensorflow/compiler/xla/service/service.h
@@ -251,7 +251,8 @@ class Service : public ServiceInterface {
   StatusOr<std::unique_ptr<HloModuleConfig>> CreateModuleConfig(
       const ProgramShape& program_shape,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
-      const ExecutionOptions& execution_options);
+      const ExecutionOptions& execution_options,
+      const UserComputation& user_computation);
 
  protected:
   friend class LocalExecutable;
@@ -275,13 +276,19 @@ class Service : public ServiceInterface {
   StatusOr<std::unique_ptr<HloModuleConfig>> CreateModuleConfig(
       const ProgramShape& program_shape,
       tensorflow::gtl::ArraySlice<const Shape*> argument_shapes,
-      const ExecutionOptions* execution_options);
+      const ExecutionOptions* execution_options,
+      const UserComputation& user_computation);
 
   // Builds an Executable for the given parameters.
+  //
+  // If device_allocator is not null, the compiler may use it to allocate temp
+  // buffers, which the compiler is responsible for freeing.  The allocator
+  // given here need not match the allocator used when running the executable.
   StatusOr<std::unique_ptr<Executable>> BuildExecutable(
       const VersionedComputationHandle& versioned_handle,
-      std::unique_ptr<HloModuleConfig> module_config,
-      Backend* backend, perftools::gputools::StreamExecutor* executor);
+      std::unique_ptr<HloModuleConfig> module_config, Backend* backend,
+      perftools::gputools::StreamExecutor* executor,
+      DeviceMemoryAllocator* device_allocator = nullptr);
 
   // Same as BuildExecutable() above, but builds a list of Executables for the
   // given computations that may interact with each other.
@@ -289,16 +296,17 @@ class Service : public ServiceInterface {
       std::vector<VersionedComputationHandle> versioned_handles,
       std::vector<std::unique_ptr<HloModuleConfig>> module_configs,
       Backend* backend,
-      std::vector<std::vector<perftools::gputools::StreamExecutor*>> executors);
+      std::vector<std::vector<perftools::gputools::StreamExecutor*>> executors,
+      DeviceMemoryAllocator* device_allocator);
 
   // Similar to BuildExecutable, but look in the compilation cache for the
   // executable first. If the executable is not in the cache, it is built and
   // inserted into the cache.
   StatusOr<std::shared_ptr<Executable>> BuildAndCacheExecutable(
       const VersionedComputationHandle& versioned_handle,
-      std::unique_ptr<HloModuleConfig> module_config,
-      Backend* backend, perftools::gputools::StreamExecutor* executor,
-      ExecutionProfile* profile);
+      std::unique_ptr<HloModuleConfig> module_config, Backend* backend,
+      perftools::gputools::StreamExecutor* executor, ExecutionProfile* profile,
+      DeviceMemoryAllocator* device_allocator = nullptr);
 
   // Runs the given executable with the given arguments and register the result
   // in the allocation tracker. The handle of the result from the tracker is
diff --git a/tensorflow/compiler/xla/service/shape_inference.cc b/tensorflow/compiler/xla/service/shape_inference.cc
index a6d6c8b27f81045a4bee09e056c5c8f8e8a330c7..4ba6da6ccc44be8f3c70d2af80b30f0b2e388c2a 100644
--- a/tensorflow/compiler/xla/service/shape_inference.cc
+++ b/tensorflow/compiler/xla/service/shape_inference.cc
@@ -37,6 +37,9 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/protobuf.h"
 
+using tensorflow::str_util::Join;
+using tensorflow::strings::Printf;
+
 namespace xla {
 
 namespace {
@@ -934,7 +937,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
       "inferring shape for <%s>(%s, %s) with broadcast_dimensions={%s}",
       BinaryOperation_Name(operation).c_str(),
       ShapeUtil::HumanString(lhs).c_str(), ShapeUtil::HumanString(rhs).c_str(),
-      tensorflow::str_util::Join(broadcast_dimensions, ", ").c_str());
+      Join(broadcast_dimensions, ", ").c_str());
   TF_DCHECK_OK(ShapeUtil::ValidateShapeWithOptionalLayout(lhs));
   TF_DCHECK_OK(ShapeUtil::ValidateShapeWithOptionalLayout(rhs));
 
@@ -1097,7 +1100,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
     return InvalidArgument(
         "Map operation requires all operands to have the same shape; got: "
         "%s",
-        tensorflow::str_util::Join(pieces, ", ").c_str());
+        Join(pieces, ", ").c_str());
   }
 
   // Check that dimensions.size == arg_shape.dimensions_size() (we currently
@@ -1114,7 +1117,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
     if (dimensions[i] != i) {
       return InvalidArgument(
           "Map requires monotonically increasing dimension numbers, found: %s ",
-          tensorflow::str_util::Join(dimensions, ", ").c_str());
+          Join(dimensions, ", ").c_str());
     }
   }
 
@@ -1914,21 +1917,28 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
     const Shape& arg, tensorflow::gtl::ArraySlice<int64> starts,
     tensorflow::gtl::ArraySlice<int64> limits,
     tensorflow::gtl::ArraySlice<int64> strides) {
+  auto error = [&](const string& message) {
+    return InvalidArgument(
+        "%s in slice operation; argument shape: %s; starts: {%s}; limits: "
+        "{%s}; strides: {%s}",
+        message.c_str(), ShapeUtil::HumanString(arg).c_str(),
+        Join(starts, ",").c_str(), Join(limits, ",").c_str(),
+        Join(strides, ",").c_str());
+  };
   TF_RETURN_IF_ERROR(ExpectNotTupleOrOpaque(arg, "operand of slice"));
   VLOG(2) << tensorflow::strings::Printf(
       "slicing shape %s starts={%s} limits={%s}",
-      ShapeUtil::HumanString(arg).c_str(),
-      tensorflow::str_util::Join(starts, ", ").c_str(),
-      tensorflow::str_util::Join(limits, ", ").c_str());
+      ShapeUtil::HumanString(arg).c_str(), Join(starts, ", ").c_str(),
+      Join(limits, ", ").c_str());
 
   if (starts.size() != limits.size()) {
-    return InvalidArgument("slice start and limit sizes differ: %zu vs %zu",
-                           starts.size(), limits.size());
+    return error(Printf("slice start and limit sizes differ: %zu vs %zu",
+                        starts.size(), limits.size()));
   }
 
   if (starts.size() != strides.size()) {
-    return InvalidArgument("slice start and strides sizes differ: %zu vs %zu",
-                           starts.size(), strides.size());
+    return error(Printf("slice start and strides sizes differ: %zu vs %zu",
+                        starts.size(), strides.size()));
   }
 
   if (starts.size() != ShapeUtil::Rank(arg)) {
@@ -1947,20 +1957,20 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
                              start_index);
     }
     if (limit_index > arg.dimensions(dimension)) {
-      return InvalidArgument(
-          "limit index (%lld) must be less than or equal to dimension "
-          "size (%lld)",
-          limit_index, arg.dimensions(dimension));
+      return error(
+          Printf("limit index (%lld) must be less than or equal to dimension "
+                 "size (%lld)",
+                 limit_index, arg.dimensions(dimension)));
     }
     VLOG(2) << tensorflow::strings::Printf("starts[%lld] = %lld", dimension,
                                            start_index);
     VLOG(2) << tensorflow::strings::Printf("limits[%lld] = %lld", dimension,
                                            limit_index);
     if (start_index > limit_index) {
-      return InvalidArgument(
-          "limit index (%lld) must be greater or equal to "
-          "start index (%lld) in slice with positive stride",
-          limit_index, start_index);
+      return error(
+          Printf("limit index (%lld) must be greater or equal to "
+                 "start index (%lld) in slice with positive stride",
+                 limit_index, start_index));
     }
     if (stride <= 0) {
       return InvalidArgument("stride (%lld) must be positive", stride);
@@ -1983,7 +1993,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
       "slicing shape %s at dynamic start_indices %s with slice_sizes={%s}",
       ShapeUtil::HumanString(operand_shape).c_str(),
       ShapeUtil::HumanString(start_indices_shape).c_str(),
-      tensorflow::str_util::Join(slice_sizes, ", ").c_str());
+      Join(slice_sizes, ", ").c_str());
 
   if (ShapeUtil::Rank(start_indices_shape) != 1) {
     return InvalidArgument(
@@ -2280,8 +2290,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
     return InvalidArgument(
         "Reshape dimensions [%s] are not a permutation of the operand "
         "dimensions (operand shape is %s).",
-        tensorflow::str_util::Join(dimensions, ",").c_str(),
-        ShapeUtil::HumanString(operand).c_str());
+        Join(dimensions, ",").c_str(), ShapeUtil::HumanString(operand).c_str());
   }
 
   return inferred_shape;
@@ -2373,8 +2382,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
   // The applied function's arity equals the number of arguments.
   if (arg_shapes.size() != to_apply.parameters_size()) {
     string computation_signature = ShapeUtil::HumanString(to_apply);
-    string argument_shapes = tensorflow::str_util::Join(
-        arg_shapes, ", ", [](string* out, const Shape* shape) {
+    string argument_shapes =
+        Join(arg_shapes, ", ", [](string* out, const Shape* shape) {
           tensorflow::strings::StrAppend(out, ShapeUtil::HumanString(*shape));
         });
     return InvalidArgument(
diff --git a/tensorflow/compiler/xla/service/shape_inference_test.cc b/tensorflow/compiler/xla/service/shape_inference_test.cc
index 99d87f3b550ae72befe254f23fad080dd210aaf4..026c021165785bd3945d6a846dae446ad45da9b7 100644
--- a/tensorflow/compiler/xla/service/shape_inference_test.cc
+++ b/tensorflow/compiler/xla/service/shape_inference_test.cc
@@ -1512,5 +1512,20 @@ TEST_F(ShapeInferenceTest, Conditional) {
                         "must have the same shape"));
 }
 
+TEST_F(ShapeInferenceTest, BadSlice) {
+  auto arg = ShapeUtil::MakeShape(F32, {4});
+  StatusOr<Shape> statusor =
+      ShapeInference::InferSliceShape(arg, {0}, {5}, {1});
+  ASSERT_FALSE(statusor.ok());
+
+  LOG(INFO) << statusor.status();
+
+  EXPECT_THAT(statusor.status().error_message(),
+              HasSubstr("less than or equal to dimension size"))
+      << statusor.status();
+  EXPECT_THAT(statusor.status().error_message(), HasSubstr("argument shape"))
+      << statusor.status();
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/source_map_util.cc b/tensorflow/compiler/xla/service/source_map_util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8cbaac7b3760717bcacb57adc8782a5755c0aa6d
--- /dev/null
+++ b/tensorflow/compiler/xla/service/source_map_util.cc
@@ -0,0 +1,66 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/source_map_util.h"
+
+#include "tensorflow/compiler/xla/util.h"
+
+namespace xla {
+namespace source_map_util {
+namespace {
+
+Status InvalidParameterArgumentV(const OpMetadata& op_metadata,
+                                 const char* format, va_list args) {
+  string message;
+  tensorflow::strings::Appendv(&message, format, args);
+  if (!op_metadata.source_file().empty()) {
+    tensorflow::strings::Appendf(&message, " (%s:%d)",
+                                 op_metadata.source_file().c_str(),
+                                 op_metadata.source_line());
+  }
+  return InvalidArgument("%s", message.c_str());
+}
+
+}  // namespace
+
+Status InvalidParameterArgument(const OpMetadata& op_metadata,
+                                const char* format, ...) {
+  va_list args;
+  va_start(args, format);
+  Status result = InvalidParameterArgumentV(op_metadata, format, args);
+  va_end(args);
+  return result;
+}
+
+Status InvalidParameterArgument(Executable* executable, int parameter_number,
+                                const char* format, ...) {
+  va_list args;
+  va_start(args, format);
+  if (executable != nullptr && executable->has_module()) {
+    const HloModule& module = executable->module();
+    const HloComputation& computation = *module.entry_computation();
+    HloInstruction* param = computation.parameter_instruction(parameter_number);
+    const OpMetadata& metadata = param->metadata();
+    Status result = InvalidParameterArgumentV(metadata, format, args);
+    va_end(args);
+    return result;
+  }
+  Status result = InvalidArgumentV(format, args);
+  va_end(args);
+  return result;
+}
+
+}  // namespace source_map_util
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/source_map_util.h b/tensorflow/compiler/xla/service/source_map_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..a776d745f4e56ca4f3d2480740259832bbc85011
--- /dev/null
+++ b/tensorflow/compiler/xla/service/source_map_util.h
@@ -0,0 +1,46 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SOURCE_MAP_UTIL_H_
+#define TENSORFLOW_COMPILER_XLA_SOURCE_MAP_UTIL_H_
+
+#include "tensorflow/compiler/xla/service/executable.h"
+#include "tensorflow/compiler/xla/status.h"
+#include "tensorflow/core/platform/macros.h"
+
+namespace xla {
+namespace source_map_util {
+
+// Creates an INVALID_ARUGMENT status with the given format string.
+//
+// Also, attempts to extract the OpMetadata for parameter_number on executable
+// and append it to the status message for source mapping to user code.
+//
+// executable may be nullptr, but parameter_number should not be out of bounds
+// or a CHECK-failure may occur.
+Status InvalidParameterArgument(Executable* executable, int parameter_number,
+                                const char* format, ...)
+    TF_PRINTF_ATTRIBUTE(3, 4);
+
+// As above, but takes the parameter metadata directly instead of extracting it
+// from the executable.
+Status InvalidParameterArgument(const OpMetadata& op_metadata,
+                                const char* format, ...)
+    TF_PRINTF_ATTRIBUTE(2, 3);
+
+}  // namespace source_map_util
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SOURCE_MAP_UTIL_H_
diff --git a/tensorflow/compiler/xla/shape_util.cc b/tensorflow/compiler/xla/shape_util.cc
index cba73322fa924785fbc73a4e931b5f27227d89b9..d63e16ce2bf51cff0d113640d31ec6e70bfaf421 100644
--- a/tensorflow/compiler/xla/shape_util.cc
+++ b/tensorflow/compiler/xla/shape_util.cc
@@ -475,8 +475,6 @@ StatusOr<PrimitiveType> StringToPrimitiveType(const string& name) {
       if (LayoutUtil::HasLayout(shape)) {
         tensorflow::strings::StrAppend(&result,
                                        LayoutUtil::HumanString(shape.layout()));
-      } else {
-        tensorflow::strings::StrAppend(&result, "{no layout}");
       }
     }
     return result;
diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD
index 6ab406d6c3ad8b50f28dcace286958f15e68d621..d4820d1b6d6d8c848794b120a9064609ba72e5a9 100644
--- a/tensorflow/compiler/xla/tests/BUILD
+++ b/tensorflow/compiler/xla/tests/BUILD
@@ -351,6 +351,7 @@ xla_test(
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/service:platform_util",
@@ -577,6 +578,7 @@ xla_test(
 xla_test(
     name = "reduce_precision_test",
     srcs = ["reduce_precision_test.cc"],
+    tags = ["enable_for_xla_interpreter"],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:literal_util",
@@ -848,7 +850,8 @@ xla_test(
     name = "half_test",
     srcs = ["half_test.cc"],
     backends = [
-        "cpu",
+        # TODO(b/72509305): Flaky (fails with SEGV) as of 2018-01-25
+        # "cpu",
         "gpu",
     ],
     deps = [
@@ -1034,6 +1037,10 @@ xla_test(
     name = "select_and_scatter_test",
     timeout = "long",
     srcs = ["select_and_scatter_test.cc"],
+    tags = [
+        "enable_for_xla_interpreter",
+        "optonly",
+    ],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:literal_util",
@@ -1072,6 +1079,19 @@ xla_test(
     ],
 )
 
+xla_test(
+    name = "reduce_hlo_test",
+    srcs = ["reduce_hlo_test.cc"],
+    deps = [
+        ":client_library_test_base",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/compiler/xla/tools/parser:hlo_parser",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+    ],
+)
+
 xla_test(
     name = "call_test",
     srcs = ["call_test.cc"],
diff --git a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
index 56fc21d019bb823f8f4631420a15fd607ef46a9a..52e14a1f7b9e241b9125e6086b359e7365e221bf 100644
--- a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
+++ b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
@@ -1893,6 +1893,26 @@ XLA_TEST_F(ArrayElementwiseOpTest, ClampF32ScalarVector) {
                              error_spec_);
 }
 
+XLA_TEST_F(ArrayElementwiseOpTest, ClampS32Vector) {
+  ComputationBuilder builder(client_, TestName());
+  auto min_vector = builder.ConstantR1<int32>({1, -6, 1, 2, 0, -5});
+  auto arg_vector = builder.ConstantR1<int32>({2, 10, -5, 1, 4, 10});
+  auto max_vector = builder.ConstantR1<int32>({3, 0, 25, 5, 123, -1});
+  auto clamp = builder.Clamp(min_vector, arg_vector, max_vector);
+
+  ComputeAndCompareR1<int32>(&builder, {2, 0, 1, 2, 4, -1}, {});
+}
+
+XLA_TEST_F(ArrayElementwiseOpTest, ClampU32Vector) {
+  ComputationBuilder builder(client_, TestName());
+  auto min_vector = builder.ConstantR1<uint32>({1, 2, 1, 2, 0, ~0u - 4});
+  auto arg_vector = builder.ConstantR1<uint32>({2, 10, 5, 1, 4, 10});
+  auto max_vector = builder.ConstantR1<uint32>({3, 5, 25, 5, 123, ~0u});
+  auto clamp = builder.Clamp(min_vector, arg_vector, max_vector);
+
+  ComputeAndCompareR1<uint32>(&builder, {2, 5, 5, 2, 4, ~0u - 4}, {});
+}
+
 XLA_TEST_F(ArrayElementwiseOpTest, AddTwoParametersF32s) {
   ComputationBuilder builder(client_, TestName());
 
diff --git a/tensorflow/compiler/xla/tests/check_execution_arity_test.cc b/tensorflow/compiler/xla/tests/check_execution_arity_test.cc
index 659660d91e519b428d28ced8591d05b4e4d45f53..f594cc10ac6496f710d03f0b0b134e6dd3b6d38f 100644
--- a/tensorflow/compiler/xla/tests/check_execution_arity_test.cc
+++ b/tensorflow/compiler/xla/tests/check_execution_arity_test.cc
@@ -104,7 +104,8 @@ XLA_TEST_F(CheckExecutionArityTest, CheckArgumentShapes) {
   ASSERT_FALSE(status.ok());
   ASSERT_EQ(status.status().code(), tensorflow::error::INVALID_ARGUMENT);
   ASSERT_THAT(status.status().error_message(),
-              ContainsRegex("expects parameter 0"));
+              ContainsRegex(
+                  "Argument does not match shape of computation parameter 0"));
 
   // Shape mismatch in parameter 1 (rank)
   status = client_->Execute(computation, {f32_data.get(), f32_data.get()},
@@ -112,7 +113,8 @@ XLA_TEST_F(CheckExecutionArityTest, CheckArgumentShapes) {
   ASSERT_FALSE(status.ok());
   ASSERT_EQ(status.status().code(), tensorflow::error::INVALID_ARGUMENT);
   ASSERT_THAT(status.status().error_message(),
-              ContainsRegex("expects parameter 1"));
+              ContainsRegex(
+                  "Argument does not match shape of computation parameter 1"));
 
   // Shape mismatch in parameter 1 (element type)
   status = client_->Execute(computation, {f32_data.get(), u8_4_data.get()},
@@ -120,7 +122,8 @@ XLA_TEST_F(CheckExecutionArityTest, CheckArgumentShapes) {
   ASSERT_FALSE(status.ok());
   ASSERT_EQ(status.status().code(), tensorflow::error::INVALID_ARGUMENT);
   ASSERT_THAT(status.status().error_message(),
-              ContainsRegex("expects parameter 1"));
+              ContainsRegex(
+                  "Argument does not match shape of computation parameter 1"));
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/xla/tests/codegen_test_base.cc b/tensorflow/compiler/xla/tests/codegen_test_base.cc
index e472408dcf7ed5fec74e886fd0092ce47ee2e7eb..022641394f113ef28e7c53058385d77572822213 100644
--- a/tensorflow/compiler/xla/tests/codegen_test_base.cc
+++ b/tensorflow/compiler/xla/tests/codegen_test_base.cc
@@ -21,9 +21,11 @@ StatusOr<std::unique_ptr<Executable>> CodegenTestBase::CompileToExecutable(
     std::unique_ptr<HloModule> hlo_module) {
   TF_ASSIGN_OR_RETURN(hlo_module, backend().compiler()->RunHloPasses(
                                       std::move(hlo_module),
-                                      backend().default_stream_executor()));
+                                      backend().default_stream_executor(),
+                                      /*device_allocator=*/nullptr));
   return backend().compiler()->RunBackend(std::move(hlo_module),
-                                          backend().default_stream_executor());
+                                          backend().default_stream_executor(),
+                                          /*device_allocator=*/nullptr);
 }
 
 StatusOr<std::unique_ptr<AotCompilationResult>>
diff --git a/tensorflow/compiler/xla/tests/conditional_test.cc b/tensorflow/compiler/xla/tests/conditional_test.cc
index 0016b6cc614469d7ac9b40b740d163a7a4f32abf..bc821674820fb128823786d7149037fc59b22ab6 100644
--- a/tensorflow/compiler/xla/tests/conditional_test.cc
+++ b/tensorflow/compiler/xla/tests/conditional_test.cc
@@ -355,8 +355,7 @@ XLA_TEST_F(ConditionalOpTest, ReturnTupleOfScalars) {
 }
 
 // Test true and false computations that return a tuple of arrays.
-// TODO(b/71715476): Returning tuples from Conditional fails in GPU backend.
-XLA_TEST_F(ConditionalOpTest, DISABLED_ON_GPU(ReturnTupleOfArrays)) {
+XLA_TEST_F(ConditionalOpTest, ReturnTupleOfArrays) {
   ComputationBuilder builder(client_, TestName());
   auto pred = builder.ConstantR0<bool>(true);
   auto operands = builder.Tuple({builder.ConstantR1<float>({12.2f, 15.8f}),
@@ -373,9 +372,7 @@ XLA_TEST_F(ConditionalOpTest, DISABLED_ON_GPU(ReturnTupleOfArrays)) {
 
 // Test true and false computations that return a tuple of a predicate, a
 // scalar, and an array.
-// TODO(b/71715476): Returning tuples from Conditional fails in GPU backend.
-XLA_TEST_F(ConditionalOpTest,
-           DISABLED_ON_GPU(ReturnTupleofPredicateScalarArray)) {
+XLA_TEST_F(ConditionalOpTest, ReturnTupleofPredicateScalarArray) {
   ComputationBuilder true_builder(client_, TestName() + ".true");
   {
     true_builder.Parameter(0, empty_tuple_, "tuple");
@@ -413,8 +410,7 @@ XLA_TEST_F(ConditionalOpTest,
 }
 
 // Test true and false computations that return a nested tuple.
-// TODO(b/71715476): Returning tuples from Conditional fails in GPU backend.
-XLA_TEST_F(ConditionalOpTest, DISABLED_ON_GPU(ReturnNestedTuple)) {
+XLA_TEST_F(ConditionalOpTest, ReturnNestedTuple) {
   ComputationBuilder true_builder(client_, TestName() + ".true");
   {
     true_builder.Parameter(0, empty_tuple_, "tuple");
@@ -532,6 +528,32 @@ XLA_TEST_F(ConditionalOpTest, NestedConditionals) {
   ComputeAndCompareR0<float>(&builder, 12.0f, {}, error_spec_);
 }
 
+XLA_TEST_F(ConditionalOpTest, ConditionalInNestedComputation) {
+  ComputationBuilder inner_builder(client_, TestName() + ".inner_conditional");
+  {
+    Shape r0bool = ShapeUtil::MakeShape(PRED, {});
+    Shape tuple_shape = ShapeUtil::MakeTupleShape({r0bool, r0f32_, r0f32_});
+    auto param0 = inner_builder.Parameter(0, tuple_shape, "param0");
+    auto pred_cond = inner_builder.GetTupleElement(param0, 0);
+    auto true_operand = inner_builder.GetTupleElement(param0, 1);
+    auto false_operand = inner_builder.GetTupleElement(param0, 2);
+    inner_builder.Conditional(pred_cond, true_operand,
+                              CreateR0CeilComputation(), false_operand,
+                              CreateR0FloorComputation());
+  }
+  auto inner_builder_result = inner_builder.Build();
+  EXPECT_IS_OK(inner_builder_result.status());
+
+  ComputationBuilder builder(client_, TestName());
+  auto pred2 = builder.ConstantR0<bool>(false);
+  auto operand1 = builder.ConstantR0<float>(1.1f);
+  auto operand2 = builder.ConstantR0<float>(12.2f);
+  auto tuple_operand = builder.Tuple({pred2, operand1, operand2});
+  builder.Call(inner_builder_result.ConsumeValueOrDie(), {tuple_operand});
+
+  ComputeAndCompareR0<float>(&builder, 12.0f, {}, error_spec_);
+}
+
 // Test a mismatch in the shape of the true operand and true computation.
 XLA_TEST_F(ConditionalOpTest, ShapeMismatch) {
   ComputationBuilder builder(client_, TestName());
diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.cc b/tensorflow/compiler/xla/tests/hlo_test_base.cc
index 7c1a993b478a0e0878e85c0e4192da053e33619f..9f5806c5e16c30cf198027cffab5f78c315cb957 100644
--- a/tensorflow/compiler/xla/tests/hlo_test_base.cc
+++ b/tensorflow/compiler/xla/tests/hlo_test_base.cc
@@ -230,7 +230,7 @@ template <typename LiteralPtr>
     const string& filename, const tensorflow::gtl::optional<ErrorSpec>& error,
     const std::function<void(HloModule*)>& reference_preprocessor) {
   auto module_or_status =
-      HloRunner::ReadModule(filename, GetDebugOptionsForTest());
+      HloRunner::ReadModuleFromHloTextFile(filename, GetDebugOptionsForTest());
   if (!module_or_status.ok()) {
     return ::testing::AssertionFailure()
            << "failed reading hlo module from file";
@@ -258,7 +258,7 @@ template <typename LiteralPtr>
     const string& filename, const tensorflow::gtl::optional<ErrorSpec>& error,
     const std::function<void(HloModule*)>& reference_preprocessor) {
   auto module_or_status =
-      HloRunner::ReadModule(filename, GetDebugOptionsForTest());
+      HloRunner::ReadModuleFromHloTextFile(filename, GetDebugOptionsForTest());
   if (!module_or_status.ok()) {
     return ::testing::AssertionFailure()
            << "failed reading hlo module from file";
diff --git a/tensorflow/compiler/xla/tests/literal_test_util.cc b/tensorflow/compiler/xla/tests/literal_test_util.cc
index f8205de702fb3534dcd7dbdce6ee0cbfb11d6ee4..474d2547aeba1ec478eb3aa0cacfc04d9dee142e 100644
--- a/tensorflow/compiler/xla/tests/literal_test_util.cc
+++ b/tensorflow/compiler/xla/tests/literal_test_util.cc
@@ -355,9 +355,9 @@ class NearComparator {
   // temporary files on failure. Returns true if  literals match.
   bool ExpectNear(const Literal& expected, const Literal& actual) {
     VLOG(1) << "expected:";
-    XLA_VLOG_LINES(1, expected.ToString());
+    XLA_VLOG_LINES(1, TruncateHugeLiteral(expected));
     VLOG(1) << "actual:";
-    XLA_VLOG_LINES(1, actual.ToString());
+    XLA_VLOG_LINES(1, TruncateHugeLiteral(actual));
 
     // If the shapes mismatch, we simply fail the expectation instead of
     // printing out data, as it's a type error rather than a value error.
@@ -376,7 +376,12 @@ class NearComparator {
     abs_expected_miscompare_sum_ = 0.0;
     max_rel_err_ = 0.0;
     max_abs_err_ = 0.0;
+    first_linear_index_ = -1;
+    last_linear_index_ = -1;
+    max_rel_linear_index_ = -1;
+    max_abs_linear_index_ = -1;
     miscompares_ = Literal(ShapeUtil::ChangeElementType(actual.shape(), PRED));
+    miscompares_.PopulateWithValue(false);
     multi_index_.resize(expected.shape().dimensions_size(), 0);
 
     switch (expected.shape().element_type()) {
@@ -404,21 +409,33 @@ class NearComparator {
     if (num_miscompares_ > 0) {
       if (!VLOG_IS_ON(1)) {
         LOG(INFO) << "expected: " << ShapeUtil::HumanString(expected.shape())
-                  << " " << expected.ToString();
+                  << " " << TruncateHugeLiteral(expected);
         LOG(INFO) << "actual:   " << ShapeUtil::HumanString(actual.shape())
-                  << " " << actual.ToString();
+                  << " " << TruncateHugeLiteral(actual);
+        LOG(INFO) << "Dumping literals to temp files...";
+        WriteLiteralToTempFile(expected, "expected");
+        WriteLiteralToTempFile(actual, "actual");
+        WriteLiteralToTempFile(miscompares_, "miscompares");
       }
       EXPECT_TRUE(num_miscompares_ == 0)
           << "\nmax relative mismatch at index "
-          << LiteralTestUtil::MultiIndexAsString(max_rel_multi_index_)
+          << LiteralTestUtil::MultiIndexAsString(
+                 IndexUtil::LinearIndexToMultidimensionalIndex(
+                     actual.shape(), max_rel_linear_index_))
           << "\nmaximum relative error " << max_rel_err_
           << "\nmax absolute mismatch at index "
-          << LiteralTestUtil::MultiIndexAsString(max_abs_multi_index_)
+          << LiteralTestUtil::MultiIndexAsString(
+                 IndexUtil::LinearIndexToMultidimensionalIndex(
+                     actual.shape(), max_abs_linear_index_))
           << "\nmaximum absolute error " << max_abs_err_
           << "\nfirst mismatch at index "
-          << LiteralTestUtil::MultiIndexAsString(first_multi_index_)
+          << LiteralTestUtil::MultiIndexAsString(
+                 IndexUtil::LinearIndexToMultidimensionalIndex(
+                     actual.shape(), first_linear_index_))
           << "\nlast mismatch at index "
-          << LiteralTestUtil::MultiIndexAsString(last_multi_index_)
+          << LiteralTestUtil::MultiIndexAsString(
+                 IndexUtil::LinearIndexToMultidimensionalIndex(
+                     actual.shape(), last_linear_index_))
           << "\ntotal absolute error " << abs_diff_sum_
           << "\ntotal absolute error of miscompares "
           << abs_diff_miscompare_sum_ << "\ntotal relative error "
@@ -426,10 +443,6 @@ class NearComparator {
           << "\ntotal relative error of miscompares "
           << (abs_diff_miscompare_sum_ / abs_expected_miscompare_sum_)
           << "\nfailure count " << num_miscompares_;
-
-      WriteLiteralToTempFile(expected, "expected");
-      WriteLiteralToTempFile(actual, "actual");
-      WriteLiteralToTempFile(miscompares_, "miscompares");
     }
     return num_miscompares_ == 0;
   }
@@ -457,57 +470,93 @@ class NearComparator {
       return true;
     }
 
-    float abs_diff = std::abs(actual - expected);
-    float rel_err = abs_diff / std::abs(expected);
+    const float abs_diff = std::abs(actual - expected);
+    const float rel_err = abs_diff / std::abs(expected);
+    const bool nan_mismatch = NanMismatch<NativeT>(expected, actual);
+    const bool mismatch =
+        (nan_mismatch || (abs_diff >= error_.abs && rel_err >= error_.rel));
+    return !mismatch;
+  }
+
+  // Assumes that expected vs actual fail ExpectValuesNear.
+  template <typename NativeT>
+  void UpdateAndLogMiscompares(const NativeT expected, const NativeT actual,
+                               const Shape& shape, const int64 linear_index) {
+    const float abs_diff = std::abs(actual - expected);
+    const float rel_err = abs_diff / std::abs(expected);
     abs_diff_sum_ += abs_diff;
     abs_expected_sum_ += std::abs(expected);
-    if (rel_err > max_rel_err_) {
+    if (rel_err > max_rel_err_ || std::isnan(rel_err)) {
       max_rel_err_ = rel_err;
-      max_rel_multi_index_ = multi_index_;
+      max_rel_linear_index_ = linear_index;
     }
-    if (abs_diff > max_abs_err_) {
+    if (abs_diff > max_abs_err_ || std::isnan(abs_diff)) {
       max_abs_err_ = abs_diff;
-      max_abs_multi_index_ = multi_index_;
+      max_abs_linear_index_ = linear_index;
     }
-    VLOG(10) << tensorflow::strings::Printf(
-        "index %s abs_diff %f rel_err %f",
-        LiteralTestUtil::MultiIndexAsString(multi_index_).c_str(), abs_diff,
-        rel_err);
-    bool nan_mismatch = NanMismatch<NativeT>(expected, actual);
-    bool mismatch =
-        (nan_mismatch || (abs_diff >= error_.abs && rel_err >= error_.rel));
-    if (mismatch) {
-      abs_diff_miscompare_sum_ += abs_diff;
-      abs_expected_miscompare_sum_ += std::abs(expected);
-      const int64 kMaxFailures = 2;
-      if (num_miscompares_ < kMaxFailures) {
-        ::testing::Message msg;
-        msg << "mismatch at index "
-            << LiteralTestUtil::MultiIndexAsString(multi_index_) << " abs diff "
-            << abs_diff << " rel err " << rel_err << " failure #"
-            << num_miscompares_;
-        ExpectNear<NativeT>(expected, actual, msg);
-      } else if (num_miscompares_ == kMaxFailures) {
-        LOG(ERROR)
-            << "reached max 'loud' failure count; silently proceeding...";
-      }
-      if (num_miscompares_ == 0) {
-        first_multi_index_ = multi_index_;
-      }
-      num_miscompares_++;
-      last_multi_index_ = multi_index_;
+    if (VLOG_IS_ON(10)) {
+      VLOG(10) << tensorflow::strings::Printf(
+          "index %s abs_diff %f rel_err %f",
+          LiteralTestUtil::MultiIndexAsString(
+              IndexUtil::LinearIndexToMultidimensionalIndex(shape,
+                                                            linear_index))
+              .c_str(),
+          abs_diff, rel_err);
     }
-    return !mismatch;
+    abs_diff_miscompare_sum_ += abs_diff;
+    abs_expected_miscompare_sum_ += std::abs(expected);
+    const int64 kMaxFailures = 2;
+    if (num_miscompares_ < kMaxFailures) {
+      const auto multi_index =
+          IndexUtil::LinearIndexToMultidimensionalIndex(shape, linear_index);
+      ::testing::Message msg;
+      msg << "mismatch at index "
+          << LiteralTestUtil::MultiIndexAsString(multi_index) << " abs diff "
+          << abs_diff << " rel err " << rel_err << " failure #"
+          << num_miscompares_;
+      ExpectNear<NativeT>(expected, actual, msg);
+    } else if (num_miscompares_ == kMaxFailures) {
+      LOG(ERROR) << "reached max 'loud' failure count; silently proceeding...";
+    }
+    if (num_miscompares_ == 0) {
+      first_linear_index_ = linear_index;
+    }
+    num_miscompares_++;
+    last_linear_index_ = linear_index;
+    miscompares_.data<bool>()[linear_index] = true;
   }
 
   // Recursive function which compares the two given literals elementwise.
   template <typename NativeT>
   void ExpectLiteralsNear(const Literal& expected, const Literal& actual,
                           int64 dimension) {
+    // Fast path optimization for the case were layouts match.
+    if (LayoutUtil::Equal(actual.shape().layout(), expected.shape().layout())) {
+      tensorflow::gtl::ArraySlice<const NativeT> expected_data =
+          expected.data<NativeT>();
+      tensorflow::gtl::ArraySlice<const NativeT> actual_data =
+          actual.data<NativeT>();
+      const int64 len = expected_data.size();
+      for (int64 i = 0; i < len; ++i) {
+        const bool near = ExpectValuesNear(expected_data[i], actual_data[i]);
+        if (!near) {
+          UpdateAndLogMiscompares<NativeT>(expected_data[i], actual_data[i],
+                                           actual.shape(), i);
+        }
+      }
+      return;
+    }
+
     if (dimension == expected.shape().dimensions_size()) {
       bool near = ExpectValuesNear(expected.Get<NativeT>(multi_index_),
                                    actual.Get<NativeT>(multi_index_));
-      miscompares_.Set<bool>(multi_index_, !near);
+      if (!near) {
+        UpdateAndLogMiscompares<NativeT>(
+            expected.Get<NativeT>(multi_index_),
+            actual.Get<NativeT>(multi_index_), actual.shape(),
+            IndexUtil::MultidimensionalIndexToLinearIndex(actual.shape(),
+                                                          multi_index_));
+      }
     } else {
       for (int64 i = 0; i < expected.shape().dimensions(dimension); ++i) {
         multi_index_[dimension] = i;
@@ -528,6 +577,32 @@ class NearComparator {
     LOG(ERROR) << "wrote to " << name << " file: " << filename;
   }
 
+  // Gets the total element count.  For tuples, this is not the count of tuple
+  // elements, but the sum of elements of each tuple element.
+  int64 RecursiveElementCount(const Shape& shape) {
+    if (ShapeUtil::IsTuple(shape)) {
+      const int64 tuple_elements = ShapeUtil::TupleElementCount(shape);
+      int64 total = 0;
+      for (int64 i = 0; i < tuple_elements; ++i) {
+        total +=
+            RecursiveElementCount(ShapeUtil::GetTupleElementShape(shape, i));
+      }
+      return total;
+    } else {
+      return ShapeUtil::ElementsIn(shape);
+    }
+  }
+
+  // Calling ToString on a literal with over 100 million elements takes around
+  // 3 minutes.  The utility of printing a literal with >1000 elements is
+  // questionable, especially when writing the Literal proto to disk is orders
+  // of magnitude faster.
+  string TruncateHugeLiteral(const Literal& literal) {
+    return RecursiveElementCount(literal.shape()) < 1000
+               ? literal.ToString()
+               : "[TRUNCATED, Literal with more than 1000 values]";
+  }
+
   ErrorSpec error_;
 
   // Number of element miscomparisons encountered so far.
@@ -548,10 +623,10 @@ class NearComparator {
   double abs_expected_miscompare_sum_;
   float max_rel_err_;
   float max_abs_err_;
-  std::vector<int64> first_multi_index_;
-  std::vector<int64> last_multi_index_;
-  std::vector<int64> max_rel_multi_index_;
-  std::vector<int64> max_abs_multi_index_;
+  int64 first_linear_index_;
+  int64 last_linear_index_;
+  int64 max_rel_linear_index_;
+  int64 max_abs_linear_index_;
 };
 
 template <>
@@ -584,6 +659,23 @@ bool NearComparator::ExpectValuesNear<half>(half expected, half actual) {
                           static_cast<float>(std::move(actual)));
 }
 
+template <>
+void NearComparator::UpdateAndLogMiscompares<bfloat16>(
+    const bfloat16 expected, const bfloat16 actual, const Shape& shape,
+    const int64 linear_index) {
+  UpdateAndLogMiscompares(static_cast<float>(expected),
+                          static_cast<float>(actual), shape, linear_index);
+}
+
+template <>
+void NearComparator::UpdateAndLogMiscompares<half>(half expected, half actual,
+                                                   const Shape& shape,
+                                                   const int64 linear_index) {
+  UpdateAndLogMiscompares(static_cast<float>(std::move(expected)),
+                          static_cast<float>(std::move(actual)), shape,
+                          linear_index);
+}
+
 }  // namespace
 
 /* static */ ::testing::AssertionResult LiteralTestUtil::Near(
diff --git a/tensorflow/compiler/xla/tests/literal_test_util_test.cc b/tensorflow/compiler/xla/tests/literal_test_util_test.cc
index e477784557a3b9340cff644a3695485389d8cc22..3a421f8458268a14dcdd84889bcae4990c095ea4 100644
--- a/tensorflow/compiler/xla/tests/literal_test_util_test.cc
+++ b/tensorflow/compiler/xla/tests/literal_test_util_test.cc
@@ -97,5 +97,29 @@ TEST(LiteralTestUtilTest, ExpectNearFailurePlacesResultsInTemporaryDirectory) {
   }
 }
 
+TEST(LiteralTestUtilTest, NearComparatorR1) {
+  auto a =
+      Literal::CreateR1<float>({0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8});
+  auto b =
+      Literal::CreateR1<float>({0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8});
+  EXPECT_TRUE(LiteralTestUtil::Near(*a, *b, ErrorSpec{0.0001}));
+}
+
+TEST(LiteralTestUtilTest, NearComparatorR1Nan) {
+  auto a =
+      Literal::CreateR1<float>({0.0, 0.1, 0.2, 0.3, NAN, 0.5, 0.6, 0.7, 0.8});
+  auto b =
+      Literal::CreateR1<float>({0.0, 0.1, 0.2, 0.3, NAN, 0.5, 0.6, 0.7, 0.8});
+  EXPECT_TRUE(LiteralTestUtil::Near(*a, *b, ErrorSpec{0.0001}));
+}
+
+TEST(LiteralTestUtil, NearComparatorDifferentLengths) {
+  auto a =
+      Literal::CreateR1<float>({0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8});
+  auto b = Literal::CreateR1<float>({0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7});
+  EXPECT_FALSE(LiteralTestUtil::Near(*a, *b, ErrorSpec{0.0001}));
+  EXPECT_FALSE(LiteralTestUtil::Near(*b, *a, ErrorSpec{0.0001}));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/llvm_compiler_test.cc b/tensorflow/compiler/xla/tests/llvm_compiler_test.cc
index b5b95967ff9162301a092f3a57996e0f3f78658f..7e92439c494b677f718a63c71c20828d65bebef4 100644
--- a/tensorflow/compiler/xla/tests/llvm_compiler_test.cc
+++ b/tensorflow/compiler/xla/tests/llvm_compiler_test.cc
@@ -74,7 +74,8 @@ class LLVMCompilerTest : public ::testing::Test {
 
     ASSERT_TRUE(compiler
                     ->RunBackend(std::move(hlo_module),
-                                 backend_->default_stream_executor())
+                                 backend_->default_stream_executor(),
+                                 /*device_allocator=*/nullptr)
                     .ok());
 
     // Test that hooks were called.
@@ -98,7 +99,8 @@ class LLVMCompilerTest : public ::testing::Test {
     executors.push_back({backend_->default_stream_executor()});
     executors.push_back({backend_->default_stream_executor()});
 
-    EXPECT_IS_OK(compiler->Compile(std::move(modules), std::move(executors)));
+    EXPECT_IS_OK(compiler->Compile(std::move(modules), std::move(executors),
+                                   /*device_allocator=*/nullptr));
   }
 
  private:
diff --git a/tensorflow/compiler/xla/tests/local_client_aot_test.cc b/tensorflow/compiler/xla/tests/local_client_aot_test.cc
index 569d5944cab0ae8f6a7b58a651285d20d4f9d019..47cab796041e9669affaebd7866d0d80100730f1 100644
--- a/tensorflow/compiler/xla/tests/local_client_aot_test.cc
+++ b/tensorflow/compiler/xla/tests/local_client_aot_test.cc
@@ -44,8 +44,7 @@ TEST_F(LocalClientAotTest, Constant) {
   OpaqueData opaque_data{100, 20, 3};
   void* parameters[] = {&opaque_data};
   float out = 0;
-  char tmp[4] = {0};
-  void* temporary_buffers[] = {nullptr, &out, &tmp};
+  void* temporary_buffers[] = {nullptr, &out};
   SumAndDouble(&out, &run_options, parameters, temporary_buffers);
   EXPECT_EQ(out, 246.0f);
 
diff --git a/tensorflow/compiler/xla/tests/local_client_aot_test_helper.cc b/tensorflow/compiler/xla/tests/local_client_aot_test_helper.cc
index 4d3b513b092e0b447a1452a3809fb7099e54dbb9..3704ddd8010bf727b75ff81b63605e8b7ffe2ca8 100644
--- a/tensorflow/compiler/xla/tests/local_client_aot_test_helper.cc
+++ b/tensorflow/compiler/xla/tests/local_client_aot_test_helper.cc
@@ -87,10 +87,9 @@ int main(int argc, char** argv) {
   // It's lame to hard-code the buffer assignments, but we need
   // local_client_aot_test.cc to be able to easily invoke the function.
   CHECK_EQ(result->result_buffer_index(), 1);
-  CHECK_EQ(result->buffer_sizes().size(), 3);
+  CHECK_EQ(result->buffer_sizes().size(), 2);
   CHECK_EQ(result->buffer_sizes()[0], -1);             // param buffer
   CHECK_EQ(result->buffer_sizes()[1], sizeof(float));  // result buffer
-  CHECK_EQ(result->buffer_sizes()[2], sizeof(float));  // temp buffer
   if (triple.isOSBinFormatELF()) {
     // Check the ELF magic.
     CHECK_EQ(result->object_file_data()[0], 0x7F);
diff --git a/tensorflow/compiler/xla/tests/pad_test.cc b/tensorflow/compiler/xla/tests/pad_test.cc
index 3fd83a4c3b104831f03366339fb7b8b5d816a3f7..8cef8dd34dc7b16b1e58ded67d6b6a4ba79f20db 100644
--- a/tensorflow/compiler/xla/tests/pad_test.cc
+++ b/tensorflow/compiler/xla/tests/pad_test.cc
@@ -33,6 +33,14 @@ limitations under the License.
 namespace xla {
 namespace {
 
+#ifdef XLA_BACKEND_SUPPORTS_BFLOAT16
+// Tests both F32 and BF16.
+static std::array<bool, 2> use_bfloat16_params{false, true};
+#else
+// Only tests F32.
+static std::array<bool, 1> use_bfloat16_params{false};
+#endif
+
 class PadTest : public ClientLibraryTestBase {
  protected:
   PadTest() {
@@ -61,8 +69,22 @@ class PadTest : public ClientLibraryTestBase {
   PaddingConfig r4_padding_on_dim0_dim1_;
 };
 
+class PadTestFloat : public PadTest,
+                     public ::testing::WithParamInterface<bool> {
+ protected:
+  PadTestFloat() { set_use_bfloat16(GetParam()); }
+
+  ErrorSpec DefaultErrorSpec() const {
+    if (use_bfloat16()) {
+      return ErrorSpec(1e-3, 1e-3);
+    } else {
+      return ErrorSpec(1e-5, 1e-5);
+    }
+  }
+};
+
 // Tests a Pad() with a zero-element input and output.
-XLA_TEST_F(PadTest, Pad1DS0ToS0Array) {
+XLA_TEST_P(PadTestFloat, Pad1DS0ToS0Array) {
   ComputationBuilder b(client_, TestName());
   // Set up the padding configuration {low: 0, high: 0, interior: 0}.
   PaddingConfig padding_config;
@@ -71,12 +93,13 @@ XLA_TEST_F(PadTest, Pad1DS0ToS0Array) {
   dimension->set_edge_padding_high(0);
   dimension->set_interior_padding(0);
 
-  b.Pad(b.ConstantR1<float>({}), b.ConstantR0<float>(0.1), padding_config);
-  ComputeAndCompareR1<float>(&b, {}, {}, ErrorSpec(0.0001));
+  b.Pad(AddParam(*Literal::CreateR1<float>({}), &b),
+        AddParam(*Literal::CreateR0<float>(0.1), &b), padding_config);
+  ComputeAndCompareR1<float>(&b, {}, {}, DefaultErrorSpec());
 }
 
 // Tests a Pad() with a zero-element input but a non-zero-element output.
-XLA_TEST_F(PadTest, Pad1DS0ToS5Array) {
+XLA_TEST_P(PadTestFloat, Pad1DS0ToS5Array) {
   ComputationBuilder b(client_, TestName());
   // Set up the padding configuration {low: 3, high: 0, interior: 1}.
   PaddingConfig padding_config;
@@ -85,12 +108,13 @@ XLA_TEST_F(PadTest, Pad1DS0ToS5Array) {
   dimension->set_edge_padding_high(4);
   dimension->set_interior_padding(7);
 
-  b.Pad(b.ConstantR1<float>({}), b.ConstantR0<float>(0.1), padding_config);
+  b.Pad(AddParam(*Literal::CreateR1<float>({}), &b),
+        AddParam(*Literal::CreateR0<float>(0.1), &b), padding_config);
   ComputeAndCompareR1<float>(&b, std::vector<float>(5, 0.1), {},
-                             ErrorSpec(0.0001));
+                             DefaultErrorSpec());
 }
 
-XLA_TEST_F(PadTest, Pad1DS3Array) {
+XLA_TEST_P(PadTestFloat, Pad1DS3Array) {
   ComputationBuilder b(client_, TestName());
   // Set up the padding configuration {low: 3, high: 0, interior: 1}.
   PaddingConfig padding_config;
@@ -99,21 +123,21 @@ XLA_TEST_F(PadTest, Pad1DS3Array) {
   dimension->set_edge_padding_high(0);
   dimension->set_interior_padding(1);
 
-  b.Pad(b.ConstantR1<float>({1, 2, 3}), b.ConstantR0<float>(0.1),
-        padding_config);
+  b.Pad(AddParam(*Literal::CreateR1<float>({1, 2, 3}), &b),
+        AddParam(*Literal::CreateR0<float>(0.1), &b), padding_config);
   std::vector<float> expected({0.1, 0.1, 0.1, 1, 0.1, 2, 0.1, 3});
-  ComputeAndCompareR1<float>(&b, expected, {}, ErrorSpec(0.0001));
+  ComputeAndCompareR1<float>(&b, expected, {}, DefaultErrorSpec());
 }
 
-XLA_TEST_F(PadTest, Pad4D_2x0x3x2_FloatArray) {
+XLA_TEST_P(PadTestFloat, Pad4D_2x0x3x2_FloatArray) {
   ComputationBuilder b(client_, TestName());
-  b.Pad(b.ConstantR4FromArray4D<float>(Array4D<float>(2, 0, 3, 2)),
-        b.ConstantR0<float>(1.5), r4_padding_on_dim0_dim1_);
+  b.Pad(AddParam(Array4D<float>(2, 0, 3, 2), &b),
+        AddParam(*Literal::CreateR0<float>(1.5), &b), r4_padding_on_dim0_dim1_);
   ComputeAndCompareR4<float>(&b, Array4D<float>(5, 2, 3, 2, 1.5f), {},
-                             ErrorSpec(0.0001));
+                             DefaultErrorSpec());
 }
 
-TEST_F(PadTest, Pad4DFloat_1x1x3x2_Array) {
+TEST_P(PadTestFloat, Pad4DFloat_1x1x3x2_Array) {
   ComputationBuilder b(client_, TestName());
   auto input = MakeUnique<Array4D<float>>(1, 1, 3, 2);
   Array2D<float> input_xy({
@@ -123,7 +147,7 @@ TEST_F(PadTest, Pad4DFloat_1x1x3x2_Array) {
   });
   input->FillWithYX(input_xy);
 
-  b.Pad(b.ConstantR4FromArray4D<float>(*input), b.ConstantR0<float>(1.5),
+  b.Pad(AddParam(*input, &b), AddParam(*Literal::CreateR0<float>(1.5), &b),
         r4_padding_on_dim0_dim1_);
 
   auto expected = MakeUnique<Array4D<float>>(2, 3, 3, 2);
@@ -134,15 +158,15 @@ TEST_F(PadTest, Pad4DFloat_1x1x3x2_Array) {
   (*expected)(1, 0, 1, 1) = 4.0f;
   (*expected)(1, 0, 2, 0) = 5.0f;
   (*expected)(1, 0, 2, 1) = 6.0f;
-  ComputeAndCompareR4<float>(&b, *expected, {}, ErrorSpec(0.0001));
+  ComputeAndCompareR4<float>(&b, *expected, {}, DefaultErrorSpec());
 }
 
-TEST_F(PadTest, Pad4DFloatArrayWithInteriorPadding) {
+TEST_P(PadTestFloat, Pad4DFloatArrayWithInteriorPadding) {
   ComputationBuilder b(client_, TestName());
 
   const float pad_value = 1.5f;
   Array4D<float> input(3, 2, 1, 1, {1, 2, 3, 4, 5, 6});
-  b.Pad(b.ConstantR4FromArray4D<float>(input), b.ConstantR0<float>(pad_value),
+  b.Pad(AddParam(input, &b), AddParam(*Literal::CreateR0<float>(pad_value), &b),
         r4_padding_on_dim0_dim1_);
 
   auto expected = MakeUnique<Array4D<float>>(8, 5, 1, 1);
@@ -156,7 +180,7 @@ TEST_F(PadTest, Pad4DFloatArrayWithInteriorPadding) {
   ComputeAndCompareR4<float>(&b, *expected, {}, ErrorSpec(0.0001));
 }
 
-TEST_F(PadTest, Pad4DFloatArrayMinorFirstSmall) {
+TEST_P(PadTestFloat, Pad4DFloatArrayMinorFirstSmall) {
   ComputationBuilder b(client_, TestName());
 
   PaddingConfig padding_config;
@@ -184,7 +208,8 @@ TEST_F(PadTest, Pad4DFloatArrayMinorFirstSmall) {
   auto input = Literal::CreateR4FromArray4D<float>(input_array);
   input = input->Relayout(layout);
 
-  b.Pad(b.ConstantLiteral(*input), b.ConstantR0(pad_value), padding_config);
+  b.Pad(AddParam(*input, &b),
+        AddParam(*Literal::CreateR0<float>(pad_value), &b), padding_config);
 
   Array4D<float> expected_array(1, 1, 5, 8);
   expected_array.Fill(pad_value);
@@ -197,7 +222,7 @@ TEST_F(PadTest, Pad4DFloatArrayMinorFirstSmall) {
   ComputeAndCompareR4<float>(&b, expected_array, {}, ErrorSpec(0.0001));
 }
 
-XLA_TEST_F(PadTest, Pad4DFloatArrayMinorFirstNonTrivialMinorDimensions) {
+XLA_TEST_P(PadTestFloat, Pad4DFloatArrayMinorFirstNonTrivialMinorDimensions) {
   ComputationBuilder b(client_, TestName());
 
   PaddingConfig padding_config;
@@ -229,7 +254,8 @@ XLA_TEST_F(PadTest, Pad4DFloatArrayMinorFirstNonTrivialMinorDimensions) {
   auto input = Literal::CreateR4FromArray4D<float>(input_array);
   input = input->Relayout(layout);
 
-  b.Pad(b.ConstantLiteral(*input), b.ConstantR0(pad_value), padding_config);
+  b.Pad(AddParam(*input, &b),
+        AddParam(*Literal::CreateR0<float>(pad_value), &b), padding_config);
 
   Array4D<float> expected_array(1, 25, 17, 11);
   expected_array.Fill(pad_value);
@@ -249,7 +275,7 @@ XLA_TEST_F(PadTest, Pad4DU8Array) {
   });
   input->FillWithYX(input_xy);
 
-  b.Pad(b.ConstantR4FromArray4D<uint8>(*input), b.ConstantR0<uint8>(35),
+  b.Pad(AddParam(*input, &b), b.ConstantR0<uint8>(35),
         r4_padding_on_dim0_dim1_);
 
   auto expected = MakeUnique<Array4D<uint8>>(2, 3, 3, 2);
@@ -277,8 +303,7 @@ XLA_TEST_F(PadTest, Pad4DPredArray) {
   auto ones = MakeUnique<Array4D<int32>>(2, 3, 3, 2);
   zeros->Fill(0);
   ones->Fill(1);
-  b.Select(padded, b.ConstantR4FromArray4D<int32>(*ones),
-           b.ConstantR4FromArray4D<int32>(*zeros));
+  b.Select(padded, AddParam(*ones, &b), AddParam(*zeros, &b));
 
   auto expected = MakeUnique<Array4D<int32>>(2, 3, 3, 2);
   expected->Fill(0);
@@ -291,10 +316,12 @@ XLA_TEST_F(PadTest, Pad4DPredArray) {
   ComputeAndCompareR4<int32>(&b, *expected, {});
 }
 
-XLA_TEST_F(PadTest, Large2DPad) {
+XLA_TEST_P(PadTestFloat, Large2DPad) {
   ComputationBuilder b(client_, TestName());
 
-  auto input = b.Parameter(0, ShapeUtil::MakeShape(F32, {4, 4}), "input");
+  auto ones = MakeUnique<Array2D<float>>(4, 4);
+  ones->Fill(1.0f);
+  auto input = AddParam(*ones, &b);
   PaddingConfig padding_config = MakeNoPaddingConfig(2);
   for (int dim : {0, 1}) {
     padding_config.mutable_dimensions(dim)->set_edge_padding_low(
@@ -302,25 +329,22 @@ XLA_TEST_F(PadTest, Large2DPad) {
     padding_config.mutable_dimensions(dim)->set_edge_padding_high(58 +
                                                                   100 * dim);
   }
-  auto padded = b.Pad(input, b.ConstantR0<float>(0.0f), padding_config);
-
-  auto ones = MakeUnique<Array2D<float>>(4, 4);
-  ones->Fill(1.0f);
-  auto input_literal = Literal::CreateR2FromArray2D<float>(*ones);
-  std::unique_ptr<GlobalData> input_data =
-      client_->TransferToServer(*input_literal).ConsumeValueOrDie();
+  auto padded = b.Pad(input, AddParam(*Literal::CreateR0<float>(0.0f), &b),
+                      padding_config);
 
   auto expected = ReferenceUtil::PadArray2D(*ones, padding_config, 0.0f);
-  ComputeAndCompareR2<float>(&b, *expected, {input_data.get()});
+  ComputeAndCompareR2<float>(&b, *expected, {}, DefaultErrorSpec());
 }
 
-XLA_TEST_F(PadTest, AllTypes2DPad) {
+XLA_TEST_P(PadTestFloat, AllTypes2DPad) {
   ComputationBuilder b(client_, TestName());
 
   constexpr int64 in_rows = 35;
   constexpr int64 in_cols = 35;
-  auto input =
-      b.Parameter(0, ShapeUtil::MakeShape(F32, {in_rows, in_cols}), "input");
+  auto operand = MakeUnique<Array2D<float>>(in_rows, in_cols);
+  operand->FillUnique(0.0f);
+  auto input = AddParam(*operand, &b);
+
   PaddingConfig padding_config = MakeNoPaddingConfig(2);
   padding_config.mutable_dimensions(0)->set_edge_padding_low(7);
   padding_config.mutable_dimensions(0)->set_edge_padding_high(5);
@@ -328,20 +352,14 @@ XLA_TEST_F(PadTest, AllTypes2DPad) {
   padding_config.mutable_dimensions(1)->set_edge_padding_low(6);
   padding_config.mutable_dimensions(1)->set_edge_padding_high(4);
   padding_config.mutable_dimensions(1)->set_interior_padding(2);
-  auto padded = b.Pad(input, b.ConstantR0<float>(3.14f), padding_config);
-
-  auto operand = MakeUnique<Array2D<float>>(in_rows, in_cols);
-  operand->FillUnique(0.0f);
-  auto input_literal = Literal::CreateR2FromArray2D<float>(*operand);
-  std::unique_ptr<GlobalData> input_data =
-      client_->TransferToServer(*input_literal).ConsumeValueOrDie();
+  auto padded = b.Pad(input, AddParam(*Literal::CreateR0<float>(3.14f), &b),
+                      padding_config);
 
   auto expected = ReferenceUtil::PadArray2D(*operand, padding_config, 3.14f);
-  ComputeAndCompareR2<float>(&b, *expected, {input_data.get()},
-                             ErrorSpec{0.0001});
+  ComputeAndCompareR2<float>(&b, *expected, {}, DefaultErrorSpec());
 }
 
-XLA_TEST_F(PadTest, High2DPad) {
+XLA_TEST_P(PadTestFloat, High2DPad) {
   ComputationBuilder b(client_, TestName());
 
   constexpr int64 in_rows = 129;
@@ -349,8 +367,9 @@ XLA_TEST_F(PadTest, High2DPad) {
   constexpr int64 low_padding = 0;
   int64 high_padding[2] = {5, 7};
   constexpr int64 interior_padding = 0;
-  auto input =
-      b.Parameter(0, ShapeUtil::MakeShape(F32, {in_rows, in_cols}), "input");
+  auto operand = MakeUnique<Array2D<float>>(in_rows, in_cols);
+  operand->FillUnique(1.0f);
+  auto input = AddParam(*operand, &b);
   PaddingConfig padding_config = MakeNoPaddingConfig(2);
   for (int dim : {0, 1}) {
     padding_config.mutable_dimensions(dim)->set_edge_padding_low(low_padding);
@@ -359,20 +378,15 @@ XLA_TEST_F(PadTest, High2DPad) {
     padding_config.mutable_dimensions(dim)->set_interior_padding(
         interior_padding);
   }
-  auto padded = b.Pad(input, b.ConstantR0<float>(2.718f), padding_config);
+  auto padded = b.Pad(input, AddParam(*Literal::CreateR0<float>(2.718f), &b),
+                      padding_config);
 
-  auto operand = MakeUnique<Array2D<float>>(in_rows, in_cols);
-  operand->FillUnique(1.0f);
-  auto input_literal = Literal::CreateR2FromArray2D<float>(*operand);
   auto expected = ReferenceUtil::PadArray2D(*operand, padding_config, 2.718f);
-  std::unique_ptr<GlobalData> input_data =
-      client_->TransferToServer(*input_literal).ConsumeValueOrDie();
 
-  ComputeAndCompareR2<float>(&b, *expected, {input_data.get()},
-                             ErrorSpec(0.0001));
+  ComputeAndCompareR2<float>(&b, *expected, {}, DefaultErrorSpec());
 }
 
-XLA_TEST_F(PadTest, NegativePadding2D) {
+XLA_TEST_P(PadTestFloat, NegativePadding2D) {
   ComputationBuilder b(client_, TestName());
 
   constexpr int64 in_rows = 129;
@@ -380,8 +394,9 @@ XLA_TEST_F(PadTest, NegativePadding2D) {
   int64 low_padding[2] = {-1, -2};
   int64 high_padding[2] = {-3, 4};
   constexpr int64 interior_padding = 0;
-  auto input =
-      b.Parameter(0, ShapeUtil::MakeShape(F32, {in_rows, in_cols}), "input");
+  auto operand = MakeUnique<Array2D<float>>(in_rows, in_cols);
+  operand->FillUnique(1.0f);
+  auto input = AddParam(*operand, &b);
   PaddingConfig padding_config = MakeNoPaddingConfig(2);
   for (int dim : {0, 1}) {
     padding_config.mutable_dimensions(dim)->set_edge_padding_low(
@@ -391,20 +406,15 @@ XLA_TEST_F(PadTest, NegativePadding2D) {
     padding_config.mutable_dimensions(dim)->set_interior_padding(
         interior_padding);
   }
-  auto padded = b.Pad(input, b.ConstantR0<float>(2.718f), padding_config);
+  auto padded = b.Pad(input, AddParam(*Literal::CreateR0<float>(2.718f), &b),
+                      padding_config);
 
-  auto operand = MakeUnique<Array2D<float>>(in_rows, in_cols);
-  operand->FillUnique(1.0f);
-  auto input_literal = Literal::CreateR2FromArray2D<float>(*operand);
   auto expected = ReferenceUtil::PadArray2D(*operand, padding_config, 2.718f);
-  std::unique_ptr<GlobalData> input_data =
-      client_->TransferToServer(*input_literal).ConsumeValueOrDie();
 
-  ComputeAndCompareR2<float>(&b, *expected, {input_data.get()},
-                             ErrorSpec(0.0001));
+  ComputeAndCompareR2<float>(&b, *expected, {}, DefaultErrorSpec());
 }
 
-XLA_TEST_F(PadTest, NegativeAndInteriorPadding2D) {
+XLA_TEST_P(PadTestFloat, NegativeAndInteriorPadding2D) {
   ComputationBuilder b(client_, TestName());
 
   constexpr int64 in_rows = 8;
@@ -412,8 +422,9 @@ XLA_TEST_F(PadTest, NegativeAndInteriorPadding2D) {
   int64 low_padding[2] = {4, -1};
   int64 high_padding[2] = {-2, -4};
   int64 interior_padding[2] = {1, 2};
-  auto input =
-      b.Parameter(0, ShapeUtil::MakeShape(F32, {in_rows, in_cols}), "input");
+  auto operand = MakeUnique<Array2D<float>>(in_rows, in_cols);
+  operand->FillUnique(1.0f);
+  auto input = AddParam(*operand, &b);
   PaddingConfig padding_config = MakeNoPaddingConfig(2);
   for (int dim : {0, 1}) {
     padding_config.mutable_dimensions(dim)->set_edge_padding_low(
@@ -423,44 +434,40 @@ XLA_TEST_F(PadTest, NegativeAndInteriorPadding2D) {
     padding_config.mutable_dimensions(dim)->set_interior_padding(
         interior_padding[dim]);
   }
-  auto padded = b.Pad(input, b.ConstantR0<float>(2.718f), padding_config);
+  auto padded = b.Pad(input, AddParam(*Literal::CreateR0<float>(2.718f), &b),
+                      padding_config);
 
-  auto operand = MakeUnique<Array2D<float>>(in_rows, in_cols);
-  operand->FillUnique(1.0f);
-  auto input_literal = Literal::CreateR2FromArray2D<float>(*operand);
   auto expected = ReferenceUtil::PadArray2D(*operand, padding_config, 2.718f);
-  std::unique_ptr<GlobalData> input_data =
-      client_->TransferToServer(*input_literal).ConsumeValueOrDie();
 
-  ComputeAndCompareR2<float>(&b, *expected, {input_data.get()},
-                             ErrorSpec(0.0001));
+  ComputeAndCompareR2<float>(&b, *expected, {}, DefaultErrorSpec());
 }
 
 // Regression test for b/31827337.
-XLA_TEST_F(PadTest, ReducePad) {
+XLA_TEST_P(PadTestFloat, ReducePad) {
   ComputationBuilder b(client_, TestName());
-  auto input = b.Parameter(0, ShapeUtil::MakeShape(F32, {2, 2, 2, 2}), "input");
+  auto ones = MakeUnique<Array4D<float>>(2, 2, 2, 2);
+  ones->Fill(1.0);
+  auto input = AddParam(*ones, &b);
 
-  Computation add_f32 = CreateScalarAddComputation(F32, &b);
-  auto reduce = b.Reduce(input, b.ConstantR0<float>(0.0), add_f32, {0});
+  Computation add = CreateScalarAddComputation(FloatType(), &b);
+  auto reduce =
+      b.Reduce(input, AddParam(*Literal::CreateR0<float>(0.0), &b), add, {0});
 
   PaddingConfig padding_config = MakeNoPaddingConfig(3);
   padding_config.mutable_dimensions(0)->set_edge_padding_low(1);
   padding_config.mutable_dimensions(0)->set_edge_padding_high(1);
-  auto pad = b.Pad(reduce, b.ConstantR0<float>(0.0), padding_config);
-
-  auto ones = MakeUnique<Array4D<float>>(2, 2, 2, 2);
-  ones->Fill(1.0);
-  auto input_literal = Literal::CreateR4FromArray4D<float>(*ones);
-  std::unique_ptr<GlobalData> input_data =
-      client_->TransferToServer(*input_literal).ConsumeValueOrDie();
+  auto padded = b.Pad(reduce, AddParam(*Literal::CreateR0<float>(0.0f), &b),
+                      padding_config);
 
   Array3D<float> expected({{{0.0, 0.0}, {0.0, 0.0}},
                            {{2.0, 2.0}, {2.0, 2.0}},
                            {{2.0, 2.0}, {2.0, 2.0}},
                            {{0.0, 0.0}, {0.0, 0.0}}});
-  ComputeAndCompareR3<float>(&b, expected, {input_data.get()});
+  ComputeAndCompareR3<float>(&b, expected, {}, DefaultErrorSpec());
 }
 
+INSTANTIATE_TEST_CASE_P(PadTestFloatInstantiation, PadTestFloat,
+                        ::testing::ValuesIn(use_bfloat16_params));
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/reduce_hlo_test.cc b/tensorflow/compiler/xla/tests/reduce_hlo_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c0a2c0ca4cb8414e0771a541b9f963f9aedc8376
--- /dev/null
+++ b/tensorflow/compiler/xla/tests/reduce_hlo_test.cc
@@ -0,0 +1,132 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <array>
+
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/tests/test_macros.h"
+#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/types.h"
+
+// Tests the Reduce HLO in ways that can't be done using the ComputationBuilder
+// API.
+
+namespace xla {
+namespace {
+
+namespace str_util = tensorflow::str_util;
+namespace strings = tensorflow::strings;
+
+struct ReduceLayout {
+  std::array<int64, 4> input_minor_to_major;
+  std::array<int64, 3> output_minor_to_major;
+
+  string ToString() const {
+    return strings::StrCat(str_util::Join(input_minor_to_major, "x"), "_",
+                           str_util::Join(output_minor_to_major, "x"));
+  }
+};
+
+string PrintReduceLayout(
+    ::testing::TestParamInfo<ReduceLayout> reduce_layout_param) {
+  return reduce_layout_param.param.ToString();
+}
+
+void PrintTo(const ReduceLayout& reduce_layout, ::std::ostream* os) {
+  *os << reduce_layout.ToString();
+}
+
+class ReduceWithLayoutTest
+    : public HloTestBase,
+      public ::testing::WithParamInterface<ReduceLayout> {};
+
+StatusOr<std::unique_ptr<HloModule>> GetParsedModule() {
+  const char* const hlo_string = R"(
+HloModule BadReduce
+
+Sum {
+  x.1 = f32[] parameter(0)
+  y.1 = f32[] parameter(1)
+  ROOT add.1 = f32[] add(x.1, y.1)
+}
+
+ENTRY reduce.1 {
+  parameter = f32[2,2,2,3]{3,2,1,0} parameter(0)
+  init_value = f32[] constant(0)
+  reduce = f32[2,2,3]{2,1,0} reduce(parameter, init_value), dimensions={1}, to_apply=Sum
+  ROOT copy = f32[2,2,3]{2,1,0} copy(reduce)
+}
+)";
+
+  return tools::Parse(hlo_string);
+}
+
+// TODO(b/72454718): XLA:GPU does not support executing code compiled without
+// optimizations.
+XLA_TEST_P(ReduceWithLayoutTest, DISABLED_ON_GPU(Reduce)) {
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module, GetParsedModule());
+  HloInstruction* reduce_instruction =
+      module->entry_computation()->root_instruction()->mutable_operand(0);
+  ASSERT_EQ(reduce_instruction->opcode(), HloOpcode::kReduce);
+
+  const ReduceLayout& reduce_layout = GetParam();
+
+  Shape* reduce_output_shape = reduce_instruction->mutable_shape();
+  *reduce_output_shape->mutable_layout() =
+      LayoutUtil::MakeLayout(reduce_layout.output_minor_to_major);
+
+  Shape* reduce_input_shape =
+      reduce_instruction->mutable_operand(0)->mutable_shape();
+  *reduce_input_shape->mutable_layout() =
+      LayoutUtil::MakeLayout(reduce_layout.input_minor_to_major);
+
+  std::unique_ptr<Literal> reduce_input =
+      Literal::CreateR4<float>({{ /*i0=0*/
+                                 {/*i1=0*/
+                                  {-0.246092796, -0.179497838, -0.161181688},
+                                  {-0.151643038, -0.240213156, -0.198156}},
+                                 {/*i1=1*/
+                                  {-0.14222312, -0.162200093, -0.193907976},
+                                  {-0.239411, -0.198166847, -0.172471642}}},
+                                { /*i0=1*/
+                                 {/*i1=0*/
+                                  {-0.22965157, -0.218723893, -0.129257083},
+                                  {-0.188762426, -0.16123569, -0.181166649}},
+                                 {/*i1=1*/
+                                  {-0.241772294, -0.245131493, -0.160247207},
+                                  {-0.179881215, -0.23383224, -0.121976733}}}});
+
+  EXPECT_TRUE(RunAndCompareNoHloPasses(std::move(module), ErrorSpec(1e-5)));
+}
+
+INSTANTIATE_TEST_CASE_P(ReduceWithLayoutTest_Instantiation,
+                        ReduceWithLayoutTest,
+                        ::testing::Values(                           //
+                            ReduceLayout{{3, 2, 1, 0}, {0, 1, 2}},   //
+                            ReduceLayout{{3, 2, 1, 0}, {0, 2, 1}},   //
+                            ReduceLayout{{3, 2, 1, 0}, {1, 2, 0}},   //
+                            ReduceLayout{{3, 2, 1, 0}, {1, 0, 2}},   //
+                            ReduceLayout{{3, 2, 1, 0}, {2, 0, 1}},   //
+                            ReduceLayout{{3, 2, 1, 0}, {2, 1, 0}},   //
+                            ReduceLayout{{3, 1, 2, 0}, {1, 2, 0}},   //
+                            ReduceLayout{{1, 2, 3, 0}, {1, 0, 2}},   //
+                            ReduceLayout{{0, 2, 1, 3}, {2, 0, 1}}),  //
+                        PrintReduceLayout);
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/reduce_precision_test.cc b/tensorflow/compiler/xla/tests/reduce_precision_test.cc
index 4756ba096896806ece8fe35d18c4eaef041b8830..dc7ce3253cee255a7949326fa5b49fc8917432b8 100644
--- a/tensorflow/compiler/xla/tests/reduce_precision_test.cc
+++ b/tensorflow/compiler/xla/tests/reduce_precision_test.cc
@@ -249,7 +249,9 @@ INSTANTIATE_TEST_CASE_P(ReducePrecisionAccuracyTest,
 // ReducePrecisionInsertion passes.
 class ReducePrecisionInsertionTest : public ClientLibraryTestBase {};
 
-XLA_TEST_F(ReducePrecisionInsertionTest, ReducePrecisionBeforeFusion) {
+// The interpreter has no fusion pass, so skip this test.
+XLA_TEST_F(ReducePrecisionInsertionTest,
+           DISABLED_ON_INTERPRETER(ReducePrecisionBeforeFusion)) {
   ComputationBuilder builder(client_, TestName());
 
   std::unique_ptr<Literal> a_literal = Literal::CreateR1<float>({1.00001});
@@ -276,7 +278,9 @@ XLA_TEST_F(ReducePrecisionInsertionTest, ReducePrecisionBeforeFusion) {
   ComputeAndCompareR1<float>(&builder, {0.0f}, {a_data.get()});
 }
 
-XLA_TEST_F(ReducePrecisionInsertionTest, ReducePrecisionSkippedAfterFusion) {
+// The interpreter has no fusion pass, so skip this test.
+XLA_TEST_F(ReducePrecisionInsertionTest,
+           DISABLED_ON_INTERPRETER(ReducePrecisionSkippedAfterFusion)) {
   ComputationBuilder builder(client_, TestName());
 
   std::unique_ptr<Literal> a_literal = Literal::CreateR1<float>({1.00001});
@@ -300,7 +304,9 @@ XLA_TEST_F(ReducePrecisionInsertionTest, ReducePrecisionSkippedAfterFusion) {
   ComputeAndCompareR1<float>(&builder, {-1.00001f}, {a_data.get()});
 }
 
-XLA_TEST_F(ReducePrecisionInsertionTest, ReducePrecisionAddedAfterFusion) {
+// The interpreter has no fusion pass, so skip this test.
+XLA_TEST_F(ReducePrecisionInsertionTest,
+           DISABLED_ON_INTERPRETER(ReducePrecisionAddedAfterFusion)) {
   ComputationBuilder builder(client_, TestName());
 
   std::unique_ptr<Literal> a_literal = Literal::CreateR1<float>({1.00001});
@@ -322,7 +328,9 @@ XLA_TEST_F(ReducePrecisionInsertionTest, ReducePrecisionAddedAfterFusion) {
   ComputeAndCompareR1<float>(&builder, {-1.0f}, {a_data.get()});
 }
 
-XLA_TEST_F(ReducePrecisionInsertionTest, ReducePrecisionSkippedFusionContains) {
+// The interpreter has no fusion pass, so skip this test.
+XLA_TEST_F(ReducePrecisionInsertionTest,
+           DISABLED_ON_INTERPRETER(ReducePrecisionSkippedFusionContains)) {
   ComputationBuilder builder(client_, TestName());
 
   std::unique_ptr<Literal> a_literal = Literal::CreateR1<float>({1.00001});
@@ -345,7 +353,9 @@ XLA_TEST_F(ReducePrecisionInsertionTest, ReducePrecisionSkippedFusionContains) {
   ComputeAndCompareR1<float>(&builder, {-1.00001f}, {a_data.get()});
 }
 
-XLA_TEST_F(ReducePrecisionInsertionTest, ReducePrecisionAddedFusionContains) {
+// The interpreter has no fusion pass, so skip this test.
+XLA_TEST_F(ReducePrecisionInsertionTest,
+           DISABLED_ON_INTERPRETER(ReducePrecisionAddedFusionContains)) {
   ComputationBuilder builder(client_, TestName());
 
   std::unique_ptr<Literal> a_literal = Literal::CreateR1<float>({1.00001});
diff --git a/tensorflow/compiler/xla/tests/reduce_test.cc b/tensorflow/compiler/xla/tests/reduce_test.cc
index a766fa2db0e193c52171490981855843ab3ee158..50d7b5074d201d2292cf90224ef4cd37efdbb8d3 100644
--- a/tensorflow/compiler/xla/tests/reduce_test.cc
+++ b/tensorflow/compiler/xla/tests/reduce_test.cc
@@ -494,6 +494,26 @@ XLA_TEST_F(ReduceTest, TransposeAndReduceElementwiseR2_111x50_To_R1) {
                              ErrorSpec(0.01, 1e-4));
 }
 
+// Test that algebraic simplifier does not incorrectly fold a transpose into a
+// reduction operation.
+XLA_TEST_F(ReduceTest, TransposeAndReduceR3_12x111x50_To_R2) {
+  ComputationBuilder builder(client_, TestName());
+  Computation add_f32 = CreateScalarAddComputation(F32, &builder);
+  const Shape input_shape = ShapeUtil::MakeShape(F32, {12, 111, 50});
+  ComputationDataHandle input = builder.Parameter(0, input_shape, "input");
+  ComputationDataHandle zero = builder.ConstantR0<float>(0.0);
+  ComputationDataHandle transpose =
+      builder.Transpose(input, /*permutation=*/{1, 0, 2});
+  ComputationDataHandle reduce =
+      builder.Reduce(transpose, zero, add_f32, /*dimensions_to_reduce=*/{0});
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> input_data,
+                          MakeFakeLiteral(input_shape));
+
+  ComputeAndCompare(&builder, reduce, {std::move(*input_data)},
+                    ErrorSpec(0.01, 1e-4));
+}
+
 XLA_TEST_F(ReduceTest, Reshape_111x2x25Reduce_111x50_To_R1) {
   const int64 rows = 111, cols = 50;
 
diff --git a/tensorflow/compiler/xla/tests/reduce_window_test.cc b/tensorflow/compiler/xla/tests/reduce_window_test.cc
index 73b37e201afa13546179e2ce7a76d3f7967de524..7f3c72671d51bcdfba89b050371626d672c6d945 100644
--- a/tensorflow/compiler/xla/tests/reduce_window_test.cc
+++ b/tensorflow/compiler/xla/tests/reduce_window_test.cc
@@ -1016,37 +1016,39 @@ class R2ReduceWindowTest : public ReduceWindowTestBase,
                                ::testing::tuple<R2ReduceWindowTestData, bool>> {
  protected:
   R2ReduceWindowTest() { set_use_bfloat16(::testing::get<1>(GetParam())); }
-};
 
-TEST_P(R2ReduceWindowTest, Add) {
-  ComputationBuilder b(client_, TestName());
-  const auto& param = ::testing::get<0>(GetParam());
-  CHECK(param.reducer == kAdd);
-
-  const float kInitValue = 0.0f;
-  Array2D<float> input(param.base_bounds[0], param.base_bounds[1], 1.0f);
-  std::unique_ptr<Literal> input_literal =
-      Literal::CreateR2FromArray2DWithLayout(
-          input, LayoutUtil::MakeLayout(param.layout));
+  void DoIt() {
+    ComputationBuilder b(client_, TestName());
+    const auto& param = ::testing::get<0>(GetParam());
+    CHECK(param.reducer == kAdd);
 
-  ComputationDataHandle parameter;
-  auto input_arg = CreateParameterAndTransferLiteral(0, *input_literal, "p0",
-                                                     &b, &parameter);
-  auto init_value =
-      CreateConstantFromLiteral(*Literal::CreateR0(kInitValue), &b);
-  b.ReduceWindow(/*operand=*/parameter,
-                 /*init_value=*/init_value,
-                 /*computation=*/CreateScalarAddComputation(FloatType(), &b),
-                 /*window_dimensions=*/param.window_bounds,
-                 /*window_strides=*/param.strides, /*padding=*/param.padding);
+    const float kInitValue = 0.0f;
+    Array2D<float> input(param.base_bounds[0], param.base_bounds[1], 1.0f);
+    std::unique_ptr<Literal> input_literal =
+        Literal::CreateR2FromArray2DWithLayout(
+            input, LayoutUtil::MakeLayout(param.layout));
 
-  auto expected = ReferenceUtil::ReduceWindow2DAdd(
-      /*operand=*/input, /*init=*/kInitValue, /*window=*/param.window_bounds,
-      /*stride=*/param.strides, /*padding=*/param.padding);
+    ComputationDataHandle parameter;
+    auto input_arg = CreateParameterAndTransferLiteral(0, *input_literal, "p0",
+                                                       &b, &parameter);
+    auto init_value =
+        CreateConstantFromLiteral(*Literal::CreateR0(kInitValue), &b);
+    b.ReduceWindow(/*operand=*/parameter,
+                   /*init_value=*/init_value,
+                   /*computation=*/CreateScalarAddComputation(FloatType(), &b),
+                   /*window_dimensions=*/param.window_bounds,
+                   /*window_strides=*/param.strides, /*padding=*/param.padding);
+
+    auto expected = ReferenceUtil::ReduceWindow2DAdd(
+        /*operand=*/input, /*init=*/kInitValue, /*window=*/param.window_bounds,
+        /*stride=*/param.strides, /*padding=*/param.padding);
+
+    ComputeAndCompareLiteral(&b, *Literal::CreateFromArray(*expected),
+                             {input_arg.get()}, DefaultErrorSpec());
+  }
+};
 
-  ComputeAndCompareLiteral(&b, *Literal::CreateFromArray(*expected),
-                           {input_arg.get()}, DefaultErrorSpec());
-}
+TEST_P(R2ReduceWindowTest, DoIt) { DoIt(); }
 
 INSTANTIATE_TEST_CASE_P(
     R2ReduceWindowTestInstantiation, R2ReduceWindowTest,
@@ -1054,6 +1056,26 @@ INSTANTIATE_TEST_CASE_P(
                        ::testing::ValuesIn(use_bfloat16_params)),
     R2ReduceWindowTestDataToString);
 
+class R2ReduceWindowFailingCpuGpuBf16Test : public R2ReduceWindowTest {};
+
+// TODO(b/72234705): Fix the test cases failed on CPU and GPU.
+XLA_TEST_P(R2ReduceWindowFailingCpuGpuBf16Test,
+           DISABLED_ON_CPU_PARALLEL(DISABLED_ON_CPU(DISABLED_ON_GPU(DoIt)))) {
+  DoIt();
+}
+
+const R2ReduceWindowTestData kR2FailingValuesCpuGpuBf16Test[] = {
+    {/*base_bounds=*/{8, 128}, /*window_bounds=*/{8, 128},
+     /*strides=*/{1, 1}, /*layout=*/{1, 0},
+     /*padding=*/Padding::kValid, /*reducer=*/Reducer::kAdd},
+};
+
+INSTANTIATE_TEST_CASE_P(
+    R2ReduceWindowFailingInstantiation, R2ReduceWindowFailingCpuGpuBf16Test,
+    ::testing::Combine(::testing::ValuesIn(kR2FailingValuesCpuGpuBf16Test),
+                       ::testing::ValuesIn(use_bfloat16_params)),
+    R2ReduceWindowTestDataToString);
+
 struct R1ReduceWindowTestData {
   int64 base_bounds[1];
   int64 window_bounds[1];
diff --git a/tensorflow/compiler/xla/tests/scalar_computations_test.cc b/tensorflow/compiler/xla/tests/scalar_computations_test.cc
index debf2d2d317fe64ca1ef86cb1f2978e76af1b55d..43e4d891a1319ceed92d1bf62d8b25c26a6ef317 100644
--- a/tensorflow/compiler/xla/tests/scalar_computations_test.cc
+++ b/tensorflow/compiler/xla/tests/scalar_computations_test.cc
@@ -852,5 +852,12 @@ XLA_TEST_F(ScalarComputationsTest, SqrtF320) {
   ComputeAndCompareR0<float>(&builder, 0.0f, {zero_data.get()}, error_spec_);
 }
 
+XLA_TEST_F(ScalarComputationsTest, RoundScalar) {
+  ComputationBuilder builder(client_, TestName());
+  builder.Round(builder.ConstantR0<float>(1.4f));
+
+  ComputeAndCompareR0<float>(&builder, 1.0f, {}, error_spec_);
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/select_and_scatter_test.cc b/tensorflow/compiler/xla/tests/select_and_scatter_test.cc
index 62ff349e9c011e0eb845192013a74aeb0956b791..9ee94b8571e5fc8789b60501462986967ce909a0 100644
--- a/tensorflow/compiler/xla/tests/select_and_scatter_test.cc
+++ b/tensorflow/compiler/xla/tests/select_and_scatter_test.cc
@@ -39,8 +39,8 @@ namespace xla {
 namespace {
 
 struct SelectAndScatterTestParam {
-  Array4D<float> operand_shape;
-  Array4D<float> source_shape;
+  std::vector<int64> operand_shape;
+  std::vector<int64> source_shape;
   Padding padding_type;
   tensorflow::gtl::ArraySlice<int64> window_dimensions;
   tensorflow::gtl::ArraySlice<int64> window_strides;
@@ -69,83 +69,132 @@ class SelectAndScatterTest
   Computation min_f32_;
 };
 
-XLA_TEST_P(SelectAndScatterTest, R4Randomized) {
-  Array4D<float> o(GetParam().operand_shape);
+XLA_TEST_P(SelectAndScatterTest, ParamTest) {
+  auto operand_shape = GetParam().operand_shape;
+  Array<float> o(operand_shape);
   o.FillRandom(1.5f);
-  auto operand = builder_.ConstantR4FromArray4D(o);
+  auto operand = builder_.ConstantFromArray(o);
 
-  Array4D<float> s(GetParam().source_shape);
+  auto source_shape = GetParam().source_shape;
+  Array<float> s(source_shape);
   s.FillRandom(12.0f);
-  auto source = builder_.ConstantR4FromArray4D(s);
-
-  builder_.SelectAndScatter(operand, ge_f32_, GetParam().window_dimensions,
-                            GetParam().window_strides, GetParam().padding_type,
-                            source, builder_.ConstantR0<float>(0.0f), add_f32_);
+  auto source = builder_.ConstantFromArray(s);
 
-  auto e = ReferenceUtil::SelectAndScatter4DGePlus(
-      o, s, 0.0f, GetParam().window_dimensions, GetParam().window_strides,
-      GetParam().padding_type == Padding::kSame);
+  auto select_and_scatter = builder_.SelectAndScatter(
+      operand, ge_f32_, GetParam().window_dimensions, GetParam().window_strides,
+      GetParam().padding_type, source, builder_.ConstantR0<float>(0.0f),
+      add_f32_);
 
-  ComputeAndCompareR4<float>(&builder_, *e, {}, ErrorSpec(1e-5));
+  ComputeAndCompare(&builder_, select_and_scatter, {}, ErrorSpec(1e-5));
 }
 
 INSTANTIATE_TEST_CASE_P(
     SelectAndScatterTest_Instantiation, SelectAndScatterTest,
-    ::testing::Values(SelectAndScatterTestParam{{6, 6, 256, 128},
-                                                {3, 3, 256, 128},
-                                                Padding::kSame,
-                                                {3, 3, 1, 1},
-                                                {2, 2, 1, 1}},
-                      SelectAndScatterTestParam{{7, 7, 256, 128},
-                                                {3, 3, 256, 128},
-                                                Padding::kValid,
-                                                {3, 3, 1, 1},
-                                                {2, 2, 1, 1}},
-                      SelectAndScatterTestParam{{6, 7, 256, 128},
-                                                {3, 3, 256, 128},
-                                                Padding::kValid,
-                                                {2, 3, 1, 1},
-                                                {2, 2, 1, 1}},
-                      SelectAndScatterTestParam{{6, 7, 256, 128},
-                                                {2, 3, 256, 128},
-                                                Padding::kValid,
-                                                {2, 3, 1, 1},
-                                                {3, 2, 1, 1}},
-                      SelectAndScatterTestParam{{9, 9, 16, 128},
-                                                {3, 3, 16, 128},
-                                                Padding::kValid,
-                                                {3, 3, 1, 1},
-                                                {3, 3, 1, 1}},
-                      SelectAndScatterTestParam{{3, 3, 4, 4},
-                                                {1, 1, 4, 4},
-                                                Padding::kValid,
-                                                {3, 3, 1, 1},
-                                                {3, 3, 1, 1}},
-                      SelectAndScatterTestParam{{3, 3, 4, 4},
-                                                {1, 1, 4, 4},
-                                                Padding::kValid,
-                                                {3, 3, 1, 1},
-                                                {3, 3, 1, 1}},
-                      SelectAndScatterTestParam{{9, 3, 4, 4},
-                                                {3, 1, 4, 4},
-                                                Padding::kValid,
-                                                {3, 3, 1, 1},
-                                                {3, 3, 1, 1}},
-                      SelectAndScatterTestParam{{7, 3, 4, 4},
-                                                {3, 1, 4, 4},
-                                                Padding::kValid,
-                                                {3, 3, 1, 1},
-                                                {2, 3, 1, 1}},
-                      SelectAndScatterTestParam{{1, 1, 5, 5},
-                                                {1, 1, 5, 5},
-                                                Padding::kSame,
-                                                {3, 3, 1, 1},
-                                                {3, 3, 1, 1}},
-                      SelectAndScatterTestParam{{7, 7, 8, 256},
-                                                {4, 4, 8, 256},
-                                                Padding::kSame,
-                                                {2, 2, 1, 1},
-                                                {2, 2, 1, 1}}));
+    ::testing::Values(
+        SelectAndScatterTestParam{{6, 6, 6, 4, 4},
+                                  {3, 3, 3, 4, 4},
+                                  Padding::kSame,
+                                  {3, 3, 3, 1, 1},
+                                  {2, 2, 2, 1, 1}},
+        SelectAndScatterTestParam{{7, 7, 7, 4, 4},
+                                  {3, 3, 3, 4, 4},
+                                  Padding::kValid,
+                                  {3, 3, 3, 1, 1},
+                                  {2, 2, 2, 1, 1}},
+
+        SelectAndScatterTestParam{{8, 8, 8, 4, 4},
+                                  {1, 3, 3, 4, 4},
+                                  Padding::kValid,
+                                  {8, 4, 4, 1, 1},
+                                  {1, 2, 2, 1, 1}},
+        SelectAndScatterTestParam{{6, 6, 256, 128},
+                                  {3, 3, 256, 128},
+                                  Padding::kSame,
+                                  {3, 3, 1, 1},
+                                  {2, 2, 1, 1}},
+        SelectAndScatterTestParam{{7, 7, 256, 128},
+                                  {3, 3, 256, 128},
+                                  Padding::kValid,
+                                  {3, 3, 1, 1},
+                                  {2, 2, 1, 1}},
+        SelectAndScatterTestParam{{6, 7, 256, 128},
+                                  {3, 3, 256, 128},
+                                  Padding::kValid,
+                                  {2, 3, 1, 1},
+                                  {2, 2, 1, 1}},
+        SelectAndScatterTestParam{{6, 7, 256, 128},
+                                  {2, 3, 256, 128},
+                                  Padding::kValid,
+                                  {2, 3, 1, 1},
+                                  {3, 2, 1, 1}},
+        SelectAndScatterTestParam{{9, 9, 16, 128},
+                                  {3, 3, 16, 128},
+                                  Padding::kValid,
+                                  {3, 3, 1, 1},
+                                  {3, 3, 1, 1}},
+        SelectAndScatterTestParam{{3, 3, 4, 4},
+                                  {1, 1, 4, 4},
+                                  Padding::kValid,
+                                  {3, 3, 1, 1},
+                                  {3, 3, 1, 1}},
+        SelectAndScatterTestParam{{3, 3, 4, 4},
+                                  {1, 1, 4, 4},
+                                  Padding::kValid,
+                                  {3, 3, 1, 1},
+                                  {3, 3, 1, 1}},
+        SelectAndScatterTestParam{{9, 3, 4, 4},
+                                  {3, 1, 4, 4},
+                                  Padding::kValid,
+                                  {3, 3, 1, 1},
+                                  {3, 3, 1, 1}},
+        SelectAndScatterTestParam{{7, 3, 4, 4},
+                                  {3, 1, 4, 4},
+                                  Padding::kValid,
+                                  {3, 3, 1, 1},
+                                  {2, 3, 1, 1}},
+        SelectAndScatterTestParam{{1, 1, 5, 5},
+                                  {1, 1, 5, 5},
+                                  Padding::kSame,
+                                  {3, 3, 1, 1},
+                                  {3, 3, 1, 1}},
+        SelectAndScatterTestParam{{7, 7, 8, 256},
+                                  {4, 4, 8, 256},
+                                  Padding::kSame,
+                                  {2, 2, 1, 1},
+                                  {2, 2, 1, 1}},
+        SelectAndScatterTestParam{
+            {6, 4, 4}, {3, 4, 4}, Padding::kSame, {3, 1, 1}, {2, 1, 1}},
+        SelectAndScatterTestParam{
+            {6, 256, 128}, {3, 256, 128}, Padding::kSame, {3, 1, 1}, {2, 1, 1}},
+        SelectAndScatterTestParam{{7, 256, 128},
+                                  {3, 256, 128},
+                                  Padding::kValid,
+                                  {3, 1, 1},
+                                  {2, 1, 1}},
+        SelectAndScatterTestParam{{6, 256, 128},
+                                  {3, 256, 128},
+                                  Padding::kValid,
+                                  {2, 1, 1},
+                                  {2, 1, 1}},
+        SelectAndScatterTestParam{{6, 256, 128},
+                                  {2, 256, 128},
+                                  Padding::kValid,
+                                  {2, 1, 1},
+                                  {3, 1, 1}},
+        SelectAndScatterTestParam{
+            {9, 16, 128}, {3, 16, 128}, Padding::kValid, {3, 1, 1}, {3, 1, 1}},
+        SelectAndScatterTestParam{
+            {3, 4, 4}, {1, 4, 4}, Padding::kValid, {3, 1, 1}, {3, 1, 1}},
+        SelectAndScatterTestParam{
+            {3, 4, 4}, {1, 4, 4}, Padding::kValid, {3, 1, 1}, {3, 1, 1}},
+        SelectAndScatterTestParam{
+            {9, 4, 4}, {3, 4, 4}, Padding::kValid, {3, 1, 1}, {3, 1, 1}},
+        SelectAndScatterTestParam{
+            {7, 4, 4}, {3, 4, 4}, Padding::kValid, {3, 1, 1}, {2, 1, 1}},
+        SelectAndScatterTestParam{
+            {1, 5, 5}, {1, 5, 5}, Padding::kSame, {3, 1, 1}, {3, 1, 1}},
+        SelectAndScatterTestParam{
+            {7, 8, 256}, {4, 8, 256}, Padding::kSame, {2, 1, 1}, {2, 1, 1}}));
 
 // Test for F32 1D array, with a zero-element input.
 XLA_TEST_F(SelectAndScatterTest, R1S0F32) {
diff --git a/tensorflow/compiler/xla/tests/test_utils.cc b/tensorflow/compiler/xla/tests/test_utils.cc
index 0e90a323583de7336556c203a4b46fc14b53454d..b060fb13b1451aab30cfca73bea0a4a598a9fa3a 100644
--- a/tensorflow/compiler/xla/tests/test_utils.cc
+++ b/tensorflow/compiler/xla/tests/test_utils.cc
@@ -24,51 +24,127 @@ namespace xla {
 namespace {
 
 template <typename FloatT>
-void PopulateWithRandomFloatingPointData(Literal* literal) {
+void PopulateWithRandomFloatingPointData(Literal* literal,
+                                         std::minstd_rand0* engine) {
   CHECK_EQ(literal->shape().element_type(),
            primitive_util::NativeToPrimitiveType<FloatT>());
-  std::minstd_rand0 engine;
-  // Create uniform numbers between 1 and 1.125 ot avoid creating denormal
+  // Create uniform numbers between 1 and 1.125 to avoid creating denormal
   // numbers.
   std::uniform_real_distribution<FloatT> generator(1.0f, 1.125f);
+  const bool should_index_bias = ShapeUtil::ElementsIn(literal->shape()) > 1000;
   TF_CHECK_OK(literal->Populate<FloatT>(
       [&](tensorflow::gtl::ArraySlice<int64> indices) {
-        // Generate a random uniforma number from -0.0625 and 0.0625 and bias it
-        // with  a position dependent number with mean 0.037109375. These number
+        // Generate a random uniform number from -0.0625 and 0.0625 and bias it
+        // with a position dependent number with mean 0.037109375. These number
         // should allow for long chains of accumulation without being too close
-        // to zero or to large to accumulate all numbers accurately.
-        return (generator(engine) - 1.0625) +
-               static_cast<FloatT>(Product(indices) % 113 - 47) /
-                   static_cast<FloatT>(256.0f);
+        // to zero or too large to accumulate all numbers accurately. Only do
+        // this for large literals where the number of elements is much greater
+        // than 47 otherwise only negative values are produced.
+        //
+        // The value is positionally biased using a product of the indices. Add
+        // one to each index value to avoid collapsing to zero if any of the
+        // indices are zero.
+        int64 index_product = 1;
+        for (int64 i : indices) {
+          index_product *= (1 + i);
+        }
+        const int64 negative_bias = should_index_bias ? 47 : 0;
+        FloatT index_bias =
+            static_cast<FloatT>(index_product % 113 - negative_bias) /
+            static_cast<FloatT>(256.0f);
+        return (generator(*engine) - 1.0625) + index_bias;
       }));
 }
 
 // The standard library does not have a case for bfloat16, unsurprisingly, so we
 // handle that one specially.
 template <>
-void PopulateWithRandomFloatingPointData<bfloat16>(Literal* literal) {
+void PopulateWithRandomFloatingPointData<bfloat16>(Literal* literal,
+                                                   std::minstd_rand0* engine) {
   CHECK_EQ(literal->shape().element_type(), BF16);
-  std::minstd_rand0 engine;
   std::uniform_real_distribution<float> generator(-0.9f, 1.0f);
   TF_CHECK_OK(literal->Populate<bfloat16>(
       [&](tensorflow::gtl::ArraySlice<int64> /*indices*/) {
-        return static_cast<bfloat16>(generator(engine));
+        return static_cast<bfloat16>(generator(*engine));
       }));
 }
 
 template <typename IntT>
-void PopulateWithRandomIntegralData(Literal* literal) {
+void PopulateWithRandomIntegralData(Literal* literal,
+                                    std::minstd_rand0* engine) {
   CHECK_EQ(literal->shape().element_type(),
            primitive_util::NativeToPrimitiveType<IntT>());
-  std::minstd_rand0 engine;
   std::uniform_int_distribution<IntT> generator(
       std::numeric_limits<IntT>::lowest(), std::numeric_limits<IntT>::max());
   TF_CHECK_OK(literal->Populate<IntT>(
       [&](tensorflow::gtl::ArraySlice<int64> /*indices*/) {
-        return generator(engine);
+        return generator(*engine);
       }));
 }
 
+// Similar to MakeFakeLiteral but takes a random number generator engine to
+// enable reusing the engine across randomly generated literals.
+StatusOr<std::unique_ptr<Literal>> MakeFakeLiteralInternal(
+    const Shape& shape, std::minstd_rand0* engine) {
+  if (ShapeUtil::IsTuple(shape)) {
+    std::vector<std::unique_ptr<Literal>> elements;
+    for (const Shape& element_shape : shape.tuple_shapes()) {
+      TF_ASSIGN_OR_RETURN(std::unique_ptr<Literal> element,
+                          MakeFakeLiteralInternal(element_shape, engine));
+      elements.push_back(std::move(element));
+    }
+    return Literal::MakeTupleOwned(std::move(elements));
+  }
+  std::unique_ptr<Literal> literal = Literal::CreateFromShape(shape);
+  switch (shape.element_type()) {
+    case BF16:
+      PopulateWithRandomFloatingPointData<bfloat16>(literal.get(), engine);
+      break;
+    case F32:
+      PopulateWithRandomFloatingPointData<float>(literal.get(), engine);
+      break;
+    case F64:
+      PopulateWithRandomFloatingPointData<double>(literal.get(), engine);
+      break;
+    case S8:
+      PopulateWithRandomIntegralData<int8>(literal.get(), engine);
+      break;
+    case U8:
+      PopulateWithRandomIntegralData<uint8>(literal.get(), engine);
+      break;
+    case S16:
+      PopulateWithRandomIntegralData<int16>(literal.get(), engine);
+      break;
+    case U16:
+      PopulateWithRandomIntegralData<uint16>(literal.get(), engine);
+      break;
+    case S32:
+      PopulateWithRandomIntegralData<int32>(literal.get(), engine);
+      break;
+    case U32:
+      PopulateWithRandomIntegralData<uint32>(literal.get(), engine);
+      break;
+    case S64:
+      PopulateWithRandomIntegralData<int64>(literal.get(), engine);
+      break;
+    case U64:
+      PopulateWithRandomIntegralData<uint64>(literal.get(), engine);
+      break;
+    case PRED: {
+      std::uniform_int_distribution<int> generator(0, 1);
+      TF_CHECK_OK(literal->Populate<bool>(
+          [&](tensorflow::gtl::ArraySlice<int64> /*indices*/) {
+            return generator(*engine);
+          }));
+      break;
+    }
+    default:
+      return Unimplemented("Unsupported type for fake literal generation: %s",
+                           ShapeUtil::HumanString(shape).c_str());
+  }
+  return std::move(literal);
+}
+
 // Matches binary addition computations.
 bool LooksLikeSum(const HloComputation& computation) {
   const HloInstruction* const root = computation.root_instruction();
@@ -95,15 +171,15 @@ bool NeedsZeroInitValue(const HloUse& use) {
 // Generate random values that are constrained to the input_shape minus the
 // output_shape so as not to produce wrapping slices, for instance.
 std::unique_ptr<Literal> MakeRandomNonwrappingSliceIndex(
-    const Shape& input_shape, const Shape& slice_shape) {
+    const Shape& input_shape, const Shape& slice_shape,
+    std::minstd_rand0* engine) {
   const int64 rank = ShapeUtil::Rank(input_shape);
   std::vector<int32> start_indices(rank);
-  std::minstd_rand0 engine;
   for (int i = 0; i < rank; ++i) {
     const int32 upper_bound = ShapeUtil::GetDimension(input_shape, i) -
                               ShapeUtil::GetDimension(slice_shape, i);
     std::uniform_int_distribution<int32> generator(0, upper_bound);
-    start_indices[i] = generator(engine);
+    start_indices[i] = generator(*engine);
   }
   return Literal::CreateR1<int32>(start_indices);
 }
@@ -150,7 +226,7 @@ std::vector<HloInstruction*> FindConstrainedUses(
 // zero in the case of init_values for reductions).
 StatusOr<std::unique_ptr<Literal>> CreateLiteralForConstrainedUses(
     const tensorflow::gtl::ArraySlice<HloInstruction*> constrained_uses,
-    const HloInstruction& param) {
+    const HloInstruction& param, std::minstd_rand0* engine) {
   HloInstruction* needs_index = nullptr;
   HloInstruction* needs_zero = nullptr;
   for (HloInstruction* use : constrained_uses) {
@@ -185,93 +261,39 @@ StatusOr<std::unique_ptr<Literal>> CreateLiteralForConstrainedUses(
   }
   if (needs_index != nullptr) {
     return MakeRandomNonwrappingSliceIndex(needs_index->operand(0)->shape(),
-                                           needs_index->shape());
+                                           needs_index->shape(), engine);
   } else if (needs_zero != nullptr) {
     return Literal::CreateFromShape(param.shape());
   } else {
-    return MakeFakeLiteral(param.shape());
+    return MakeFakeLiteralInternal(param.shape(), engine);
   }
 }
 
 // Given a module entry parameter, use the dataflow analysis to see if a
 // special case literal must be created, or if we can generate fake data.
 StatusOr<std::unique_ptr<Literal>> MakeConstrainedArgument(
-    const HloDataflowAnalysis& dataflow, const HloInstruction& param) {
+    const HloDataflowAnalysis& dataflow, const HloInstruction& param,
+    std::minstd_rand0* engine) {
   const auto constrained_uses = FindConstrainedUses(dataflow, param);
-  return CreateLiteralForConstrainedUses(constrained_uses, param);
+  return CreateLiteralForConstrainedUses(constrained_uses, param, engine);
 }
 
 }  // namespace
 
 StatusOr<std::unique_ptr<Literal>> MakeFakeLiteral(const Shape& shape) {
-  if (ShapeUtil::IsTuple(shape)) {
-    std::vector<std::unique_ptr<Literal>> elements;
-    for (const Shape& element_shape : shape.tuple_shapes()) {
-      TF_ASSIGN_OR_RETURN(std::unique_ptr<Literal> element,
-                          MakeFakeLiteral(element_shape));
-      elements.push_back(std::move(element));
-    }
-    return Literal::MakeTupleOwned(std::move(elements));
-  }
-  std::unique_ptr<Literal> literal = Literal::CreateFromShape(shape);
-  switch (shape.element_type()) {
-    case BF16:
-      PopulateWithRandomFloatingPointData<bfloat16>(literal.get());
-      break;
-    case F32:
-      PopulateWithRandomFloatingPointData<float>(literal.get());
-      break;
-    case F64:
-      PopulateWithRandomFloatingPointData<double>(literal.get());
-      break;
-    case S8:
-      PopulateWithRandomIntegralData<int8>(literal.get());
-      break;
-    case U8:
-      PopulateWithRandomIntegralData<uint8>(literal.get());
-      break;
-    case S16:
-      PopulateWithRandomIntegralData<int16>(literal.get());
-      break;
-    case U16:
-      PopulateWithRandomIntegralData<uint16>(literal.get());
-      break;
-    case S32:
-      PopulateWithRandomIntegralData<int32>(literal.get());
-      break;
-    case U32:
-      PopulateWithRandomIntegralData<uint32>(literal.get());
-      break;
-    case S64:
-      PopulateWithRandomIntegralData<int64>(literal.get());
-      break;
-    case U64:
-      PopulateWithRandomIntegralData<uint64>(literal.get());
-      break;
-    case PRED: {
-      std::uniform_int_distribution<int> generator(0, 1);
-      std::minstd_rand0 engine;
-      TF_CHECK_OK(literal->Populate<bool>(
-          [&](tensorflow::gtl::ArraySlice<int64> /*indices*/) {
-            return generator(engine);
-          }));
-      break;
-    }
-    default:
-      return Unimplemented("Unsupported type for fake literal generation: %s",
-                           ShapeUtil::HumanString(shape).c_str());
-  }
-  return std::move(literal);
+  std::minstd_rand0 engine;
+  return MakeFakeLiteralInternal(shape, &engine);
 }
 
 StatusOr<std::vector<std::unique_ptr<Literal>>> MakeFakeArguments(
     HloModule* const module) {
   TF_ASSIGN_OR_RETURN(auto dataflow, HloDataflowAnalysis::Run(module));
   const auto params = module->entry_computation()->parameter_instructions();
+  std::minstd_rand0 engine;
   std::vector<std::unique_ptr<Literal>> arguments(params.size());
   for (int i = 0; i < params.size(); ++i) {
-    TF_ASSIGN_OR_RETURN(arguments[i],
-                        MakeConstrainedArgument(*dataflow, *params[i]));
+    TF_ASSIGN_OR_RETURN(
+        arguments[i], MakeConstrainedArgument(*dataflow, *params[i], &engine));
   }
   return std::move(arguments);
 }
diff --git a/tensorflow/compiler/xla/tests/unary_op_test.cc b/tensorflow/compiler/xla/tests/unary_op_test.cc
index fa4192e9281784a4a3063601afe89fba6a9dac18..835e2d7e5594d7c8c6e523f9806e32dce23a87e9 100644
--- a/tensorflow/compiler/xla/tests/unary_op_test.cc
+++ b/tensorflow/compiler/xla/tests/unary_op_test.cc
@@ -215,5 +215,23 @@ XLA_TEST_F(UnaryOpTest, SignAbsTestR2) {
   ComputeAndCompareR2<float>(&builder, {{0, 0}, {0, 0}}, {});
 }
 
+XLA_TEST_F(UnaryOpTest, ConvertElementTypePredToS32) {
+  ComputationBuilder builder(client_, TestName());
+  auto lhs = builder.ConstantR1<int32>({0, 1});
+  auto rhs = builder.ConstantR1<int32>({1, 1});
+  builder.ConvertElementType(builder.Eq(lhs, rhs), S32);
+
+  ComputeAndCompareR1<int32>(&builder, {0, 1}, {});
+}
+
+XLA_TEST_F(UnaryOpTest, ConvertElementTypePredToF32) {
+  ComputationBuilder builder(client_, TestName());
+  auto lhs = builder.ConstantR1<int32>({0, 1});
+  auto rhs = builder.ConstantR1<int32>({1, 1});
+  builder.ConvertElementType(builder.Eq(lhs, rhs), F32);
+
+  ComputeAndCompareR1<float>(&builder, {0.0, 1.0}, {});
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
index 146fbadcb68e6c5d0fa0856c1c98b399df72051f..9ad2a1985331b80625dd0687ea052300bc99e440 100644
--- a/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
+++ b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
@@ -19,12 +19,14 @@ limitations under the License.
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/map_util.h"
 #include "tensorflow/compiler/xla/service/platform_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
 #include "tensorflow/compiler/xla/tests/test_utils.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/gtl/flatmap.h"
 #include "tensorflow/core/platform/regexp.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/types.h"
@@ -32,6 +34,7 @@ limitations under the License.
 namespace xla {
 namespace {
 namespace se = ::perftools::gputools;
+namespace gtl = ::tensorflow::gtl;
 
 class HloProfileTest : public ClientLibraryTestBase {};
 
@@ -43,39 +46,74 @@ struct ParsedProfileOutputLine {
   string trops;
   string bytes_per_sec;
   string bytes_per_cycle;
-  string name;
+  string opcode;
 };
 
-StatusOr<ParsedProfileOutputLine> ParseProfileOutputLine(const string& line,
-                                                         bool expect_flops,
-                                                         bool expect_trops) {
+::testing::AssertionResult HasFlops(
+    const ParsedProfileOutputLine& parsed_line) {
+  if (RE2::FullMatch(parsed_line.flops, "[0-9.TGMk]+FLOP/s")) {
+    return ::testing::AssertionSuccess()
+           << "'flops' field present in  " << parsed_line.opcode << ": '"
+           << parsed_line.flops << "'";
+  }
+
+  return ::testing::AssertionFailure()
+         << "'flops' field absent in  " << parsed_line.opcode << ": '"
+         << parsed_line.flops << "'";
+}
+
+::testing::AssertionResult HasTrops(
+    const ParsedProfileOutputLine& parsed_line) {
+  if (RE2::FullMatch(parsed_line.trops, "[0-9.TGMk]+TROP/s")) {
+    return ::testing::AssertionSuccess()
+           << "'trops' field present in  " << parsed_line.opcode << ": '"
+           << parsed_line.trops << "'";
+  }
+
+  return ::testing::AssertionFailure()
+         << "'trops' field absent in  " << parsed_line.opcode << ": '"
+         << parsed_line.trops << "'";
+}
+
+Status ParseOneProfileOutputLine(
+    const string& line, bool expect_hlo,
+    gtl::FlatMap<string, ParsedProfileOutputLine>* parsed_results) {
   string separator = "[^:]*:: +";
   string match_percentage = "\\d+\\.\\d\\d%";
   string match_cycles = "(\\d+) cycles +\\( *(" + match_percentage + ")\\)";
   string match_usecs = "([0-9.]+) usec";
-  string match_flops = expect_flops ? "([0-9.TGMk]+)FLOP/s" : "(<none>)";
-  string match_trops = expect_trops ? "([0-9.TGMk]+)TROP/s" : "(<none>)";
+  string match_flops = "([^ ]+)";
+  string match_trops = "([^ ]+)";
   string match_bytes_per_sec = "([0-9.TGMKi]+)B/s";
   string match_bytes_per_cycle = "([0-9.TGMKi]+)B/cycle";
+
+  // The underlined part is what we're trying to match with match_opcode:
+  //
+  //   %dot33 = f32[256,256]{1,0} dot(...)
+  //                              ^^^
+
+  string match_opcode =
+      expect_hlo ? "%[^=]+= [^ ]+ ([^(]+)\\(.*" : "(\\[total\\])";
   string regexp_pattern = tensorflow::strings::StrCat(
       " +", match_cycles, separator, match_usecs, separator, match_flops,
       separator, match_trops, separator, match_bytes_per_sec, separator,
-      match_bytes_per_cycle, separator, "(.*)");
+      match_bytes_per_cycle, separator, match_opcode);
 
-  RE2 pattern(regexp_pattern);
   ParsedProfileOutputLine parsed_line;
   bool matched = RE2::FullMatch(
-      line, pattern, &parsed_line.cycles, &parsed_line.cycles_percentage,
+      line, regexp_pattern, &parsed_line.cycles, &parsed_line.cycles_percentage,
       &parsed_line.usec, &parsed_line.flops, &parsed_line.trops,
       &parsed_line.bytes_per_sec, &parsed_line.bytes_per_cycle,
-      &parsed_line.name);
+      &parsed_line.opcode);
   if (!matched) {
     return tensorflow::errors::InvalidArgument(
         "Input did not match regexp.  Input: ", line,
         ", Regexp: ", regexp_pattern);
   }
 
-  return parsed_line;
+  InsertOrDie(parsed_results, parsed_line.opcode, parsed_line);
+
+  return Status::OK();
 }
 
 // Returns void so that we can ASSERT.
@@ -110,7 +148,8 @@ void ExecuteAndFetchProfile(string* profile_output, LocalClient* client,
 
   Executable* executable = local_executable->executable();
   HloExecutionProfile hlo_execution_profile(
-      &executable->hlo_profile_printer(), &executable->hlo_profile_index_map());
+      &executable->hlo_profile_printer_data(),
+      &executable->hlo_profile_index_map());
 
   TF_ASSERT_OK_AND_ASSIGN(
       Backend::StreamPtr stream_ptr,
@@ -147,7 +186,7 @@ XLA_TEST_F(HloProfileTest, DISABLED_ON_CPU_PARALLEL(ProfileSingleComputation)) {
                           ClientLibrary::GetOrCreateLocalClient(platform));
 
   ComputationBuilder builder(client, TestName());
-  auto result = builder.Tanh(builder.Dot(
+  auto result = builder.Tanh(builder.Add(
       builder.Parameter(0, ShapeUtil::MakeShape(F32, {m, k}), "dot_lhs"),
       builder.Parameter(1, ShapeUtil::MakeShape(F32, {k, n}), "dot_rhs")));
 
@@ -160,31 +199,43 @@ XLA_TEST_F(HloProfileTest, DISABLED_ON_CPU_PARALLEL(ProfileSingleComputation)) {
   std::vector<string> profile_output_lines =
       tensorflow::str_util::Split(profile_output, '\n');
 
-  TF_ASSERT_OK_AND_ASSIGN(
-      ParsedProfileOutputLine total_profile,
-      ParseProfileOutputLine(profile_output_lines[1], /*expect_flops=*/true,
-                             /*expect_trops=*/true));
+  gtl::FlatMap<string, ParsedProfileOutputLine> parsed_profile_lines;
 
-  TF_ASSERT_OK_AND_ASSIGN(
-      ParsedProfileOutputLine dot_profile,
-      ParseProfileOutputLine(profile_output_lines[2], /*expect_flops=*/true,
-                             /*expect_trops=*/false));
+  TF_ASSERT_OK(ParseOneProfileOutputLine(
+      profile_output_lines[1], /*expect_hlo=*/false, &parsed_profile_lines));
 
-  TF_ASSERT_OK_AND_ASSIGN(
-      ParsedProfileOutputLine tanh_profile,
-      ParseProfileOutputLine(profile_output_lines[3], /*expect_flops=*/false,
-                             /*expect_trops=*/true));
+  TF_ASSERT_OK(ParseOneProfileOutputLine(
+      profile_output_lines[2], /*expect_hlo=*/true, &parsed_profile_lines));
+
+  TF_ASSERT_OK(ParseOneProfileOutputLine(
+      profile_output_lines[3], /*expect_hlo=*/true, &parsed_profile_lines));
+
+  TF_ASSERT_OK_AND_ASSIGN(ParsedProfileOutputLine total_profile,
+                          MaybeFind(parsed_profile_lines, "[total]"));
+  TF_ASSERT_OK_AND_ASSIGN(ParsedProfileOutputLine dot_profile,
+                          MaybeFind(parsed_profile_lines, "add"));
+  TF_ASSERT_OK_AND_ASSIGN(ParsedProfileOutputLine tanh_profile,
+                          MaybeFind(parsed_profile_lines, "tanh"));
 
   EXPECT_GT(total_profile.cycles, 0);
   EXPECT_EQ(total_profile.cycles_percentage, "100.00%");
 
+  EXPECT_TRUE(HasFlops(total_profile));
+  EXPECT_TRUE(HasTrops(total_profile));
+
   EXPECT_GT(total_profile.cycles, dot_profile.cycles);
   EXPECT_NE(dot_profile.cycles_percentage, "0.00%");
   EXPECT_NE(dot_profile.cycles_percentage, "100.00%");
 
+  EXPECT_TRUE(HasFlops(dot_profile));
+  EXPECT_FALSE(HasTrops(dot_profile));
+
   EXPECT_GT(total_profile.cycles, tanh_profile.cycles);
   EXPECT_NE(tanh_profile.cycles_percentage, "0.00%");
   EXPECT_NE(tanh_profile.cycles_percentage, "100.00%");
+
+  EXPECT_FALSE(HasFlops(tanh_profile));
+  EXPECT_TRUE(HasTrops(tanh_profile));
 }
 
 // TODO(b/71364943): This test exposes a bug in the parallel CPU backend.
@@ -219,7 +270,7 @@ XLA_TEST_F(HloProfileTest,
     auto matrix = builder.GetTupleElement(state, 1);
     auto next_iteration = builder.Add(builder.GetTupleElement(state, 0),
                                       builder.ConstantR0<int32>(1));
-    builder.Tuple({next_iteration, builder.Dot(matrix, matrix)});
+    builder.Tuple({next_iteration, builder.Add(matrix, matrix)});
     TF_ASSERT_OK_AND_ASSIGN(body, builder.Build());
   }
 
@@ -248,20 +299,23 @@ XLA_TEST_F(HloProfileTest,
 
   ASSERT_NE(while_body_profile_start, profile_output_lines.end());
 
-  TF_ASSERT_OK_AND_ASSIGN(
-      ParsedProfileOutputLine total_while_body_profile,
-      ParseProfileOutputLine(*std::next(while_body_profile_start, 1),
-                             /*expect_flops=*/false,
-                             /*expect_trops=*/false));
+  gtl::FlatMap<string, ParsedProfileOutputLine> parsed_profile_lines;
 
-  TF_ASSERT_OK_AND_ASSIGN(
-      ParsedProfileOutputLine dot_profile,
-      ParseProfileOutputLine(*std::next(while_body_profile_start, 2),
-                             /*expect_flops=*/false,
-                             /*expect_trops=*/false));
+  TF_ASSERT_OK(
+      ParseOneProfileOutputLine(*std::next(while_body_profile_start, 1),
+                                /*expect_hlo=*/false, &parsed_profile_lines));
+
+  TF_ASSERT_OK(
+      ParseOneProfileOutputLine(*std::next(while_body_profile_start, 2),
+                                /*expect_hlo=*/true, &parsed_profile_lines));
+
+  TF_ASSERT_OK_AND_ASSIGN(ParsedProfileOutputLine total_while_body_profile,
+                          MaybeFind(parsed_profile_lines, "[total]"));
+  TF_ASSERT_OK_AND_ASSIGN(ParsedProfileOutputLine dot_profile,
+                          MaybeFind(parsed_profile_lines, "add"));
 
   EXPECT_GT(total_while_body_profile.cycles, 0);
-  EXPECT_EQ(total_while_body_profile.name, "[total]");
+  EXPECT_EQ(total_while_body_profile.opcode, "[total]");
   EXPECT_EQ(total_while_body_profile.cycles_percentage, "100.00%");
 
   EXPECT_GT(total_while_body_profile.cycles, dot_profile.cycles);
diff --git a/tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc b/tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc
index 5ede37b8737bd4fa6235464ddeb6382af17c8a80..b82f1c81c84b487c1661af5267b9123da97bb107 100644
--- a/tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc
+++ b/tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc
@@ -85,10 +85,12 @@ void RealMain(tensorflow::gtl::ArraySlice<char*> args) {
     for (int i = 0; i < program_shape->parameters_size(); ++i) {
       layouts.push_back(&program_shape->parameters(i));
     }
+    ExecutableBuildOptions build_options;
+    build_options.set_device_ordinal(0);
+    build_options.set_result_layout(program_shape->result());
     StatusOr<std::unique_ptr<Executable>> executable =
         local_service->CompileExecutable(computation.handle(), layouts,
-                                         &program_shape->result(),
-                                         /*device_ordinal=*/0);
+                                         build_options);
 
     const HloModule& module = executable.ValueOrDie()->module();
 
diff --git a/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc b/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc
index 24417a0cb8212e59cc0af53bd5bb21afcf3e134b..05c0fdf97d27c09eb2bbb0f265b5b2a5982ca7b1 100644
--- a/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc
+++ b/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc
@@ -60,10 +60,13 @@ void RealMain(tensorflow::gtl::ArraySlice<char*> args, bool compile) {
       for (int i = 0; i < program_shape->parameters_size(); ++i) {
         layouts.push_back(&program_shape->parameters(i));
       }
+
+      ExecutableBuildOptions build_options;
+      build_options.set_device_ordinal(0);
+      build_options.set_result_layout(program_shape->result());
       StatusOr<std::unique_ptr<Executable>> executable =
           local_service->CompileExecutable(computation.handle(), layouts,
-                                           &program_shape->result(),
-                                           /*device_ordinal=*/0);
+                                           build_options);
 
       const HloModule& module = executable.ValueOrDie()->module();
 
diff --git a/tensorflow/compiler/xla/tools/hlo_proto_to_json.cc b/tensorflow/compiler/xla/tools/hlo_proto_to_json.cc
index 4e02e17db65c0a4220672733be8319e1a0cc4f0f..8460ae3e4991ee091af72d2553a8491f627c722e 100644
--- a/tensorflow/compiler/xla/tools/hlo_proto_to_json.cc
+++ b/tensorflow/compiler/xla/tools/hlo_proto_to_json.cc
@@ -19,7 +19,7 @@ limitations under the License.
 //
 // Reads one serilized Hlo module, convert it into JSON format and dump into
 // some output directory. some_binaray_proto is obtained by serializing Hlo
-// module to disk using --xla_dump_hlo_proto_to debug optoin.
+// module to disk using --xla_dump_optimized_hlo_proto_to debug option.
 
 #include <stdio.h>
 #include <string>
diff --git a/tensorflow/compiler/xla/tools/parser/hlo_parser.cc b/tensorflow/compiler/xla/tools/parser/hlo_parser.cc
index 42e7f91f26f3454b247d95d328c3422c44131c43..d9c4d094b8ea3b4de9164f35d670fc686c921a27 100644
--- a/tensorflow/compiler/xla/tools/parser/hlo_parser.cc
+++ b/tensorflow/compiler/xla/tools/parser/hlo_parser.cc
@@ -2173,7 +2173,7 @@ bool HloParser::ParseConvolutionDimensionNumbers(
 //
 //  {[2:3:4], [5:6:7], [8:9]}
 //
-// The the parsed result will be:
+// The parsed result will be:
 //
 //  {/*starts=*/{2, 5, 8}, /*limits=*/{3, 6, 9}, /*strides=*/{4, 7, 1}}
 //
diff --git a/tensorflow/compiler/xla/util.cc b/tensorflow/compiler/xla/util.cc
index fe5d29a6b655a89d559eb1214c2b8dd54d34094c..1f0c626bbb2d64ef4e67c9ec51485ae96ae73d04 100644
--- a/tensorflow/compiler/xla/util.cc
+++ b/tensorflow/compiler/xla/util.cc
@@ -30,9 +30,7 @@ limitations under the License.
 #include "tensorflow/core/platform/stacktrace.h"
 
 namespace xla {
-namespace {
 
-// Logs the provided status message with a backtrace.
 Status WithLogBacktrace(const Status& status) {
   CHECK(!status.ok());
   VLOG(1) << status.ToString();
@@ -40,8 +38,6 @@ Status WithLogBacktrace(const Status& status) {
   return status;
 }
 
-}  // namespace
-
 ScopedLoggingTimer::ScopedLoggingTimer(const string& label, bool enabled)
     : enabled(enabled), label(label) {
   if (enabled) {
@@ -74,13 +70,18 @@ Status AppendStatus(Status prior, tensorflow::StringPiece context) {
 // Implementation note: we can't common these out (without using macros) because
 // they all need to va_start/va_end their varargs in their frame.
 
-Status InvalidArgument(const char* format, ...) {
+Status InvalidArgumentV(const char* format, va_list args) {
   string message;
+  tensorflow::strings::Appendv(&message, format, args);
+  return WithLogBacktrace(tensorflow::errors::InvalidArgument(message));
+}
+
+Status InvalidArgument(const char* format, ...) {
   va_list args;
   va_start(args, format);
-  tensorflow::strings::Appendv(&message, format, args);
+  Status result = InvalidArgumentV(format, args);
   va_end(args);
-  return WithLogBacktrace(tensorflow::errors::InvalidArgument(message));
+  return result;
 }
 
 Status Unimplemented(const char* format, ...) {
@@ -338,7 +339,7 @@ std::vector<std::pair<int64, int64>> CommonFactors(
 
 string SanitizeFileName(string file_name) {
   for (char& c : file_name) {
-    if (c == '/' || c == '\\' || c == '[' || c == ']') {
+    if (c == '/' || c == '\\' || c == '[' || c == ']' || c == ' ') {
       c = '_';
     }
   }
diff --git a/tensorflow/compiler/xla/util.h b/tensorflow/compiler/xla/util.h
index bb2db2010c5e0da6ed3fde628eb5928d555815b2..08df5b12b3a53a138f56705531baa3333b23c5d8 100644
--- a/tensorflow/compiler/xla/util.h
+++ b/tensorflow/compiler/xla/util.h
@@ -40,6 +40,13 @@ limitations under the License.
 
 namespace xla {
 
+// Logs the provided status message with a backtrace.
+//
+// For use by Status-factories, logs a backtrace at the point where the status
+// is created, such that we can use --vmodule=util=1 to see all status
+// creation backtraces.
+Status WithLogBacktrace(const Status& status);
+
 // Ranks greater than 8 are very rare, so use InlinedVector<int64, 8> to store
 // the bounds and indices. And for the rare cases of ranks greater than 8,
 // the InlinedVector will just behave like an std::vector<> and allocate the
@@ -207,6 +214,27 @@ Status ResourceExhausted(const char* format, ...) TF_PRINTF_ATTRIBUTE(1, 2);
 Status NotFound(const char* format, ...) TF_PRINTF_ATTRIBUTE(1, 2);
 Status Unavailable(const char* format, ...) TF_PRINTF_ATTRIBUTE(1, 2);
 
+// Passed-varargs variant of the InvalidArgument factory above.
+Status InvalidArgumentV(const char* format, va_list args);
+
+template <typename... Args>
+Status UnimplementedStrCat(Args&&... concat) {
+  return Unimplemented(
+      "%s", tensorflow::strings::StrCat(std::forward<Args>(concat)...).c_str());
+}
+
+template <typename... Args>
+Status InternalErrorStrCat(Args&&... concat) {
+  return InternalError(
+      "%s", tensorflow::strings::StrCat(std::forward<Args>(concat)...).c_str());
+}
+
+template <typename... Args>
+Status ResourceExhaustedStrCat(Args&&... concat) {
+  return ResourceExhausted(
+      "%s", tensorflow::strings::StrCat(std::forward<Args>(concat)...).c_str());
+}
+
 // Splits the lines of the original, replaces leading whitespace with the prefix
 // given by "indentation", and returns the string joined by newlines again. As a
 // side effect, any additional trailing whitespace is removed.
@@ -332,7 +360,7 @@ T CeilOfRatio(T dividend, T divisor) {
 }
 
 // Rounds the value up to a multiple of the divisor by first calling CeilOfRatio
-// then multiplying by the divisor. For example: RoundUpToMultiple(13, 8) => 16
+// then multiplying by the divisor. For example: RoundUpToNearest(13, 8) => 16
 template <typename T>
 T RoundUpToNearest(T value, T divisor) {
   return CeilOfRatio(value, divisor) * divisor;
@@ -340,7 +368,7 @@ T RoundUpToNearest(T value, T divisor) {
 
 // Rounds the value down to a multiple of the divisor by first calling
 // FloorOfRatio then multiplying by the divisor. For example:
-// RoundUpToMultiple(13, 8) => 8
+// RoundDownToNearest(13, 8) => 8
 template <typename T>
 T RoundDownToNearest(T value, T divisor) {
   return FloorOfRatio(value, divisor) * divisor;
@@ -398,13 +426,11 @@ std::vector<std::pair<int64, int64>> CommonFactors(
 // Removes illegal characters from filenames.
 string SanitizeFileName(string file_name);
 
-// Simple wrapper around std::all_of.
 template <typename Container, typename Predicate>
 bool c_all_of(Container container, Predicate predicate) {
   return std::all_of(std::begin(container), std::end(container), predicate);
 }
 
-// Simple wrapper around std::transform.
 template <typename InputContainer, typename OutputIterator,
           typename UnaryOperation>
 OutputIterator c_transform(InputContainer input_container,
@@ -414,7 +440,6 @@ OutputIterator c_transform(InputContainer input_container,
                         output_iterator, unary_op);
 }
 
-// Simple wrapper around std::copy_if.
 template <class InputContainer, class OutputIterator, class UnaryPredicate>
 OutputIterator c_copy_if(InputContainer input_container,
                          OutputIterator output_iterator,
@@ -423,6 +448,11 @@ OutputIterator c_copy_if(InputContainer input_container,
                       output_iterator, predicate);
 }
 
+template <class InputContainer, class Comparator>
+void c_sort(InputContainer& input_container, Comparator comparator) {
+  std::sort(input_container.begin(), input_container.end(), comparator);
+}
+
 }  // namespace xla
 
 #define XLA_LOG_LINES(SEV, STRING) \
diff --git a/tensorflow/compiler/xla/xla.proto b/tensorflow/compiler/xla/xla.proto
index e1ed08c8480fa73e9c5ff914bb9f5e38f1ce96e9..56162ab44e2e0e3e4478fe631888f243332dc1d8 100644
--- a/tensorflow/compiler/xla/xla.proto
+++ b/tensorflow/compiler/xla/xla.proto
@@ -82,8 +82,9 @@ message DebugOptions {
   // Dump all HLO modules as text into the provided directory path.
   string xla_generate_hlo_text_to = 7;
 
-  // Dump compilation artifacts in binary proto into this directory.
-  string xla_dump_hlo_proto_to = 8;
+  // Dump Hlo after all hlo passes are executed as proto binary into this
+  // directory.
+  string xla_dump_optimized_hlo_proto_to = 8;
 
   // Instrument the computation to collect per-HLO cycle counts.
   bool xla_hlo_profile = 9;
@@ -179,9 +180,13 @@ message DebugOptions {
   // ops.
   bool xla_gpu_use_cudnn_batchnorm = 94;
 
-  // Dump compilation artifacts, before hlo passes are executed, in binary proto
-  // into this directory.
-  string xla_dump_prepass_hlo_proto_to = 95;
+  // Dump HLO before any hlo passes are executed as proto binary into this
+  // directory.
+  string xla_dump_unoptimized_hlo_proto_to = 95;
+
+  // Dump HLO after each pass as an HloProto in binary file format into this
+  // directory.
+  string xla_dump_per_pass_hlo_proto_to = 96;
 
   // Extra options to pass to the compilation backend; specific interpretation
   // of these values is left to the backend.
diff --git a/tensorflow/contrib/BUILD b/tensorflow/contrib/BUILD
index f1e54432faa3c59ada0d89c472bcdcc28f6d0970..1c497c666bd73e0ae39b750307c25e3b28bfaf2d 100644
--- a/tensorflow/contrib/BUILD
+++ b/tensorflow/contrib/BUILD
@@ -24,6 +24,7 @@ py_library(
         "//tensorflow/contrib/bayesflow:bayesflow_py",
         "//tensorflow/contrib/boosted_trees:init_py",
         "//tensorflow/contrib/cloud:cloud_py",
+        "//tensorflow/contrib/cluster_resolver:cluster_resolver_pip",
         "//tensorflow/contrib/cluster_resolver:cluster_resolver_py",
         "//tensorflow/contrib/coder:coder_ops_py",
         "//tensorflow/contrib/compiler:compiler_py",
@@ -48,6 +49,7 @@ py_library(
         "//tensorflow/contrib/image:single_image_random_dot_stereograms_py",
         "//tensorflow/contrib/input_pipeline:input_pipeline_py",
         "//tensorflow/contrib/integrate:integrate_py",
+        "//tensorflow/contrib/kafka",
         "//tensorflow/contrib/keras",
         "//tensorflow/contrib/kernel_methods",
         "//tensorflow/contrib/kfac",
@@ -76,6 +78,7 @@ py_library(
         "//tensorflow/contrib/predictor",
         "//tensorflow/contrib/quantization:quantization_py",
         "//tensorflow/contrib/quantize:quantize_graph",
+        "//tensorflow/contrib/py2tf",
         "//tensorflow/contrib/receptive_field:receptive_field_py",
         "//tensorflow/contrib/reduce_slice_ops:reduce_slice_ops_py",
         "//tensorflow/contrib/remote_fused_graph/pylib:remote_fused_graph_ops_py",
@@ -139,6 +142,7 @@ cc_library(
         "//tensorflow/contrib/factorization:all_ops",
         "//tensorflow/contrib/framework:all_ops",
         "//tensorflow/contrib/input_pipeline:input_pipeline_ops_op_lib",
+        "//tensorflow/contrib/kafka:kafka_ops_op_lib",
         "//tensorflow/contrib/layers:sparse_feature_cross_op_op_lib",
         "//tensorflow/contrib/nccl:nccl_ops_op_lib",
         "//tensorflow/contrib/nearest_neighbor:nearest_neighbor_ops_op_lib",
diff --git a/tensorflow/contrib/android/java/org/tensorflow/contrib/android/TensorFlowInferenceInterface.java b/tensorflow/contrib/android/java/org/tensorflow/contrib/android/TensorFlowInferenceInterface.java
index dc5b9fb88742d78d0f40207b589e29451a6358dd..e51e3f747b57cf1c9fd79ee5cc9fdb8acee349c9 100644
--- a/tensorflow/contrib/android/java/org/tensorflow/contrib/android/TensorFlowInferenceInterface.java
+++ b/tensorflow/contrib/android/java/org/tensorflow/contrib/android/TensorFlowInferenceInterface.java
@@ -194,6 +194,13 @@ public class TensorFlowInferenceInterface {
    * @param outputNames A list of output nodes which should be filled by the inference pass.
    */
   public void run(String[] outputNames, boolean enableStats) {
+    run(outputNames, enableStats, new String[] {});
+  }
+
+  /**
+   * An overloaded version of runInference that allows supplying targetNodeNames as well
+   */
+  public void run(String[] outputNames, boolean enableStats, String[] targetNodeNames) {
     // Release any Tensors from the previous run calls.
     closeFetches();
 
@@ -204,6 +211,11 @@ public class TensorFlowInferenceInterface {
       runner.fetch(tid.name, tid.outputIndex);
     }
 
+	// Add targets.
+    for (String t : targetNodeNames) {
+      runner.addTarget(t);
+    }
+
     // Run the session.
     try {
       if (enableStats) {
diff --git a/tensorflow/contrib/android/jni/run_stats_jni.cc b/tensorflow/contrib/android/jni/run_stats_jni.cc
index 119fa9cd2c378d2ba2383ea8b0e09e1b6083d84e..707853b59befc2625145ad96952fbf9f66d62b43 100644
--- a/tensorflow/contrib/android/jni/run_stats_jni.cc
+++ b/tensorflow/contrib/android/jni/run_stats_jni.cc
@@ -21,8 +21,8 @@ limitations under the License.
 #include "tensorflow/core/protobuf/config.pb.h"
 #include "tensorflow/core/util/stat_summarizer.h"
 
-using tensorflow::StatSummarizer;
 using tensorflow::RunMetadata;
+using tensorflow::StatSummarizer;
 
 namespace {
 StatSummarizer* requireHandle(JNIEnv* env, jlong handle) {
diff --git a/tensorflow/contrib/bayesflow/BUILD b/tensorflow/contrib/bayesflow/BUILD
index 11c3c037c4e8b4ba41eae60d28d6aac49f1488f2..6e0f0a05726a46b513a4270fd5843ff20fc95a18 100644
--- a/tensorflow/contrib/bayesflow/BUILD
+++ b/tensorflow/contrib/bayesflow/BUILD
@@ -217,6 +217,7 @@ cuda_py_test(
         "//tensorflow/python:platform_test",
         "//tensorflow/python:random_seed",
     ],
+    tags = ["notsan"],
 )
 
 cuda_py_test(
diff --git a/tensorflow/contrib/bayesflow/python/ops/custom_grad_impl.py b/tensorflow/contrib/bayesflow/python/ops/custom_grad_impl.py
index fdc12e3b21466a2c552124d6c6a339a0c25f9f46..d44fe6529a7ff0da0c6747e193fdb98a272a8da3 100644
--- a/tensorflow/contrib/bayesflow/python/ops/custom_grad_impl.py
+++ b/tensorflow/contrib/bayesflow/python/ops/custom_grad_impl.py
@@ -31,8 +31,7 @@ __all__ = [
 ]
 
 
-def custom_gradient(fx, gx, x, axis=(),
-                    fx_gx_manually_stopped=False,
+def custom_gradient(fx, gx, x, axis=(), fx_gx_manually_stopped=False,
                     name=None):
   """Enables specifying a custom gradient.
 
@@ -43,7 +42,8 @@ def custom_gradient(fx, gx, x, axis=(),
   h(x) = x * stop_gradient(g(x)) + stop_gradient(f(x) - x * g(x))
   ```
 
-  is such that `h(x) = stop_gradient(f(x))` and `grad[h(x), x] = stop_gradient(g(x)).`
+  is such that `h(x) = stop_gradient(f(x))` and `grad[h(x), x] =
+  stop_gradient(g(x)).`
 
   In addition to scalar-domain/scalar-range functions, this function also
   supports tensor-domain/scalar-range functions. However, in the latter case it
diff --git a/tensorflow/contrib/boosted_trees/kernels/model_ops.cc b/tensorflow/contrib/boosted_trees/kernels/model_ops.cc
index 4b5d5ba0de6c3995ee2da7a44ab0ba099cbf1b35..754b7bc3270d647fc381033b769eadd7b791771e 100644
--- a/tensorflow/contrib/boosted_trees/kernels/model_ops.cc
+++ b/tensorflow/contrib/boosted_trees/kernels/model_ops.cc
@@ -48,8 +48,9 @@ class CreateTreeEnsembleVariableOp : public OpKernel {
     if (!result->InitFromSerialized(tree_ensemble_config_t->scalar<string>()(),
                                     stamp_token)) {
       result->Unref();
-      OP_REQUIRES(context, false, errors::InvalidArgument(
-                                      "Unable to parse tree ensemble config."));
+      OP_REQUIRES(
+          context, false,
+          errors::InvalidArgument("Unable to parse tree ensemble config."));
     }
 
     // Only create one, if one does not exist already. Report status for all
diff --git a/tensorflow/contrib/boosted_trees/kernels/prediction_ops.cc b/tensorflow/contrib/boosted_trees/kernels/prediction_ops.cc
index f8086b0c2bb93eae6af0336bbe33fc23f8fcde22..b3fe38614e05801b223f0c96f7a70ce7e432a70b 100644
--- a/tensorflow/contrib/boosted_trees/kernels/prediction_ops.cc
+++ b/tensorflow/contrib/boosted_trees/kernels/prediction_ops.cc
@@ -47,8 +47,8 @@ namespace boosted_trees {
 using boosted_trees::learner::LearnerConfig;
 using boosted_trees::learner::LearningRateConfig;
 using boosted_trees::learner::LearningRateDropoutDrivenConfig;
-using boosted_trees::models::MultipleAdditiveTrees;
 using boosted_trees::models::DecisionTreeEnsembleResource;
+using boosted_trees::models::MultipleAdditiveTrees;
 using boosted_trees::utils::DropoutUtils;
 using boosted_trees::utils::TensorUtils;
 
diff --git a/tensorflow/contrib/boosted_trees/kernels/quantile_ops.cc b/tensorflow/contrib/boosted_trees/kernels/quantile_ops.cc
index 88f30064076d1b9410665e06ca27e20d14c6dde0..0f4c2298f56be48bb32f52d5d44cff8afe284f1e 100644
--- a/tensorflow/contrib/boosted_trees/kernels/quantile_ops.cc
+++ b/tensorflow/contrib/boosted_trees/kernels/quantile_ops.cc
@@ -36,13 +36,12 @@
 namespace tensorflow {
 
 using ::boosted_trees::QuantileConfig;
-using boosted_trees::utils::TensorUtils;
 using boosted_trees::QuantileStreamResource;
+using boosted_trees::utils::TensorUtils;
 
 namespace {
 const char* const kExampleWeightsName = "example_weights";
 const char* const kMaxElementsName = "max_elements";
-const char* const kHandleName = "handle";
 const char* const kNextStampTokenName = "next_stamp_token";
 const char* const kStampTokenName = "stamp_token";
 const char* const kAreBucketsReadyName = "are_buckets_ready";
@@ -52,7 +51,6 @@ const char* const kNumSparseFeaturesName = "num_sparse_features";
 const char* const kSparseBucketsName = "sparse_buckets";
 const char* const kSparseValuesName = "sparse_values";
 const char* const kSparseIndicesName = "sparse_indices";
-const char* const kSparseStreamsStateName = "sparse_streams_state";
 const char* const kSparseSummariesName = "sparse_summaries";
 const char* const kSparseConfigName = "sparse_config";
 const char* const kSparseOutputTensorName = "sparse_quantiles";
@@ -60,7 +58,6 @@ const char* const kSparseOutputTensorName = "sparse_quantiles";
 const char* const kDenseBucketsName = "dense_buckets";
 const char* const kDenseConfigName = "dense_config";
 const char* const kDenseOutputTensorName = "dense_quantiles";
-const char* const kDenseStreamsStateName = "dense_streams_state";
 const char* const kDenseSummariesName = "dense_summaries";
 const char* const kDenseValuesName = "dense_values";
 const char* const kNumDenseFeaturesName = "num_dense_features";
@@ -387,7 +384,7 @@ class MakeQuantileSummariesOp : public OpKernel {
         protobuf::Arena arena;
         ::boosted_trees::QuantileSummaryState* summary_proto =
             protobuf::Arena::CreateMessage<
-            ::boosted_trees::QuantileSummaryState>(&arena);
+                ::boosted_trees::QuantileSummaryState>(&arena);
         const auto& summary = stream.GetFinalSummary();
         CopySummaryToProto(summary, summary_proto);
         // Output to tensor.
diff --git a/tensorflow/contrib/boosted_trees/kernels/split_handler_ops.cc b/tensorflow/contrib/boosted_trees/kernels/split_handler_ops.cc
index 18b4abd654ea3541d646a43ac901aca1a678446f..44a8ffaf4b2f5a9c11b3abc46ce55a18c80ad318 100644
--- a/tensorflow/contrib/boosted_trees/kernels/split_handler_ops.cc
+++ b/tensorflow/contrib/boosted_trees/kernels/split_handler_ops.cc
@@ -34,10 +34,10 @@
 
 namespace tensorflow {
 
+using boosted_trees::learner::LearnerConfig_MultiClassStrategy;
 using boosted_trees::learner::SplitInfo;
 using boosted_trees::learner::stochastic::GradientStats;
 using boosted_trees::learner::stochastic::NodeStats;
-using boosted_trees::learner::LearnerConfig_MultiClassStrategy;
 
 namespace {
 const int32 DUMMY_FEATURE_DIMENSION = -1;
@@ -47,9 +47,8 @@ class BaseBuildSplitOp : public OpKernel {
  public:
   explicit BaseBuildSplitOp(OpKernelConstruction* const context)
       : OpKernel(context) {
-    OP_REQUIRES_OK(
-        context,
-        context->GetAttr("feature_column_group_id", &feature_column_group_id_));
+    OP_REQUIRES_OK(context, context->GetAttr("feature_column_group_id",
+                                             &feature_column_group_id_));
     OP_REQUIRES_OK(context,
                    context->GetAttr("l1_regularization", &l1_regularization_));
     OP_REQUIRES_OK(context,
diff --git a/tensorflow/contrib/boosted_trees/kernels/stats_accumulator_ops.cc b/tensorflow/contrib/boosted_trees/kernels/stats_accumulator_ops.cc
index a9a229c8ae0c26bba5f0a684dad7e546298577bb..90a0655201f8cb8df6fc6417cb51216dec91b4d7 100644
--- a/tensorflow/contrib/boosted_trees/kernels/stats_accumulator_ops.cc
+++ b/tensorflow/contrib/boosted_trees/kernels/stats_accumulator_ops.cc
@@ -134,10 +134,9 @@ void SerializeScalarAccumulatorToOutput(
     OpKernelContext* context) {
   int64 num_slots = accumulator_resource.values().size();
   Tensor* partition_ids_t = nullptr;
-  OP_REQUIRES_OK(
-      context,
-      context->allocate_output("output_partition_ids", TensorShape({num_slots}),
-                               &partition_ids_t));
+  OP_REQUIRES_OK(context, context->allocate_output("output_partition_ids",
+                                                   TensorShape({num_slots}),
+                                                   &partition_ids_t));
   auto partition_ids = partition_ids_t->vec<int32>();
 
   // Feature ids tensor has ids of feature columns and their dimensions.
@@ -149,15 +148,14 @@ void SerializeScalarAccumulatorToOutput(
 
   Tensor* gradients_t = nullptr;
   OP_REQUIRES_OK(
-      context,
-      context->allocate_output("output_gradients", TensorShape({num_slots}),
-                               &gradients_t));
+      context, context->allocate_output(
+                   "output_gradients", TensorShape({num_slots}), &gradients_t));
   auto gradients = gradients_t->vec<float>();
 
   Tensor* hessians_t = nullptr;
-  OP_REQUIRES_OK(context,
-                 context->allocate_output(
-                     "output_hessians", TensorShape({num_slots}), &hessians_t));
+  OP_REQUIRES_OK(
+      context, context->allocate_output("output_hessians",
+                                        TensorShape({num_slots}), &hessians_t));
   auto hessians = hessians_t->vec<float>();
 
   int i = 0;
@@ -177,10 +175,9 @@ void SerializeTensorAccumulatorToOutput(
     OpKernelContext* context) {
   int64 num_slots = accumulator_resource.values().size();
   Tensor* partition_ids_t = nullptr;
-  OP_REQUIRES_OK(
-      context,
-      context->allocate_output("output_partition_ids", TensorShape({num_slots}),
-                               &partition_ids_t));
+  OP_REQUIRES_OK(context, context->allocate_output("output_partition_ids",
+                                                   TensorShape({num_slots}),
+                                                   &partition_ids_t));
   auto partition_ids = partition_ids_t->vec<int32>();
 
   Tensor* feature_ids_t = nullptr;
@@ -202,9 +199,8 @@ void SerializeTensorAccumulatorToOutput(
   int64 num_hessian_elements = hessian_shape.num_elements();
   hessian_shape.InsertDim(0, num_slots);
   Tensor* hessians_t = nullptr;
-  OP_REQUIRES_OK(
-      context,
-      context->allocate_output("output_hessians", hessian_shape, &hessians_t));
+  OP_REQUIRES_OK(context, context->allocate_output("output_hessians",
+                                                   hessian_shape, &hessians_t));
   auto hessians = hessians_t->flat_outer_dims<float>();
 
   int i = 0;
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/common/stats/node-stats_test.cc b/tensorflow/contrib/boosted_trees/lib/learner/common/stats/node-stats_test.cc
index f867e77d3ef0609774628b2a9c36ca52bcf2a957..8bca132acfde9397942b198db9a8d4c0e4d74897 100644
--- a/tensorflow/contrib/boosted_trees/lib/learner/common/stats/node-stats_test.cc
+++ b/tensorflow/contrib/boosted_trees/lib/learner/common/stats/node-stats_test.cc
@@ -17,8 +17,8 @@
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/platform/test.h"
 
-using tensorflow::test::AsTensor;
 using std::vector;
+using tensorflow::test::AsTensor;
 
 namespace tensorflow {
 namespace boosted_trees {
diff --git a/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_stream.h b/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_stream.h
index 1c4181f1b13b01f85833157e554c3b821f96ff90..8ad97fedc923ac50bcaad86e0ba2c2e46df6821b 100644
--- a/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_stream.h
+++ b/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_stream.h
@@ -15,9 +15,9 @@
 #ifndef TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_QUANTILES_WEIGHTED_QUANTILES_STREAM_H_
 #define TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_QUANTILES_WEIGHTED_QUANTILES_STREAM_H_
 
+#include <cmath>
 #include <memory>
 #include <vector>
-#include <cmath>
 
 #include "tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_buffer.h"
 #include "tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_summary.h"
diff --git a/tensorflow/contrib/boosted_trees/lib/testutil/random_tree_gen.cc b/tensorflow/contrib/boosted_trees/lib/testutil/random_tree_gen.cc
index cbe26ba918d384ad903fb854ca3e88e84d16a923..705b65e9db9f1aed9af1be153240d57e163c2d5b 100644
--- a/tensorflow/contrib/boosted_trees/lib/testutil/random_tree_gen.cc
+++ b/tensorflow/contrib/boosted_trees/lib/testutil/random_tree_gen.cc
@@ -22,9 +22,9 @@ namespace tensorflow {
 namespace boosted_trees {
 namespace testutil {
 
+using boosted_trees::trees::DenseFloatBinarySplit;
 using tensorflow::boosted_trees::trees::DecisionTreeConfig;
 using tensorflow::boosted_trees::trees::TreeNode;
-using boosted_trees::trees::DenseFloatBinarySplit;
 
 namespace {
 
diff --git a/tensorflow/contrib/boosted_trees/lib/utils/batch_features_test.cc b/tensorflow/contrib/boosted_trees/lib/utils/batch_features_test.cc
index 9de3e32b097a151b3bd6f5c30df2db0938b65e9c..609519e8b1153a27d987c5f9ca9bfcc9ee6717d6 100644
--- a/tensorflow/contrib/boosted_trees/lib/utils/batch_features_test.cc
+++ b/tensorflow/contrib/boosted_trees/lib/utils/batch_features_test.cc
@@ -25,8 +25,8 @@ namespace boosted_trees {
 namespace utils {
 namespace {
 
-using test::AsTensor;
 using errors::InvalidArgument;
+using test::AsTensor;
 
 class BatchFeaturesTest : public ::testing::Test {};
 
diff --git a/tensorflow/contrib/boosted_trees/lib/utils/dropout_utils.cc b/tensorflow/contrib/boosted_trees/lib/utils/dropout_utils.cc
index 38f0151255bbf4fcd87f1d0d76fd111649ee4a12..db34db998a7442c69f2ab468f4557d991429f4ee 100644
--- a/tensorflow/contrib/boosted_trees/lib/utils/dropout_utils.cc
+++ b/tensorflow/contrib/boosted_trees/lib/utils/dropout_utils.cc
@@ -23,10 +23,10 @@
 #include "tensorflow/core/lib/random/simple_philox.h"
 #include "tensorflow/core/platform/logging.h"
 
+using tensorflow::Status;
 using tensorflow::boosted_trees::learner::LearningRateDropoutDrivenConfig;
 using tensorflow::random::PhiloxRandom;
 using tensorflow::random::SimplePhilox;
-using tensorflow::Status;
 
 namespace tensorflow {
 namespace boosted_trees {
diff --git a/tensorflow/contrib/boosted_trees/lib/utils/dropout_utils_test.cc b/tensorflow/contrib/boosted_trees/lib/utils/dropout_utils_test.cc
index ce7632e58987f5890beaded5dd305724f950e1e8..02f972c8e00e8229426ac53d8f20765484787b6e 100644
--- a/tensorflow/contrib/boosted_trees/lib/utils/dropout_utils_test.cc
+++ b/tensorflow/contrib/boosted_trees/lib/utils/dropout_utils_test.cc
@@ -26,9 +26,9 @@
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/env.h"
 
+using std::unordered_set;
 using tensorflow::boosted_trees::learner::LearningRateDropoutDrivenConfig;
 using tensorflow::boosted_trees::trees::DecisionTreeEnsembleConfig;
-using std::unordered_set;
 
 namespace tensorflow {
 namespace boosted_trees {
diff --git a/tensorflow/contrib/boosted_trees/ops/quantile_ops.cc b/tensorflow/contrib/boosted_trees/ops/quantile_ops.cc
index bb57dcf8ae7475486bcc0fc82460cbbce9a18b68..ae99d53a2cf805d70d60746cd44f73f7fd9dc6e2 100644
--- a/tensorflow/contrib/boosted_trees/ops/quantile_ops.cc
+++ b/tensorflow/contrib/boosted_trees/ops/quantile_ops.cc
@@ -19,8 +19,8 @@
 
 namespace tensorflow {
 namespace boosted_trees {
-using shape_inference::InferenceContext;
 using shape_inference::DimensionHandle;
+using shape_inference::InferenceContext;
 using shape_inference::ShapeHandle;
 
 REGISTER_RESOURCE_HANDLE_OP(QuantileStreamResource);
diff --git a/tensorflow/contrib/boosted_trees/ops/split_handler_ops.cc b/tensorflow/contrib/boosted_trees/ops/split_handler_ops.cc
index 0d27ddaf3a1d540efee268c2bcca217077ff5871..5d0ebbf73ce1272b51a475f67984db3a181b7130 100644
--- a/tensorflow/contrib/boosted_trees/ops/split_handler_ops.cc
+++ b/tensorflow/contrib/boosted_trees/ops/split_handler_ops.cc
@@ -18,9 +18,9 @@
 
 namespace tensorflow {
 
+using shape_inference::DimensionHandle;
 using shape_inference::InferenceContext;
 using shape_inference::ShapeHandle;
-using shape_inference::DimensionHandle;
 
 REGISTER_OP("BuildDenseInequalitySplits")
     .Attr("feature_column_group_id: int")
diff --git a/tensorflow/contrib/boosted_trees/ops/stats_accumulator_ops.cc b/tensorflow/contrib/boosted_trees/ops/stats_accumulator_ops.cc
index 0354f7853cbedf22d0a299273b4dbd225b3121ab..179505eef01f79bb149137400468b84285fe478a 100644
--- a/tensorflow/contrib/boosted_trees/ops/stats_accumulator_ops.cc
+++ b/tensorflow/contrib/boosted_trees/ops/stats_accumulator_ops.cc
@@ -19,9 +19,9 @@
 
 namespace tensorflow {
 namespace boosted_trees {
+using shape_inference::DimensionHandle;
 using shape_inference::InferenceContext;
 using shape_inference::ShapeHandle;
-using shape_inference::DimensionHandle;
 
 REGISTER_RESOURCE_HANDLE_OP(StatsAccumulatorScalarResource);
 
diff --git a/tensorflow/contrib/boosted_trees/python/kernel_tests/quantile_ops_test.py b/tensorflow/contrib/boosted_trees/python/kernel_tests/quantile_ops_test.py
index eefa7ef0dccf5e88099974302dd26eebe21b1bd2..81f58de28cbe98bb996c6665114eeb0030ee52f9 100644
--- a/tensorflow/contrib/boosted_trees/python/kernel_tests/quantile_ops_test.py
+++ b/tensorflow/contrib/boosted_trees/python/kernel_tests/quantile_ops_test.py
@@ -183,11 +183,10 @@ class QuantileBucketsOpTest(test_util.TensorFlowTestCase):
       self.assertEqual(num_quantiles + 1, len(buckets))
       self.assertAllEqual([2030, 2040, 2050, 2060], buckets)
 
-  def _testStreamingQuantileBucketsHelper(self, inputs):
+  def _testStreamingQuantileBucketsHelper(
+      self, inputs, num_quantiles=3, expected_buckets=None):
     """Helper to test quantile buckets on different inputs."""
 
-    # Use 3 quantiles, 4 boundaries for simplicity.
-    num_quantiles = 3
     # set generate_quantiles to True since the test will generate fewer
     # boundaries otherwise.
     with self.test_session() as sess:
@@ -213,7 +212,10 @@ class QuantileBucketsOpTest(test_util.TensorFlowTestCase):
       buckets, are_ready_flush = (sess.run(
           [buckets, are_ready_flush]))
       self.assertEqual(True, are_ready_flush)
+      # By default, use 3 quantiles, 4 boundaries for simplicity.
       self.assertEqual(num_quantiles + 1, len(buckets))
+      if expected_buckets:
+        self.assertAllEqual(buckets, expected_buckets)
 
   def testStreamingQuantileBucketsRepeatedSingleValue(self):
     inputs = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
@@ -231,6 +233,28 @@ class QuantileBucketsOpTest(test_util.TensorFlowTestCase):
     inputs = [5]
     self._testStreamingQuantileBucketsHelper(inputs)
 
+  def testStreamingQuantileBucketsEqualDistributionInSequence(self):
+    # Input pattern is of the form [1, 1, 1, 2, 2, 2, 3, 3, 3, ...]
+    ones = 100 * [1]
+    inputs = []
+    for i in range(1, 101):
+      inputs += [i * k for k in ones]
+    # Expect 100 equally spaced buckets.
+    expected_buckets = range(1, 101)
+    self._testStreamingQuantileBucketsHelper(
+        inputs, num_quantiles=99, expected_buckets=expected_buckets)
+
+  def testStreamingQuantileBucketsEqualDistributionInterleaved(self):
+    # Input pattern is of the form [1, 2, 3, 1, 2, 3, 1, 2, 3, ...]
+    sequence = range(1, 101)
+    inputs = []
+    for _ in range(1, 101):
+      inputs += sequence
+    # Expect 100 equally spaced buckets.
+    expected_buckets = range(1, 101)
+    self._testStreamingQuantileBucketsHelper(
+        inputs, num_quantiles=99, expected_buckets=expected_buckets)
+
   def testStreamingQuantileBuckets(self):
     """Sets up the quantile summary op test as follows.
 
diff --git a/tensorflow/contrib/boosted_trees/python/ops/batch_ops_utils.py b/tensorflow/contrib/boosted_trees/python/ops/batch_ops_utils.py
index 23168bf4935e92bcb5072348361ae04861641b6d..7a5f329b7ab3216972180ccbb4c85f2537175422 100644
--- a/tensorflow/contrib/boosted_trees/python/ops/batch_ops_utils.py
+++ b/tensorflow/contrib/boosted_trees/python/ops/batch_ops_utils.py
@@ -81,32 +81,32 @@ def _scheduled_stamp_resource_op_runner(batch, stamp):
   if not batch:
     return
   arg_keys = set(batch[0].args.keys())
-  grouped_args = collections.defaultdict(list)
+  grouped_args = collections.OrderedDict()
   resource_handles = []
   # Check that the set of arguments is the same across all the scheduled ops.
   for op in batch:
     if set(op.args.keys()) != arg_keys:
       raise ValueError("Mismatching arguments: %s, %s.", op.args, arg_keys)
     for key in arg_keys:
-      grouped_args[key].append(op.args[key])
+      grouped_args.setdefault(key, []).append(op.args[key])
     resource_handles.append(op.resource_handle)
   # Move all the inputs to the op device in one RPC.
-  grouped_args = {
-      k: _move_tensors(v, resource_handles[0].device)
-      for k, v in grouped_args.items()
-  }
+  grouped_args = collections.OrderedDict(
+      (k, _move_tensors(v, resource_handles[0].device))
+      for k, v in sorted(grouped_args.items()))
   with ops.device(resource_handles[0].device):
     return batch[0].op(resource_handles, stamp, **grouped_args)
 
 
 def run_handler_scheduled_ops(per_handler_ops, stamp, worker_device):
   """Given a dictionary of ops for each handler, runs them in batch."""
-  batched_ops = collections.defaultdict(list)
+  batched_ops = collections.OrderedDict()
   # Group the ops by their batching_key. Ops that share the same batching key
   # can be executed together.
   for handler in per_handler_ops.keys():
     for op in per_handler_ops[handler]:
-      batched_ops[(op.batching_key(), op.batch_runner_fn())].append(op)
+      key = (op.batching_key(), op.batch_runner_fn())
+      batched_ops.setdefault(key, []).append(op)
   op_results = {}
   for batch in batched_ops.values():
     # Run each of the batched ops using its runner.
diff --git a/tensorflow/contrib/boosted_trees/python/training/__init__.py b/tensorflow/contrib/boosted_trees/python/training/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b569ac5fdb60e0907c322ad73aca65645e548d94
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/python/training/__init__.py
@@ -0,0 +1,18 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""training module under boosted_trees."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
diff --git a/tensorflow/contrib/boosted_trees/python/training/functions/__init__.py b/tensorflow/contrib/boosted_trees/python/training/functions/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1750117cd7c311515b4bca6882d55f496daac0e
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/python/training/functions/__init__.py
@@ -0,0 +1,18 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""functions module under boosted_trees."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
diff --git a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py
index b95956dae2a62b28643cd31815c5f5650eca337b..f0b66dcbbe1c5167b9993e66b30b1dc8a839c380 100644
--- a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py
+++ b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import collections
 import copy
 
 from tensorflow.contrib import learn
@@ -163,7 +164,7 @@ def extract_features(features, feature_columns):
     scope = "gbdt"
     with variable_scope.variable_scope(scope):
       feature_columns = list(feature_columns)
-      transformed_features = {}
+      transformed_features = collections.OrderedDict()
       for fc in feature_columns:
         # pylint: disable=protected-access
         if isinstance(fc, feature_column_lib._EmbeddingColumn):
@@ -681,13 +682,13 @@ class GradientBoostedDecisionTreeModel(object):
                               control_flow_ops.no_op))
 
     # Update handler stats.
-    handler_reads = {}
+    handler_reads = collections.OrderedDict()
     for handler in handlers:
       handler_reads[handler] = handler.scheduled_reads()
 
     handler_results = batch_ops_utils.run_handler_scheduled_ops(
         handler_reads, ensemble_stamp, worker_device)
-    per_handler_updates = {}
+    per_handler_updates = collections.OrderedDict()
     # Two values per handler. First one is if the handler is active for the
     # current layer. The second one is if the handler is going to be active
     # for the next layer.
diff --git a/tensorflow/contrib/boosted_trees/python/utils/__init__.py b/tensorflow/contrib/boosted_trees/python/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..6ceb150c26552584d631948f5eef2fedfa690894
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/python/utils/__init__.py
@@ -0,0 +1,18 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""utils module under boosted_trees."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
diff --git a/tensorflow/contrib/cloud/kernels/bigquery_table_accessor.cc b/tensorflow/contrib/cloud/kernels/bigquery_table_accessor.cc
index deb324634b6edc17c9725996115d80c5bd11cbde..1bfd27305d569668a0bd67d876e59eec082296b3 100644
--- a/tensorflow/contrib/cloud/kernels/bigquery_table_accessor.cc
+++ b/tensorflow/contrib/cloud/kernels/bigquery_table_accessor.cc
@@ -18,7 +18,6 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/numbers.h"
 
 namespace tensorflow {
-
 namespace {
 
 constexpr size_t kBufferSize = 1024 * 1024;  // In bytes.
@@ -40,33 +39,6 @@ Status ParseJson(StringPiece json, Json::Value* result) {
   return Status::OK();
 }
 
-string ColumnTypeToString(BigQueryTableAccessor::ColumnType enum_type) {
-  switch (enum_type) {
-    case BigQueryTableAccessor::ColumnType::kRecord:
-      return "RECORD";
-    case BigQueryTableAccessor::ColumnType::kString:
-      return "STRING";
-    case BigQueryTableAccessor::ColumnType::kBytes:
-      return "BYTES";
-    case BigQueryTableAccessor::ColumnType::kInteger:
-      return "INTEGER";
-    case BigQueryTableAccessor::ColumnType::kFloat:
-      return "FLOAT";
-    case BigQueryTableAccessor::ColumnType::kBoolean:
-      return "BOOLEAN";
-    case BigQueryTableAccessor::ColumnType::kTimestamp:
-      return "TIMESTAMP";
-    case BigQueryTableAccessor::ColumnType::kDate:
-      return "DATE";
-    case BigQueryTableAccessor::ColumnType::kTime:
-      return "TIME";
-    case BigQueryTableAccessor::ColumnType::kDatetime:
-      return "DATETIME";
-    case BigQueryTableAccessor::ColumnType::kNone:
-      return "NONE";
-  }
-}
-
 Status ParseColumnType(const string& type,
                        BigQueryTableAccessor::ColumnType* enum_type) {
   if (type == "RECORD") {
diff --git a/tensorflow/contrib/cloud/kernels/bigquery_table_accessor_test_data.h b/tensorflow/contrib/cloud/kernels/bigquery_table_accessor_test_data.h
index 59f23332983e2328286d3b1b8b8c8fa228be991e..fea6b15640ded74432f35112bc5d5d68e641c9dc 100644
--- a/tensorflow/contrib/cloud/kernels/bigquery_table_accessor_test_data.h
+++ b/tensorflow/contrib/cloud/kernels/bigquery_table_accessor_test_data.h
@@ -399,6 +399,6 @@ const string kTestEmptyRow = R"({
     }]}]})";
 
 }  // namespace
-}  // namepsace tensorflow
+}  // namespace tensorflow
 
 #endif  // TENSORFLOW_CORE_KERNELS_CLOUD_BIGQUERY_TABLE_ACCESSOR_TEST_DATA_H_
diff --git a/tensorflow/contrib/cluster_resolver/BUILD b/tensorflow/contrib/cluster_resolver/BUILD
index 15abd2be0385eb776ff4f76484133efb6e34f076..80e18a43a71cc9d6c9e2ccf5836e50c6427a30f6 100644
--- a/tensorflow/contrib/cluster_resolver/BUILD
+++ b/tensorflow/contrib/cluster_resolver/BUILD
@@ -34,6 +34,7 @@ py_library(
         ":cluster_resolver_py",
         ":gce_cluster_resolver_py",
         ":tpu_cluster_resolver_py",
+        "//tensorflow/python:util",
     ],
 )
 
diff --git a/tensorflow/contrib/cluster_resolver/__init__.py b/tensorflow/contrib/cluster_resolver/__init__.py
index d17501e87e79158b1602ac6ddecc091bd86f2c2d..b4d8cd4a7cf42e910e7506dbeec8656a2cef62eb 100644
--- a/tensorflow/contrib/cluster_resolver/__init__.py
+++ b/tensorflow/contrib/cluster_resolver/__init__.py
@@ -26,3 +26,15 @@ from tensorflow.contrib.cluster_resolver.python.training.cluster_resolver import
 from tensorflow.contrib.cluster_resolver.python.training.gce_cluster_resolver import GceClusterResolver
 from tensorflow.contrib.cluster_resolver.python.training.tpu_cluster_resolver import TPUClusterResolver
 # pylint: enable=wildcard-import,unused-import
+
+from tensorflow.python.util.all_util import remove_undocumented
+
+_allowed_symbols = [
+    'ClusterResolver',
+    'SimpleClusterResolver',
+    'UnionClusterResolver',
+    'GceClusterResolver',
+    'TPUClusterResolver',
+]
+
+remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/cmake/CMakeLists.txt b/tensorflow/contrib/cmake/CMakeLists.txt
index 817e96f5da0e7512a9fd99cc9a4b4c6025d7dd68..12bfd3c62b49c7f9ef40c84abf25268ceafcdbdc 100644
--- a/tensorflow/contrib/cmake/CMakeLists.txt
+++ b/tensorflow/contrib/cmake/CMakeLists.txt
@@ -134,6 +134,9 @@ if(WIN32)
   set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /D_ITERATOR_DEBUG_LEVEL=0")
   set(CMAKE_CXX_FLAGS_MINSIZEREL "${CMAKE_CXX_FLAGS_MINSIZEREL} /D_ITERATOR_DEBUG_LEVEL=0")
   set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} /D_ITERATOR_DEBUG_LEVEL=0")
+
+  # Try to avoid flaky failures due to failed generation of generate.stamp files.
+  set(CMAKE_SUPPRESS_REGENERATION ON)
 endif()
 
 if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
diff --git a/tensorflow/contrib/cmake/external/protobuf.cmake b/tensorflow/contrib/cmake/external/protobuf.cmake
index aedb793d2aef4bf6950cd074cd065909667eaf75..fd05fa6d47209edd825b6a97aa0b77b3f9cb8ee1 100644
--- a/tensorflow/contrib/cmake/external/protobuf.cmake
+++ b/tensorflow/contrib/cmake/external/protobuf.cmake
@@ -16,7 +16,7 @@ include (ExternalProject)
 
 set(PROTOBUF_INCLUDE_DIRS ${CMAKE_CURRENT_BINARY_DIR}/protobuf/src/protobuf/src)
 set(PROTOBUF_URL https://github.com/google/protobuf.git)
-set(PROTOBUF_TAG b04e5cba356212e4e8c66c61bbe0c3a20537c5b9)
+set(PROTOBUF_TAG 396336eb961b75f03b25824fe86cf6490fb75e3a)
 
 if(WIN32)
   set(protobuf_STATIC_LIBRARIES 
diff --git a/tensorflow/contrib/cmake/python_modules.txt b/tensorflow/contrib/cmake/python_modules.txt
index 7db454bd83ec7fee463b8cd448f5a5ff4ba73258..a7938f1f0752a3e50ebdb18fbd81ed797bb037d7 100644
--- a/tensorflow/contrib/cmake/python_modules.txt
+++ b/tensorflow/contrib/cmake/python_modules.txt
@@ -6,6 +6,7 @@ tensorflow/core/example
 tensorflow/core/framework
 tensorflow/core/lib
 tensorflow/core/lib/core
+tensorflow/core/profiler
 tensorflow/core/protobuf
 tensorflow/core/util
 tensorflow/examples
@@ -33,9 +34,11 @@ tensorflow/python/grappler
 tensorflow/python/keras
 tensorflow/python/keras/activations
 tensorflow/python/keras/applications
+tensorflow/python/keras/applications/densenet
 tensorflow/python/keras/applications/inception_resnet_v2
 tensorflow/python/keras/applications/inception_v3
 tensorflow/python/keras/applications/mobilenet
+tensorflow/python/keras/applications/nasnet
 tensorflow/python/keras/applications/resnet50
 tensorflow/python/keras/applications/vgg16
 tensorflow/python/keras/applications/vgg19
@@ -214,6 +217,8 @@ tensorflow/contrib/input_pipeline/python/ops
 tensorflow/contrib/integrate
 tensorflow/contrib/integrate/python
 tensorflow/contrib/integrate/python/ops
+tensorflow/contrib/kafka/python
+tensorflow/contrib/kafka/python/ops
 tensorflow/contrib/keras
 tensorflow/contrib/keras/api
 tensorflow/contrib/keras/api/keras
diff --git a/tensorflow/contrib/cmake/tf_core_ops.cmake b/tensorflow/contrib/cmake/tf_core_ops.cmake
index 138993db35252d3f1ab6326dff463bdc10cabdb1..c42bc35ce7e75c72a8719f6730f2f8b63b35febb 100644
--- a/tensorflow/contrib/cmake/tf_core_ops.cmake
+++ b/tensorflow/contrib/cmake/tf_core_ops.cmake
@@ -30,6 +30,7 @@ set(tf_op_lib_names
 		"list_ops"
     "lookup_ops"
     "logging_ops"
+    "manip_ops"
     "math_ops"
     "nn_ops"
     "no_op"
diff --git a/tensorflow/contrib/cmake/tf_python.cmake b/tensorflow/contrib/cmake/tf_python.cmake
index 8862390d2b62f72c11d60f2ae48a845d22363f06..b7c816c24f82c7747f53b4c127866e6008085ef3 100755
--- a/tensorflow/contrib/cmake/tf_python.cmake
+++ b/tensorflow/contrib/cmake/tf_python.cmake
@@ -335,6 +335,7 @@ GENERATE_PYTHON_OP_LIB("list_ops")
 GENERATE_PYTHON_OP_LIB("logging_ops")
 GENERATE_PYTHON_OP_LIB("lookup_ops")
 GENERATE_PYTHON_OP_LIB("nn_ops")
+GENERATE_PYTHON_OP_LIB("manip_ops")
 GENERATE_PYTHON_OP_LIB("parsing_ops")
 GENERATE_PYTHON_OP_LIB("random_ops")
 GENERATE_PYTHON_OP_LIB("remote_fused_graph_ops"
diff --git a/tensorflow/contrib/cmake/tools/create_def_file.py b/tensorflow/contrib/cmake/tools/create_def_file.py
index f67698eb99a38eae307b52e55de748a67b798cbd..77ea914380dfa4f3ec903e9fc7062d429f8c0d6f 100644
--- a/tensorflow/contrib/cmake/tools/create_def_file.py
+++ b/tensorflow/contrib/cmake/tools/create_def_file.py
@@ -31,6 +31,7 @@ from __future__ import division
 from __future__ import print_function
 
 import argparse
+import codecs
 import io
 import os
 import re
@@ -103,7 +104,7 @@ def main():
   for lib_path in args.input:
     proc = subprocess.Popen([DUMPBIN, "/nologo", "/linkermember:1", lib_path],
                             stdout=subprocess.PIPE)
-    for line in io.TextIOWrapper(proc.stdout, encoding="utf-8"):
+    for line in codecs.getreader("utf-8")(proc.stdout):
       cols = line.split()
       if len(cols) < 2:
         continue
@@ -131,7 +132,7 @@ def main():
     # We compare on undname but use the decorated name from candidates.
     dupes = 0
     proc = subprocess.Popen([UNDNAME, tmpfile.name], stdout=subprocess.PIPE)
-    for idx, line in enumerate(io.TextIOWrapper(proc.stdout, encoding="utf-8")):
+    for idx, line in enumerate(codecs.getreader("utf-8")(proc.stdout)):
       decorated = candidates[idx]
       if decorated in taken:
         # Symbol is already in output, done.
diff --git a/tensorflow/contrib/coder/README.md b/tensorflow/contrib/coder/README.md
index e1e867db5aa701eb73ee43a47cd3dcc2dc783a04..c6c379c458893551b765327c0c1cbfff7f24f9c3 100644
--- a/tensorflow/contrib/coder/README.md
+++ b/tensorflow/contrib/coder/README.md
@@ -30,7 +30,7 @@ following sense:
   around,
 - The number of CDF axes does not extend, i.e., `CDF.ndim == data.ndim + 1`.
 
-In the previous example where data has shape (10, 10), the followings are
+In the previous example where data has shape (10, 10), the following are
 acceptable CDF shapes:
 
 - (10, 10, 65)
diff --git a/tensorflow/contrib/coder/kernels/range_coder.cc b/tensorflow/contrib/coder/kernels/range_coder.cc
index f4f076b6c4e0c82cc297266bedc63034d5f5bf8b..21b35155ff317c6afbb1b86745f05385726505b6 100644
--- a/tensorflow/contrib/coder/kernels/range_coder.cc
+++ b/tensorflow/contrib/coder/kernels/range_coder.cc
@@ -276,7 +276,7 @@ void RangeEncoder::Finalize(string* sink) {
     }
   } else if (base_ != 0) {
     // If base == 0, then pick 0 from [base, base + size) and no zeros are
-    // explcitly written.
+    // explicitly written.
     //
     // Otherwise, pick (base + (2^16 - base[16:0])), i.e., round up base to the
     // next multiple of 2^16. As 2^16 < size, this value should be in the
diff --git a/tensorflow/contrib/compiler/jit_test.py b/tensorflow/contrib/compiler/jit_test.py
index 2108e42bce4eba1eed158fe85888f1699a69ba7e..29a593f6bcfa05dcafcdb2f94087380ad720dba1 100644
--- a/tensorflow/contrib/compiler/jit_test.py
+++ b/tensorflow/contrib/compiler/jit_test.py
@@ -24,6 +24,7 @@ from tensorflow.python.framework import function
 from tensorflow.python.framework import op_def_registry
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import gradients
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
@@ -169,6 +170,7 @@ class JITTest(test.TestCase):
       self.assertEqual(b"jit_scope_0", func_attrs["_XlaScope"].s)
 
 
+@test_util.with_c_api
 class CompilationEnabledInGradientTest(test.TestCase):
 
   def testCompilationInGradient(self):
@@ -188,7 +190,7 @@ class CompilationEnabledInGradientTest(test.TestCase):
       for cg in c_grad_ops:
         self.assertTrue(cg.get_attr("_XlaCompile"))
       for ncg in nc_grad_ops:
-        with self.assertRaisesRegexp(ValueError, "No attr named"):
+        with self.assertRaisesRegexp(ValueError, "[Nn]o attr named"):
           ncg.get_attr("_XlaCompile")
 
       # d/dx (x ** 4) = 4 * (x ** 3)
diff --git a/tensorflow/contrib/copy_graph/python/util/copy_elements.py b/tensorflow/contrib/copy_graph/python/util/copy_elements.py
index bae66ffd4289308f2cbfc730ec50d057b13923fb..b806799202bff4f2f6dbf717fbeea74a04b8cd6e 100644
--- a/tensorflow/contrib/copy_graph/python/util/copy_elements.py
+++ b/tensorflow/contrib/copy_graph/python/util/copy_elements.py
@@ -35,10 +35,10 @@ from tensorflow.python.ops.variables import Variable
 from tensorflow.python.client.session import Session
 from tensorflow.python.framework import ops
 
-__all__ = ["copy_op_to_graph", "copy_variable_to_graph", "get_copied_op"]
+__all__ = ['copy_op_to_graph', 'copy_variable_to_graph', 'get_copied_op']
 
 
-def copy_variable_to_graph(org_instance, to_graph, scope=""):
+def copy_variable_to_graph(org_instance, to_graph, scope=''):
   """Given a `Variable` instance from one `Graph`, initializes and returns
   a copy of it from another `Graph`, under the specified scope
   (default `""`).
@@ -56,12 +56,11 @@ def copy_variable_to_graph(org_instance, to_graph, scope=""):
   """
 
   if not isinstance(org_instance, Variable):
-    raise TypeError(str(org_instance) + " is not a Variable")
+    raise TypeError(str(org_instance) + ' is not a Variable')
 
   #The name of the new variable
-  if scope != "":
-    new_name = (scope + '/' +
-                org_instance.name[:org_instance.name.index(':')])
+  if scope != '':
+    new_name = (scope + '/' + org_instance.name[:org_instance.name.index(':')])
   else:
     new_name = org_instance.name[:org_instance.name.index(':')]
 
@@ -73,15 +72,15 @@ def copy_variable_to_graph(org_instance, to_graph, scope=""):
   for name, collection in org_instance.graph._collections.items():
     if org_instance in collection:
       if (name == ops.GraphKeys.GLOBAL_VARIABLES or
-          name == ops.GraphKeys.TRAINABLE_VARIABLES or
-          scope == ''):
+          name == ops.GraphKeys.TRAINABLE_VARIABLES or scope == ''):
         collections.append(name)
       else:
         collections.append(scope + '/' + name)
 
   #See if its trainable.
-  trainable = (org_instance in org_instance.graph.get_collection(
-      ops.GraphKeys.TRAINABLE_VARIABLES))
+  trainable = (
+      org_instance in org_instance.graph.get_collection(
+          ops.GraphKeys.TRAINABLE_VARIABLES))
   #Get the initial value
   with org_instance.graph.as_default():
     temp_session = Session()
@@ -89,17 +88,17 @@ def copy_variable_to_graph(org_instance, to_graph, scope=""):
 
   #Initialize the new variable
   with to_graph.as_default():
-    new_var = Variable(init_value,
-                       trainable,
-                       name=new_name,
-                       collections=collections,
-                       validate_shape=False)
+    new_var = Variable(
+        init_value,
+        trainable,
+        name=new_name,
+        collections=collections,
+        validate_shape=False)
 
   return new_var
 
 
-def copy_op_to_graph(org_instance, to_graph, variables,
-                     scope=""):
+def copy_op_to_graph(org_instance, to_graph, variables, scope=''):
   """Returns a copy of an operation from another Graph under a specified scope.
 
   Given an `Operation` `org_instance` from one `Graph`,
@@ -139,14 +138,12 @@ def copy_op_to_graph(org_instance, to_graph, variables,
   #If a variable by the new name already exists, return the
   #correspondng tensor that will act as an input
   if new_name in copied_variables:
-    return to_graph.get_tensor_by_name(
-        copied_variables[new_name].name)
+    return to_graph.get_tensor_by_name(copied_variables[new_name].name)
 
   #If an instance of the same name exists, return appropriately
   try:
-    already_present = to_graph.as_graph_element(new_name,
-                                                allow_tensor=True,
-                                                allow_operation=True)
+    already_present = to_graph.as_graph_element(
+        new_name, allow_tensor=True, allow_operation=True)
     return already_present
   except:
     pass
@@ -184,20 +181,21 @@ def copy_op_to_graph(org_instance, to_graph, variables,
 
     #If it has an original_op parameter, copy it
     if op._original_op is not None:
-      new_original_op = copy_op_to_graph(op._original_op, to_graph,
-                                      variables, scope)
+      new_original_op = copy_op_to_graph(op._original_op, to_graph, variables,
+                                         scope)
     else:
       new_original_op = None
 
     #If it has control inputs, call this function recursively on each.
-    new_control_inputs = [copy_op_to_graph(x, to_graph, variables,
-                                        scope)
-                          for x in op.control_inputs]
+    new_control_inputs = [
+        copy_op_to_graph(x, to_graph, variables, scope)
+        for x in op.control_inputs
+    ]
 
     #If it has inputs, call this function recursively on each.
-    new_inputs = [copy_op_to_graph(x, to_graph, variables,
-                                scope)
-                  for x in op.inputs]
+    new_inputs = [
+        copy_op_to_graph(x, to_graph, variables, scope) for x in op.inputs
+    ]
 
     #Make a new node_def based on that of the original.
     #An instance of tensorflow.core.framework.node_def_pb2.NodeDef, it
@@ -216,13 +214,8 @@ def copy_op_to_graph(org_instance, to_graph, variables,
     op_def = deepcopy(op._op_def)
 
     #Initialize a new Operation instance
-    new_op = ops.Operation(new_node_def,
-                           to_graph,
-                           new_inputs,
-                           output_types,
-                           new_control_inputs,
-                           input_types,
-                           new_original_op,
+    new_op = ops.Operation(new_node_def, to_graph, new_inputs, output_types,
+                           new_control_inputs, input_types, new_original_op,
                            op_def)
     #Use Graph's hidden methods to add the op
     to_graph._add_op(new_op)  # pylint: disable=protected-access
@@ -233,10 +226,10 @@ def copy_op_to_graph(org_instance, to_graph, variables,
     return new_op
 
   else:
-    raise TypeError("Could not copy instance: " + str(org_instance))
+    raise TypeError('Could not copy instance: ' + str(org_instance))
 
 
-def get_copied_op(org_instance, graph, scope=""):
+def get_copied_op(org_instance, graph, scope=''):
   """Given an `Operation` instance from some `Graph`, returns
   its namesake from `graph`, under the specified scope
   (default `""`).
@@ -259,5 +252,5 @@ def get_copied_op(org_instance, graph, scope=""):
   else:
     new_name = org_instance.name
 
-  return graph.as_graph_element(new_name, allow_tensor=True,
-                                allow_operation=True)
+  return graph.as_graph_element(
+      new_name, allow_tensor=True, allow_operation=True)
diff --git a/tensorflow/contrib/cudnn_rnn/ops/cudnn_rnn_ops.cc b/tensorflow/contrib/cudnn_rnn/ops/cudnn_rnn_ops.cc
index 9e41e67857101534e8bfef8d5d0b8a45ed8f1f76..1a79bf066c3a27e040099729fb079ee963f59270 100644
--- a/tensorflow/contrib/cudnn_rnn/ops/cudnn_rnn_ops.cc
+++ b/tensorflow/contrib/cudnn_rnn/ops/cudnn_rnn_ops.cc
@@ -251,9 +251,8 @@ REGISTER_OP("CudnnRNNParamsToCanonical")
       TF_RETURN_IF_ERROR(c->GetAttr("num_params", &num_params));
       // Set shape for weight matrices
       for (int i = 0; i < num_params; i++) {
-        c->set_output(i,
-                      c->Matrix(InferenceContext::kUnknownDim,
-                                InferenceContext::kUnknownDim));
+        c->set_output(i, c->Matrix(InferenceContext::kUnknownDim,
+                                   InferenceContext::kUnknownDim));
       }
       // Set shape for bias vectors
       for (int i = 0; i < num_params; i++) {
@@ -300,6 +299,7 @@ upcoming training or inferences.
 num_params: number of parameter sets for all layers.
     Each layer may contain multiple parameter sets, with each set consisting of
     a weight matrix and a bias vector.
-)doc", kCudnnRNNCommonAttrs));
+)doc",
+                         kCudnnRNNCommonAttrs));
 
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_benchmark.py b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_benchmark.py
index 4fc5ff1bd1887c4532e95fcf0e791d72b20471b0..56c562a3bad1f9e8a55a25eb3273fb2c9fbdd4b5 100644
--- a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_benchmark.py
+++ b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_benchmark.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 import time
 
+from six.moves import xrange
 from tensorflow.contrib import rnn as contrib_rnn
 from tensorflow.contrib.cudnn_rnn.python.ops import cudnn_rnn_ops
 from tensorflow.contrib.rnn.python.ops import lstm_ops
diff --git a/tensorflow/contrib/data/python/kernel_tests/BUILD b/tensorflow/contrib/data/python/kernel_tests/BUILD
index 1cf0202fd88951ffcc611af39fa0915110c4d819..04a21f2b0f5b508fcd2e6bafa6a6ebdb555329b4 100644
--- a/tensorflow/contrib/data/python/kernel_tests/BUILD
+++ b/tensorflow/contrib/data/python/kernel_tests/BUILD
@@ -126,6 +126,7 @@ py_library(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:errors",
         "//tensorflow/python:framework_ops",
+        "//tensorflow/python:lookup_ops",
         "//tensorflow/python:platform",
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:training",
diff --git a/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py
index 015f69c5673f185c53e61a5df2636333699ae203..0c2827b1e49919d236aeb922645236251f1344e0 100644
--- a/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py
@@ -744,6 +744,23 @@ class BatchDatasetSerializationTest(
                         lambda: self._build_dataset_dense_to_sparse(diff_comp),
                         num_outputs)
 
+  def _sparse(self, i):
+    return sparse_tensor.SparseTensorValue(
+        indices=[[0]], values=(i * [1]), dense_shape=[1])
+
+  def _build_dataset_sparse(self, batch_size=5):
+    return dataset_ops.Dataset.range(10).map(self._sparse).batch(batch_size)
+
+  def testSparseCore(self):
+    self.run_core_tests(self._build_dataset_sparse,
+                        lambda: self._build_dataset_sparse(2), 2)
+
+  def _build_dataset_nested_sparse(self):
+    return dataset_ops.Dataset.range(10).map(self._sparse).batch(5).batch(2)
+
+  def testNestedSparseCore(self):
+    self.run_core_tests(self._build_dataset_nested_sparse, None, 1)
+
 
 class PaddedBatchDatasetSerializationTest(
     dataset_serialization_test_base.DatasetSerializationTestBase):
diff --git a/tensorflow/contrib/data/python/kernel_tests/bucketing_test.py b/tensorflow/contrib/data/python/kernel_tests/bucketing_test.py
index 4d984bb4d76e52c4200ae471550dcf48668c5f89..6de93059d8cfbcf43c4e359b6973c33f22fff5fa 100644
--- a/tensorflow/contrib/data/python/kernel_tests/bucketing_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/bucketing_test.py
@@ -41,8 +41,7 @@ class GroupByWindowTest(test.TestCase):
         dataset_ops.Dataset.from_tensor_slices(components).map(lambda x: x * x)
         .apply(
             grouping.group_by_window(lambda x: x % 2, lambda _, xs: xs.batch(4),
-                                     4))
-        .make_initializable_iterator())
+                                     4)).make_initializable_iterator())
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
@@ -53,7 +52,8 @@ class GroupByWindowTest(test.TestCase):
         while True:
           result = sess.run(get_next)
           self.assertTrue(
-              all(x % 2 == 0 for x in result) or all(x % 2 == 1)
+              all(x % 2 == 0
+                  for x in result) or all(x % 2 == 1)
               for x in result)
           counts.append(result.shape[0])
 
@@ -116,8 +116,8 @@ class GroupByWindowTest(test.TestCase):
     iterator = (
         dataset_ops.Dataset.from_tensor_slices(components)
         .map(lambda x: (x, ops.convert_to_tensor([x * x]))).apply(
-            grouping.group_by_window(lambda x, _: x % 2, reduce_func, 32))
-        .make_initializable_iterator())
+            grouping.group_by_window(lambda x, _: x % 2, reduce_func,
+                                     32)).make_initializable_iterator())
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
@@ -136,7 +136,8 @@ class GroupByWindowTest(test.TestCase):
           window.padded_batch(
               4, padded_shapes=tensor_shape.TensorShape([None])),
           window.padded_batch(
-              4, padded_shapes=ops.convert_to_tensor([(key + 1) * 10])),))
+              4, padded_shapes=ops.convert_to_tensor([(key + 1) * 10])),
+      ))
 
     iterator = (
         dataset_ops.Dataset.from_tensor_slices(components)
@@ -200,9 +201,10 @@ class BucketTest(test.TestCase):
     # dynamically and does not rely on static shape information about
     # the arguments.
     return dataset_ops.Dataset.zip(
-        (dataset_ops.Dataset.from_tensors(bucket), window.padded_batch(
-            32, (tensor_shape.TensorShape([]), tensor_shape.TensorShape([None]),
-                 tensor_shape.TensorShape([3])))))
+        (dataset_ops.Dataset.from_tensors(bucket),
+         window.padded_batch(
+             32, (tensor_shape.TensorShape([]), tensor_shape.TensorShape(
+                 [None]), tensor_shape.TensorShape([3])))))
 
   def testSingleBucket(self):
 
@@ -307,12 +309,13 @@ class BucketTest(test.TestCase):
 
     def _dynamic_pad_fn(bucket, window, _):
       return dataset_ops.Dataset.zip(
-          (dataset_ops.Dataset.from_tensors(bucket), window.padded_batch(
-              32, {
-                  "x": tensor_shape.TensorShape([]),
-                  "y": tensor_shape.TensorShape([None]),
-                  "z": tensor_shape.TensorShape([3])
-              })))
+          (dataset_ops.Dataset.from_tensors(bucket),
+           window.padded_batch(
+               32, {
+                   "x": tensor_shape.TensorShape([]),
+                   "y": tensor_shape.TensorShape([None]),
+                   "z": tensor_shape.TensorShape([3])
+               })))
 
     input_dataset = (
         dataset_ops.Dataset.from_tensor_slices(math_ops.range(128)).map(_map_fn)
diff --git a/tensorflow/contrib/data/python/kernel_tests/dataset_serialization_test_base.py b/tensorflow/contrib/data/python/kernel_tests/dataset_serialization_test_base.py
index 7cde6e05b244773966fd7c1bd4ca1e95abf7fd5e..dbc35097ddda9f0375060d43aeb43efa8107f929 100644
--- a/tensorflow/contrib/data/python/kernel_tests/dataset_serialization_test_base.py
+++ b/tensorflow/contrib/data/python/kernel_tests/dataset_serialization_test_base.py
@@ -24,9 +24,11 @@ import numpy as np
 
 from tensorflow.contrib.data.python.ops import iterator_ops as contrib_iterator_ops
 from tensorflow.python.data.ops import iterator_ops
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.ops import lookup_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
@@ -34,14 +36,29 @@ from tensorflow.python.training import saver as saver_lib
 from tensorflow.python.util import nest
 
 
+def remove_variants(get_next_op):
+  # TODO(b/72408568): Remove this once session.run can get
+  # variant tensors.
+  """Remove variants from a nest structure, so sess.run will execute."""
+
+  def _remove_variant(x):
+    if isinstance(x, ops.Tensor) and x.dtype == dtypes.variant:
+      return ()
+    else:
+      return x
+
+  return nest.map_structure(_remove_variant, get_next_op)
+
+
 class DatasetSerializationTestBase(test.TestCase):
   """Base class for testing serializable datasets."""
 
   def tearDown(self):
     self._delete_ckpt()
 
-  # TODO(b/70988345): Support native `tf.SparseTensor` objects and get rid of
-  # `sparse_tensors` argument.
+  # TODO(b/72657739): Remove sparse_tensor argument, which is to test the
+  # (deprecated) saveable `SparseTensorSliceDataset`, once the API
+  # `from_sparse_tensor_slices()`and related tests are deleted.
   def run_core_tests(self, ds_fn1, ds_fn2, num_outputs, sparse_tensors=False):
     """Runs the core tests.
 
@@ -233,10 +250,10 @@ class DatasetSerializationTestBase(test.TestCase):
       saver = self._import_meta_graph()
       init_op, get_next_op = self._get_iterator_ops_from_collection(
           ds_fn, sparse_tensors=sparse_tensors)
+      get_next_op = remove_variants(get_next_op)
       with self.test_session(graph=g) as sess:
         self._restore(saver, sess)
-        sess.run(variables.global_variables_initializer())
-        sess.run(init_op)
+        self._initialize(init_op, sess)
         for _ in range(num_outputs):
           actual.append(sess.run(get_next_op))
         if verify_exhausted:
@@ -296,6 +313,7 @@ class DatasetSerializationTestBase(test.TestCase):
     with ops.Graph().as_default() as g:
       _, get_next_op, saver = self._build_graph(
           ds_fn2, sparse_tensors=sparse_tensors)
+      get_next_op = remove_variants(get_next_op)
       with self.test_session(graph=g) as sess:
         self._restore(saver, sess)
         for _ in range(num_outputs - break_point):
@@ -356,6 +374,7 @@ class DatasetSerializationTestBase(test.TestCase):
     with ops.Graph().as_default() as g:
       get_next_op, saver = self._build_empty_graph(
           ds_fn, sparse_tensors=sparse_tensors)
+      get_next_op = remove_variants(get_next_op)
       with self.test_session(graph=g) as sess:
         self._restore(saver, sess)
         for _ in range(num_outputs - break_point):
@@ -389,9 +408,9 @@ class DatasetSerializationTestBase(test.TestCase):
     with ops.Graph().as_default() as g:
       init_op, get_next_op, saver = self._build_graph(
           ds_fn, sparse_tensors=sparse_tensors)
+      get_next_op = remove_variants(get_next_op)
       with self.test_session(graph=g) as sess:
-        sess.run(variables.global_variables_initializer())
-        sess.run(init_op)
+        self._initialize(init_op, sess)
         for _ in range(break_point):
           sess.run(get_next_op)
         with self.assertRaises(error):
@@ -485,20 +504,20 @@ class DatasetSerializationTestBase(test.TestCase):
       else:
         init_op, get_next_op, saver = self._build_graph(
             ds_fn, sparse_tensors=sparse_tensors)
+      get_next_op = remove_variants(get_next_op)
       return init_op, get_next_op, saver
 
     for i in range(len(break_points) + 1):
       with ops.Graph().as_default() as g:
         init_op, get_next_op, saver = get_ops()
+        get_next_op = remove_variants(get_next_op)
         with self.test_session(graph=g) as sess:
           if ckpt_saved:
             if init_before_restore:
-              sess.run(variables.global_variables_initializer())
-              sess.run(init_op)
+              self._initialize(init_op, sess)
             self._restore(saver, sess)
           else:
-            sess.run(variables.global_variables_initializer())
-            sess.run(init_op)
+            self._initialize(init_op, sess)
           start = break_points[i - 1] if i > 0 else 0
           end = break_points[i] if i < len(break_points) else num_outputs
           num_iters = end - start
@@ -562,13 +581,16 @@ class DatasetSerializationTestBase(test.TestCase):
       get_next = sparse_tensor.SparseTensor(*iterator.get_next())
     else:
       get_next = iterator.get_next()
-    self._add_iterator_ops_to_collection(init_op, get_next, sparse_tensors)
+    self._add_iterator_ops_to_collection(init_op, get_next, ds_fn,
+                                         sparse_tensors)
     saver = saver_lib.Saver(allow_empty=True)
     return init_op, get_next, saver
 
   def _build_empty_graph(self, ds_fn, sparse_tensors=False):
     iterator = iterator_ops.Iterator.from_structure(
-        self._get_output_types(ds_fn), self._get_output_shapes(ds_fn))
+        self._get_output_types(ds_fn),
+        output_shapes=self._get_output_shapes(ds_fn),
+        output_classes=self._get_output_classes(ds_fn))
     saveable = contrib_iterator_ops.make_saveable_from_iterator(iterator)
     ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, saveable)
     if sparse_tensors:
@@ -581,12 +603,19 @@ class DatasetSerializationTestBase(test.TestCase):
   def _add_iterator_ops_to_collection(self,
                                       init_op,
                                       get_next,
+                                      ds_fn,
                                       sparse_tensors=False):
     ops.add_to_collection("iterator_ops", init_op)
     # `get_next` may be a tuple e.g. in TensorSliceDataset. Since Collections
     # do not support tuples we flatten the tensors and restore the shape in
     # `_get_iterator_ops_from_collection`.
-    if sparse_tensors:
+
+    # TODO(shivaniagrwal): `output_classes` is a nested structure of classes,
+    # this base class is specific to current test cases. Update when tests are
+    # added with `output_classes` as a nested structure with at least one of the
+    # component being `tf.SparseTensor`.
+    if (sparse_tensors or
+        self._get_output_classes(ds_fn) is sparse_tensor.SparseTensor):
       ops.add_to_collection("iterator_ops", get_next.indices)
       ops.add_to_collection("iterator_ops", get_next.values)
       ops.add_to_collection("iterator_ops", get_next.dense_shape)
@@ -596,7 +625,8 @@ class DatasetSerializationTestBase(test.TestCase):
 
   def _get_iterator_ops_from_collection(self, ds_fn, sparse_tensors=False):
     all_ops = ops.get_collection("iterator_ops")
-    if sparse_tensors:
+    if (sparse_tensors or
+        self._get_output_classes(ds_fn) is sparse_tensor.SparseTensor):
       init_op, indices, values, dense_shape = all_ops
       return init_op, sparse_tensor.SparseTensor(indices, values, dense_shape)
     else:
@@ -611,6 +641,10 @@ class DatasetSerializationTestBase(test.TestCase):
     with ops.Graph().as_default():
       return ds_fn().output_shapes
 
+  def _get_output_classes(self, ds_fn):
+    with ops.Graph().as_default():
+      return ds_fn().output_classes
+
   def _ckpt_path(self):
     return os.path.join(self.get_temp_dir(), "iterator")
 
@@ -621,8 +655,14 @@ class DatasetSerializationTestBase(test.TestCase):
     saver.save(sess, self._ckpt_path())
 
   def _restore(self, saver, sess):
+    sess.run(lookup_ops.tables_initializer())
     saver.restore(sess, self._latest_ckpt())
 
+  def _initialize(self, init_op, sess):
+    sess.run(variables.global_variables_initializer())
+    sess.run(lookup_ops.tables_initializer())
+    sess.run(init_op)
+
   def _import_meta_graph(self):
     meta_file_path = self._ckpt_path() + ".meta"
     return saver_lib.import_meta_graph(meta_file_path)
diff --git a/tensorflow/contrib/data/python/kernel_tests/filter_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/filter_dataset_op_test.py
index 5921be2ae89ba1bbbb8d6e3a509cf49c65949544..06883934d044c2c5faf467dd1708b858a2f8f9ab 100644
--- a/tensorflow/contrib/data/python/kernel_tests/filter_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/filter_dataset_op_test.py
@@ -194,6 +194,10 @@ class FilterDatasetSerializationTest(
     return dataset_ops.Dataset.range(10).map(_map_fn).filter(_filter_fn).map(
         lambda x, i: x)
 
+  def testSparseCore(self):
+    num_outputs = 5
+    self.run_core_tests(self._build_sparse_filter, None, num_outputs)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/flat_map_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/flat_map_dataset_op_test.py
index d4fbaa5cdcdd315aa0524134b48eb0515169722c..86d69495ef47da0bc93b8d9b1299e552fc676ee1 100644
--- a/tensorflow/contrib/data/python/kernel_tests/flat_map_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/flat_map_dataset_op_test.py
@@ -225,6 +225,21 @@ class FlatMapDatasetSerializationTest(
 
     self.verify_error_on_save(build_ds, 500, errors.InvalidArgumentError)
 
+  def testSparseCore(self):
+
+    def _map_fn(i):
+      return sparse_tensor.SparseTensorValue(
+          indices=[[0, 0], [1, 1]], values=(i * [1, -1]), dense_shape=[2, 2])
+
+    def _flat_map_fn(x):
+      return dataset_ops.Dataset.from_tensor_slices(
+          sparse_ops.sparse_to_dense(x.indices, x.dense_shape, x.values))
+
+    def _build_ds():
+      return dataset_ops.Dataset.range(10).map(_map_fn).flat_map(_flat_map_fn)
+
+    self.run_core_tests(_build_ds, None, 20)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py
index b1937c08f347734d0d6871bd30ed209ff520623a..db8429512bf2bf944e67b65d185aca99477c86d3 100644
--- a/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py
@@ -252,6 +252,22 @@ class InterleaveDatasetSeriazationTest(
         None, num_outputs)
     # pylint: enable=g-long-lambda
 
+  def testSparseCore(self):
+
+    def _map_fn(i):
+      return sparse_tensor.SparseTensorValue(
+          indices=[[0, 0], [1, 1]], values=(i * [1, -1]), dense_shape=[2, 2])
+
+    def _interleave_fn(x):
+      return dataset_ops.Dataset.from_tensor_slices(
+          sparse_ops.sparse_to_dense(x.indices, x.dense_shape, x.values))
+
+    def _build_dataset():
+      return dataset_ops.Dataset.range(10).map(_map_fn).interleave(
+          _interleave_fn, cycle_length=1)
+
+    self.run_core_tests(_build_dataset, None, 20)
+
 
 class ParallelInterleaveDatasetTest(test.TestCase):
 
diff --git a/tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py
index dd8247bfd47a9880c7cfe905103702e43b1f2165..d3ce89298be342e22f12c46e8e8213ef636d0dc6 100644
--- a/tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py
@@ -805,6 +805,21 @@ class MapDatasetSerializationTest(
 
     self.run_core_tests(_build_ds, None, num_outputs)
 
+  def testSparseCore(self):
+
+    def _sparse(i):
+      return sparse_tensor.SparseTensorValue(
+          indices=np.array([[0, 0]]),
+          values=(i * np.array([1])),
+          dense_shape=np.array([1, 1]))
+
+    def _build_ds(num_outputs):
+      return contrib_dataset_ops.Dataset.range(num_outputs).map(_sparse)
+
+    num_outputs = 10
+    self.run_core_tests(lambda: _build_ds(num_outputs),
+                        lambda: _build_ds(int(num_outputs / 2)), num_outputs)
+
 
 class ParallelMapDatasetSerializationTest(
     dataset_serialization_test_base.DatasetSerializationTestBase):
@@ -851,7 +866,8 @@ class ParallelMapDatasetSerializationTest(
         return random_ops.random_uniform(
             (), 0, 10, dtype=dtypes.int32) * math_ops.to_int32(x)
 
-      return contrib_dataset_ops.Dataset.range(100).map(_map_fn)
+      return contrib_dataset_ops.Dataset.range(100).map(
+          _map_fn, num_parallel_calls=2).prefetch(2)
 
     self.verify_error_on_save(_build_ds, 15, errors.InvalidArgumentError)
 
@@ -861,7 +877,8 @@ class ParallelMapDatasetSerializationTest(
       counter_var = variable_scope.get_variable(
           "counter", (), dtypes.int32, use_resource=True)
       return (contrib_dataset_ops.Dataset.from_tensors(0).repeat(10).map(
-          lambda _: counter_var.assign_add(1)))
+          lambda _: counter_var.assign_add(1),
+          num_parallel_calls=2).prefetch(2))
 
     self.verify_error_on_save(_build_ds, 15, errors.InvalidArgumentError)
 
@@ -870,7 +887,7 @@ class ParallelMapDatasetSerializationTest(
     def _build_ds():
       constant_var = constant_op.constant(5)
       return (contrib_dataset_ops.Dataset.from_tensors(0).repeat(10).map(
-          lambda x: x + constant_var))
+          lambda x: x + constant_var, num_parallel_calls=2).prefetch(2))
 
     self.run_core_tests(_build_ds, None, 10)
 
@@ -883,7 +900,8 @@ class ParallelMapDatasetSerializationTest(
       def defun_fn(x):
         return constant_op.constant(1000) + math_ops.to_int32(x)
 
-      return contrib_dataset_ops.Dataset.range(num_outputs).map(defun_fn)
+      return contrib_dataset_ops.Dataset.range(num_outputs).map(
+          defun_fn, num_parallel_calls=2).prefetch(2)
 
     self.run_core_tests(_build_ds, None, num_outputs)
 
@@ -901,7 +919,8 @@ class ParallelMapDatasetSerializationTest(
 
         return constant_op.constant(11000) + defun_fn_deep(math_ops.to_int32(x))
 
-      return contrib_dataset_ops.Dataset.range(num_outputs).map(defun_fn)
+      return contrib_dataset_ops.Dataset.range(num_outputs).map(
+          defun_fn, num_parallel_calls=2).prefetch(2)
 
     self.run_core_tests(_build_ds, None, num_outputs)
 
diff --git a/tensorflow/contrib/data/python/ops/stats_ops.py b/tensorflow/contrib/data/python/ops/stats_ops.py
index 1dd0729513c0d46db25226178eb17b41efaae0ae..9cd1701c397b5a0bf5cc47c1bcab033704794d80 100644
--- a/tensorflow/contrib/data/python/ops/stats_ops.py
+++ b/tensorflow/contrib/data/python/ops/stats_ops.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.data.util import nest
+from tensorflow.python.data.util import sparse
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import gen_dataset_ops
@@ -161,8 +162,10 @@ class _StatsDataset(dataset_ops.Dataset):
     return self._op_function(
         self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
         self._tag,
-        output_shapes=nest.flatten(self.output_shapes),
-        output_types=nest.flatten(self.output_types))
+        output_types=nest.flatten(
+            sparse.as_dense_types(self.output_types, self.output_classes)),
+        output_shapes=nest.flatten(
+            sparse.as_dense_shapes(self.output_shapes, self.output_classes)))
 
   @property
   def output_shapes(self):
diff --git a/tensorflow/contrib/distributions/__init__.py b/tensorflow/contrib/distributions/__init__.py
index 59cc5eae066339cf22e18730440446e6088519ba..60a187e541df4a794ae3944c30c427944915f7d0 100644
--- a/tensorflow/contrib/distributions/__init__.py
+++ b/tensorflow/contrib/distributions/__init__.py
@@ -85,6 +85,7 @@ from tensorflow.python.ops.distributions.uniform import *
 from tensorflow.python.util.all_util import remove_undocumented
 
 _allowed_symbols = [
+    'auto_correlation',
     'bijectors',
     'Cauchy',
     'ConditionalDistribution',
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/distribution_util_test.py b/tensorflow/contrib/distributions/python/kernel_tests/distribution_util_test.py
index a255d4fc890e67180532e342332a8e3f63a869cd..31d24aa9ea09007b8db40e4869371b1f62639ac7 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/distribution_util_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/distribution_util_test.py
@@ -23,10 +23,15 @@ import itertools
 import numpy as np
 
 from tensorflow.contrib.distributions.python.ops import distribution_util
+from tensorflow.contrib.distributions.python.ops import mixture
+from tensorflow.contrib.distributions.python.ops import mixture_same_family
+from tensorflow.contrib.distributions.python.ops import mvn_diag
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops.distributions import categorical
+from tensorflow.python.ops.distributions import normal
 from tensorflow.python.ops.linalg import linear_operator_diag
 import tensorflow.python.ops.nn_grad  # pylint: disable=unused-import
 from tensorflow.python.platform import test
@@ -395,6 +400,41 @@ class MixtureStddevTest(test.TestCase):
     self.assertAllClose(actual_devs, expected_devs)
 
 
+class PadMixtureDimensionsTest(test.TestCase):
+
+  def test_pad_mixture_dimensions_mixture(self):
+    with self.test_session() as sess:
+      gm = mixture.Mixture(
+          cat=categorical.Categorical(probs=[[0.3, 0.7]]),
+          components=[
+              normal.Normal(loc=[-1.0], scale=[1.0]),
+              normal.Normal(loc=[1.0], scale=[0.5])
+          ])
+
+      x = array_ops.constant([[1.0, 2.0], [3.0, 4.0]])
+      x_pad = distribution_util.pad_mixture_dimensions(
+          x, gm, gm.cat, gm.event_shape.ndims)
+      x_out, x_pad_out = sess.run([x, x_pad])
+
+    self.assertAllEqual(x_pad_out.shape, [2, 2])
+    self.assertAllEqual(x_out.reshape([-1]), x_pad_out.reshape([-1]))
+
+  def test_pad_mixture_dimensions_mixture_same_family(self):
+    with self.test_session() as sess:
+      gm = mixture_same_family.MixtureSameFamily(
+          mixture_distribution=categorical.Categorical(probs=[0.3, 0.7]),
+          components_distribution=mvn_diag.MultivariateNormalDiag(
+              loc=[[-1., 1], [1, -1]], scale_identity_multiplier=[1.0, 0.5]))
+
+      x = array_ops.constant([[1.0, 2.0], [3.0, 4.0]])
+      x_pad = distribution_util.pad_mixture_dimensions(
+          x, gm, gm.mixture_distribution, gm.event_shape.ndims)
+      x_out, x_pad_out = sess.run([x, x_pad])
+
+    self.assertAllEqual(x_pad_out.shape, [2, 2, 1])
+    self.assertAllEqual(x_out.reshape([-1]), x_pad_out.reshape([-1]))
+
+
 class _PadTest(object):
 
   def testNegAxisCorrectness(self):
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/kumaraswamy_test.py b/tensorflow/contrib/distributions/python/kernel_tests/kumaraswamy_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..ea3c86b5c0f42b64fc6e4e362cbcc162bccf74a2
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/kernel_tests/kumaraswamy_test.py
@@ -0,0 +1,388 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import importlib
+
+import numpy as np
+
+from tensorflow.contrib.distributions.python.ops import kumaraswamy as kumaraswamy_lib
+from tensorflow.python.client import session
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import random_seed
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+from tensorflow.python.platform import tf_logging
+
+
+def try_import(name):  # pylint: disable=invalid-name
+  module = None
+  try:
+    module = importlib.import_module(name)
+  except ImportError as e:
+    tf_logging.warning("Could not import %s: %s" % (name, str(e)))
+  return module
+
+
+special = try_import("scipy.special")
+stats = try_import("scipy.stats")
+
+
+def _kumaraswamy_mode(a, b):
+  a = np.asarray(a)
+  b = np.asarray(b)
+  return ((a - 1) / (a * b - 1))**(1 / a)
+
+
+def _kumaraswamy_moment(a, b, n):
+  a = np.asarray(a)
+  b = np.asarray(b)
+  return b * special.beta(1.0 + n / a, b)
+
+
+def _harmonic_number(b):
+  b = np.asarray(b)
+  return special.psi(b + 1) - special.psi(1)
+
+
+def _kumaraswamy_cdf(a, b, x):
+  a = np.asarray(a)
+  b = np.asarray(b)
+  x = np.asarray(x)
+  return 1 - (1 - x**a)**b
+
+
+def _kumaraswamy_pdf(a, b, x):
+  a = np.asarray(a)
+  b = np.asarray(b)
+  x = np.asarray(x)
+  return a * b * x ** (a - 1) * (1 - x ** a) ** (b - 1)
+
+
+class KumaraswamyTest(test.TestCase):
+
+  def testSimpleShapes(self):
+    with self.test_session():
+      a = np.random.rand(3)
+      b = np.random.rand(3)
+      dist = kumaraswamy_lib.Kumaraswamy(a, b)
+      self.assertAllEqual([], dist.event_shape_tensor().eval())
+      self.assertAllEqual([3], dist.batch_shape_tensor().eval())
+      self.assertEqual(tensor_shape.TensorShape([]), dist.event_shape)
+      self.assertEqual(tensor_shape.TensorShape([3]), dist.batch_shape)
+
+  def testComplexShapes(self):
+    with self.test_session():
+      a = np.random.rand(3, 2, 2)
+      b = np.random.rand(3, 2, 2)
+      dist = kumaraswamy_lib.Kumaraswamy(a, b)
+      self.assertAllEqual([], dist.event_shape_tensor().eval())
+      self.assertAllEqual([3, 2, 2], dist.batch_shape_tensor().eval())
+      self.assertEqual(tensor_shape.TensorShape([]), dist.event_shape)
+      self.assertEqual(tensor_shape.TensorShape([3, 2, 2]), dist.batch_shape)
+
+  def testComplexShapesBroadcast(self):
+    with self.test_session():
+      a = np.random.rand(3, 2, 2)
+      b = np.random.rand(2, 2)
+      dist = kumaraswamy_lib.Kumaraswamy(a, b)
+      self.assertAllEqual([], dist.event_shape_tensor().eval())
+      self.assertAllEqual([3, 2, 2], dist.batch_shape_tensor().eval())
+      self.assertEqual(tensor_shape.TensorShape([]), dist.event_shape)
+      self.assertEqual(tensor_shape.TensorShape([3, 2, 2]), dist.batch_shape)
+
+  def testAProperty(self):
+    a = [[1., 2, 3]]
+    b = [[2., 4, 3]]
+    with self.test_session():
+      dist = kumaraswamy_lib.Kumaraswamy(a, b)
+      self.assertEqual([1, 3], dist.concentration1.get_shape())
+      self.assertAllClose(a, dist.concentration1.eval())
+
+  def testBProperty(self):
+    a = [[1., 2, 3]]
+    b = [[2., 4, 3]]
+    with self.test_session():
+      dist = kumaraswamy_lib.Kumaraswamy(a, b)
+      self.assertEqual([1, 3], dist.concentration0.get_shape())
+      self.assertAllClose(b, dist.concentration0.eval())
+
+  def testPdfXProper(self):
+    a = [[1., 2, 3]]
+    b = [[2., 4, 3]]
+    with self.test_session():
+      dist = kumaraswamy_lib.Kumaraswamy(a, b, validate_args=True)
+      dist.prob([.1, .3, .6]).eval()
+      dist.prob([.2, .3, .5]).eval()
+      # Either condition can trigger.
+      with self.assertRaisesOpError("sample must be positive"):
+        dist.prob([-1., 0.1, 0.5]).eval()
+      with self.assertRaisesOpError("sample must be positive"):
+        dist.prob([0., 0.1, 0.5]).eval()
+      with self.assertRaisesOpError("sample must be no larger than `1`"):
+        dist.prob([.1, .2, 1.2]).eval()
+
+  def testPdfTwoBatches(self):
+    with self.test_session():
+      a = [1., 2]
+      b = [1., 2]
+      x = [.5, .5]
+      dist = kumaraswamy_lib.Kumaraswamy(a, b)
+      pdf = dist.prob(x)
+      expected_pdf = _kumaraswamy_pdf(a, b, x)
+      self.assertAllClose(expected_pdf, pdf.eval())
+      self.assertEqual((2,), pdf.get_shape())
+
+  def testPdfTwoBatchesNontrivialX(self):
+    with self.test_session():
+      a = [1., 2]
+      b = [1., 2]
+      x = [.3, .7]
+      dist = kumaraswamy_lib.Kumaraswamy(a, b)
+      pdf = dist.prob(x)
+      expected_pdf = _kumaraswamy_pdf(a, b, x)
+      self.assertAllClose(expected_pdf, pdf.eval())
+      self.assertEqual((2,), pdf.get_shape())
+
+  def testPdfUniformZeroBatch(self):
+    with self.test_session():
+      # This is equivalent to a uniform distribution
+      a = 1.
+      b = 1.
+      x = np.array([.1, .2, .3, .5, .8], dtype=np.float32)
+      dist = kumaraswamy_lib.Kumaraswamy(a, b)
+      pdf = dist.prob(x)
+      expected_pdf = _kumaraswamy_pdf(a, b, x)
+      self.assertAllClose(expected_pdf, pdf.eval())
+      self.assertEqual((5,), pdf.get_shape())
+
+  def testPdfAStretchedInBroadcastWhenSameRank(self):
+    with self.test_session():
+      a = [[1., 2]]
+      b = [[1., 2]]
+      x = [[.5, .5], [.3, .7]]
+      dist = kumaraswamy_lib.Kumaraswamy(a, b)
+      pdf = dist.prob(x)
+      expected_pdf = _kumaraswamy_pdf(a, b, x)
+      self.assertAllClose(expected_pdf, pdf.eval())
+      self.assertEqual((2, 2), pdf.get_shape())
+
+  def testPdfAStretchedInBroadcastWhenLowerRank(self):
+    with self.test_session():
+      a = [1., 2]
+      b = [1., 2]
+      x = [[.5, .5], [.2, .8]]
+      pdf = kumaraswamy_lib.Kumaraswamy(a, b).prob(x)
+      expected_pdf = _kumaraswamy_pdf(a, b, x)
+      self.assertAllClose(expected_pdf, pdf.eval())
+      self.assertEqual((2, 2), pdf.get_shape())
+
+  def testPdfXStretchedInBroadcastWhenSameRank(self):
+    with self.test_session():
+      a = [[1., 2], [2., 3]]
+      b = [[1., 2], [2., 3]]
+      x = [[.5, .5]]
+      pdf = kumaraswamy_lib.Kumaraswamy(a, b).prob(x)
+      expected_pdf = _kumaraswamy_pdf(a, b, x)
+      self.assertAllClose(expected_pdf, pdf.eval())
+      self.assertEqual((2, 2), pdf.get_shape())
+
+  def testPdfXStretchedInBroadcastWhenLowerRank(self):
+    with self.test_session():
+      a = [[1., 2], [2., 3]]
+      b = [[1., 2], [2., 3]]
+      x = [.5, .5]
+      pdf = kumaraswamy_lib.Kumaraswamy(a, b).prob(x)
+      expected_pdf = _kumaraswamy_pdf(a, b, x)
+      self.assertAllClose(expected_pdf, pdf.eval())
+      self.assertEqual((2, 2), pdf.get_shape())
+
+  def testKumaraswamyMean(self):
+    with session.Session():
+      a = [1., 2, 3]
+      b = [2., 4, 1.2]
+      dist = kumaraswamy_lib.Kumaraswamy(a, b)
+      self.assertEqual(dist.mean().get_shape(), (3,))
+      if not stats:
+        return
+      expected_mean = _kumaraswamy_moment(a, b, 1)
+      self.assertAllClose(expected_mean, dist.mean().eval())
+
+  def testKumaraswamyVariance(self):
+    with session.Session():
+      a = [1., 2, 3]
+      b = [2., 4, 1.2]
+      dist = kumaraswamy_lib.Kumaraswamy(a, b)
+      self.assertEqual(dist.variance().get_shape(), (3,))
+      if not stats:
+        return
+      expected_variance = _kumaraswamy_moment(a, b, 2) - _kumaraswamy_moment(
+          a, b, 1)**2
+      self.assertAllClose(expected_variance, dist.variance().eval())
+
+  def testKumaraswamyMode(self):
+    with session.Session():
+      a = np.array([1.1, 2, 3])
+      b = np.array([2., 4, 1.2])
+      expected_mode = _kumaraswamy_mode(a, b)
+      dist = kumaraswamy_lib.Kumaraswamy(a, b)
+      self.assertEqual(dist.mode().get_shape(), (3,))
+      self.assertAllClose(expected_mode, dist.mode().eval())
+
+  def testKumaraswamyModeInvalid(self):
+    with session.Session():
+      a = np.array([1., 2, 3])
+      b = np.array([2., 4, 1.2])
+      dist = kumaraswamy_lib.Kumaraswamy(a, b, allow_nan_stats=False)
+      with self.assertRaisesOpError("Condition x < y.*"):
+        dist.mode().eval()
+
+      a = np.array([2., 2, 3])
+      b = np.array([1., 4, 1.2])
+      dist = kumaraswamy_lib.Kumaraswamy(a, b, allow_nan_stats=False)
+      with self.assertRaisesOpError("Condition x < y.*"):
+        dist.mode().eval()
+
+  def testKumaraswamyModeEnableAllowNanStats(self):
+    with session.Session():
+      a = np.array([1., 2, 3])
+      b = np.array([2., 4, 1.2])
+      dist = kumaraswamy_lib.Kumaraswamy(a, b, allow_nan_stats=True)
+
+      expected_mode = _kumaraswamy_mode(a, b)
+      expected_mode[0] = np.nan
+      self.assertEqual((3,), dist.mode().get_shape())
+      self.assertAllClose(expected_mode, dist.mode().eval())
+
+      a = np.array([2., 2, 3])
+      b = np.array([1., 4, 1.2])
+      dist = kumaraswamy_lib.Kumaraswamy(a, b, allow_nan_stats=True)
+
+      expected_mode = _kumaraswamy_mode(a, b)
+      expected_mode[0] = np.nan
+      self.assertEqual((3,), dist.mode().get_shape())
+      self.assertAllClose(expected_mode, dist.mode().eval())
+
+  def testKumaraswamyEntropy(self):
+    with session.Session():
+      a = np.array([1., 2, 3])
+      b = np.array([2., 4, 1.2])
+      dist = kumaraswamy_lib.Kumaraswamy(a, b)
+      self.assertEqual(dist.entropy().get_shape(), (3,))
+      if not stats:
+        return
+      expected_entropy = (1 - 1. / a) + (
+          1 - 1. / b) * _harmonic_number(b) + np.log(a * b)
+      self.assertAllClose(expected_entropy, dist.entropy().eval())
+
+  def testKumaraswamySample(self):
+    with self.test_session():
+      a = 1.
+      b = 2.
+      kumaraswamy = kumaraswamy_lib.Kumaraswamy(a, b)
+      n = constant_op.constant(100000)
+      samples = kumaraswamy.sample(n)
+      sample_values = samples.eval()
+      self.assertEqual(sample_values.shape, (100000,))
+      self.assertFalse(np.any(sample_values < 0.0))
+      if not stats:
+        return
+      self.assertLess(
+          stats.kstest(
+              # Kumaraswamy is a univariate distribution.
+              sample_values,
+              lambda x: _kumaraswamy_cdf(1., 2., x))[0],
+          0.01)
+      # The standard error of the sample mean is 1 / (sqrt(18 * n))
+      expected_mean = _kumaraswamy_moment(a, b, 1)
+      self.assertAllClose(sample_values.mean(axis=0), expected_mean, atol=1e-2)
+      expected_variance = _kumaraswamy_moment(a, b, 2) - _kumaraswamy_moment(
+          a, b, 1)**2
+      self.assertAllClose(
+          np.cov(sample_values, rowvar=0), expected_variance, atol=1e-1)
+
+  # Test that sampling with the same seed twice gives the same results.
+  def testKumaraswamySampleMultipleTimes(self):
+    with self.test_session():
+      a_val = 1.
+      b_val = 2.
+      n_val = 100
+
+      random_seed.set_random_seed(654321)
+      kumaraswamy1 = kumaraswamy_lib.Kumaraswamy(
+          concentration1=a_val, concentration0=b_val, name="kumaraswamy1")
+      samples1 = kumaraswamy1.sample(n_val, seed=123456).eval()
+
+      random_seed.set_random_seed(654321)
+      kumaraswamy2 = kumaraswamy_lib.Kumaraswamy(
+          concentration1=a_val, concentration0=b_val, name="kumaraswamy2")
+      samples2 = kumaraswamy2.sample(n_val, seed=123456).eval()
+
+      self.assertAllClose(samples1, samples2)
+
+  def testKumaraswamySampleMultidimensional(self):
+    with self.test_session():
+      a = np.random.rand(3, 2, 2).astype(np.float32)
+      b = np.random.rand(3, 2, 2).astype(np.float32)
+      kumaraswamy = kumaraswamy_lib.Kumaraswamy(a, b)
+      n = constant_op.constant(100000)
+      samples = kumaraswamy.sample(n)
+      sample_values = samples.eval()
+      self.assertEqual(sample_values.shape, (100000, 3, 2, 2))
+      self.assertFalse(np.any(sample_values < 0.0))
+      if not stats:
+        return
+      self.assertAllClose(
+          sample_values[:, 1, :].mean(axis=0),
+          _kumaraswamy_moment(a, b, 1)[1, :],
+          atol=1e-1)
+
+  def testKumaraswamyCdf(self):
+    with self.test_session():
+      shape = (30, 40, 50)
+      for dt in (np.float32, np.float64):
+        a = 10. * np.random.random(shape).astype(dt)
+        b = 10. * np.random.random(shape).astype(dt)
+        x = np.random.random(shape).astype(dt)
+        actual = kumaraswamy_lib.Kumaraswamy(a, b).cdf(x).eval()
+        self.assertAllEqual(np.ones(shape, dtype=np.bool), 0. <= x)
+        self.assertAllEqual(np.ones(shape, dtype=np.bool), 1. >= x)
+        if not stats:
+          return
+        self.assertAllClose(
+            _kumaraswamy_cdf(a, b, x), actual, rtol=1e-4, atol=0)
+
+  def testKumaraswamyLogCdf(self):
+    with self.test_session():
+      shape = (30, 40, 50)
+      for dt in (np.float32, np.float64):
+        a = 10. * np.random.random(shape).astype(dt)
+        b = 10. * np.random.random(shape).astype(dt)
+        x = np.random.random(shape).astype(dt)
+        actual = math_ops.exp(kumaraswamy_lib.Kumaraswamy(a,
+                                                          b).log_cdf(x)).eval()
+        self.assertAllEqual(np.ones(shape, dtype=np.bool), 0. <= x)
+        self.assertAllEqual(np.ones(shape, dtype=np.bool), 1. >= x)
+        if not stats:
+          return
+        self.assertAllClose(
+            _kumaraswamy_cdf(a, b, x), actual, rtol=1e-4, atol=0)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/mixture_test.py b/tensorflow/contrib/distributions/python/kernel_tests/mixture_test.py
index 1e514fe0ff21cd53c8c235da417890773db50c37..02064891758a86c5108e11da6a3666f2d5c56c64 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/mixture_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/mixture_test.py
@@ -107,7 +107,7 @@ def _test_capture_normal_sample_outputs():
   ds.Normal._call_sample_n = true_normal_call_sample_n
 
 
-def make_univariate_mixture(batch_shape, num_components):
+def make_univariate_mixture(batch_shape, num_components, use_static_graph):
   batch_shape = ops.convert_to_tensor(batch_shape, dtypes.int32)
   logits = random_ops.random_uniform(
       array_ops.concat((batch_shape, [num_components]), axis=0),
@@ -119,11 +119,11 @@ def make_univariate_mixture(batch_shape, num_components):
       for _ in range(num_components)
   ]
   cat = ds.Categorical(logits, dtype=dtypes.int32)
-  return ds.Mixture(cat, components)
+  return ds.Mixture(cat, components, use_static_graph=use_static_graph)
 
 
 def make_multivariate_mixture(batch_shape, num_components, event_shape,
-                              batch_shape_tensor=None):
+                              use_static_graph, batch_shape_tensor=None):
   if batch_shape_tensor is None:
     batch_shape_tensor = batch_shape
   batch_shape_tensor = ops.convert_to_tensor(batch_shape_tensor, dtypes.int32)
@@ -145,15 +145,17 @@ def make_multivariate_mixture(batch_shape, num_components, event_shape,
         loc=loc, scale_diag=scale_diag)
   components = [create_component() for _ in range(num_components)]
   cat = ds.Categorical(logits, dtype=dtypes.int32)
-  return ds.Mixture(cat, components)
+  return ds.Mixture(cat, components, use_static_graph=use_static_graph)
 
 
 class MixtureTest(test.TestCase):
+  use_static_graph = False
 
   def testShapes(self):
     with self.test_session():
       for batch_shape in ([], [1], [2, 3, 4]):
-        dist = make_univariate_mixture(batch_shape, num_components=10)
+        dist = make_univariate_mixture(batch_shape, num_components=10,
+                                       use_static_graph=self.use_static_graph)
         self.assertAllEqual(batch_shape, dist.batch_shape)
         self.assertAllEqual(batch_shape, dist.batch_shape_tensor().eval())
         self.assertAllEqual([], dist.event_shape)
@@ -161,7 +163,8 @@ class MixtureTest(test.TestCase):
 
         for event_shape in ([1], [2]):
           dist = make_multivariate_mixture(
-              batch_shape, num_components=10, event_shape=event_shape)
+              batch_shape, num_components=10, event_shape=event_shape,
+              use_static_graph=self.use_static_graph)
           self.assertAllEqual(batch_shape, dist.batch_shape)
           self.assertAllEqual(batch_shape, dist.batch_shape_tensor().eval())
           self.assertAllEqual(event_shape, dist.event_shape)
@@ -172,7 +175,8 @@ class MixtureTest(test.TestCase):
                                              r"cat.num_classes != len"):
       ds.Mixture(
           ds.Categorical([0.1, 0.5]),  # 2 classes
-          [ds.Normal(loc=1.0, scale=2.0)])
+          [ds.Normal(loc=1.0, scale=2.0)],
+          use_static_graph=self.use_static_graph)
     with self.assertRaisesWithPredicateMatch(
         ValueError, r"\(\) and \(2,\) are not compatible"):
       # The value error is raised because the batch shapes of the
@@ -185,13 +189,15 @@ class MixtureTest(test.TestCase):
                   loc=1.0, scale=2.0),  # scalar dist
               ds.Normal(
                   loc=[1.0, 1.0], scale=[2.0, 2.0])
-          ])
+          ],
+          use_static_graph=self.use_static_graph)
     with self.assertRaisesWithPredicateMatch(ValueError, r"Could not infer"):
       cat_logits = array_ops.placeholder(shape=[1, None], dtype=dtypes.float32)
       ds.Mixture(
           ds.Categorical(cat_logits),
           [ds.Normal(
-              loc=[1.0], scale=[2.0])])
+              loc=[1.0], scale=[2.0])],
+          use_static_graph=self.use_static_graph)
 
   def testBrokenShapesDynamic(self):
     with self.test_session():
@@ -203,29 +209,37 @@ class MixtureTest(test.TestCase):
                   loc=d0_param, scale=d0_param), ds.Normal(
                       loc=d1_param, scale=d1_param)
           ],
-          validate_args=True)
-      with self.assertRaisesOpError(r"batch shape must match"):
+          validate_args=True,
+          use_static_graph=self.use_static_graph)
+
+      if self.use_static_graph:
+        error_string = r"Shapes of all inputs must match"
+      else:
+        error_string = r"batch shape must match"
+
+      with self.assertRaisesOpError(error_string):
         d.sample().eval(feed_dict={d0_param: [2.0, 3.0], d1_param: [1.0]})
-      with self.assertRaisesOpError(r"batch shape must match"):
+      with self.assertRaisesOpError(error_string):
         d.sample().eval(feed_dict={d0_param: [2.0, 3.0], d1_param: 1.0})
 
   def testBrokenTypes(self):
     with self.assertRaisesWithPredicateMatch(TypeError, "Categorical"):
-      ds.Mixture(None, [])
+      ds.Mixture(None, [], use_static_graph=self.use_static_graph)
     cat = ds.Categorical([0.3, 0.2])
     # components must be a list of distributions
     with self.assertRaisesWithPredicateMatch(
         TypeError, "all .* must be Distribution instances"):
-      ds.Mixture(cat, [None])
+      ds.Mixture(cat, [None], use_static_graph=self.use_static_graph)
     with self.assertRaisesWithPredicateMatch(TypeError, "same dtype"):
       ds.Mixture(
           cat, [
               ds.Normal(loc=[1.0], scale=[2.0]),
               ds.Normal(loc=[np.float16(1.0)],
                         scale=[np.float16(2.0)]),
-          ])
+          ], use_static_graph=self.use_static_graph)
     with self.assertRaisesWithPredicateMatch(ValueError, "non-empty list"):
-      ds.Mixture(ds.Categorical([0.3, 0.2]), None)
+      ds.Mixture(ds.Categorical([0.3, 0.2]), None,
+                 use_static_graph=self.use_static_graph)
 
     # TODO(ebrevdo): once distribution Domains have been added, add a
     # test to ensure that the domains of the distributions in a
@@ -235,7 +249,8 @@ class MixtureTest(test.TestCase):
     with self.test_session() as sess:
       for batch_shape in ((), (2,), (2, 3)):
         dist = make_univariate_mixture(
-            batch_shape=batch_shape, num_components=2)
+            batch_shape=batch_shape, num_components=2,
+            use_static_graph=self.use_static_graph)
         mean = dist.mean()
         self.assertEqual(batch_shape, mean.get_shape())
 
@@ -256,7 +271,8 @@ class MixtureTest(test.TestCase):
     with self.test_session() as sess:
       for batch_shape in ((), (2,), (2, 3)):
         dist = make_multivariate_mixture(
-            batch_shape=batch_shape, num_components=2, event_shape=(4,))
+            batch_shape=batch_shape, num_components=2, event_shape=(4,),
+            use_static_graph=self.use_static_graph)
         mean = dist.mean()
         self.assertEqual(batch_shape + (4,), mean.get_shape())
 
@@ -283,7 +299,8 @@ class MixtureTest(test.TestCase):
     with self.test_session() as sess:
       for batch_shape in ((), (2,), (2, 3)):
         dist = make_univariate_mixture(
-            batch_shape=batch_shape, num_components=num_components)
+            batch_shape=batch_shape, num_components=num_components,
+            use_static_graph=self.use_static_graph)
         dev = dist.stddev()
         self.assertEqual(batch_shape, dev.get_shape())
 
@@ -325,7 +342,8 @@ class MixtureTest(test.TestCase):
         dist = make_multivariate_mixture(
             batch_shape=batch_shape,
             num_components=num_components,
-            event_shape=(4,))
+            event_shape=(4,),
+            use_static_graph=self.use_static_graph)
         dev = dist.stddev()
         self.assertEqual(batch_shape + (4,), dev.get_shape())
 
@@ -371,7 +389,8 @@ class MixtureTest(test.TestCase):
                       scale=component_devs[0]),
             ds.Normal(loc=component_means[1],
                       scale=component_devs[1]),
-        ])
+        ],
+        use_static_graph=self.use_static_graph)
     mix_dev = mixture_dist.stddev()
     with self.test_session() as sess:
       actual_stddev = sess.run(mix_dev)
@@ -379,7 +398,8 @@ class MixtureTest(test.TestCase):
 
   def testProbScalarUnivariate(self):
     with self.test_session() as sess:
-      dist = make_univariate_mixture(batch_shape=[], num_components=2)
+      dist = make_univariate_mixture(batch_shape=[], num_components=2,
+                                     use_static_graph=self.use_static_graph)
       for x in [
           np.array(
               [1.0, 2.0], dtype=np.float32), np.array(
@@ -405,7 +425,8 @@ class MixtureTest(test.TestCase):
   def testProbScalarMultivariate(self):
     with self.test_session() as sess:
       dist = make_multivariate_mixture(
-          batch_shape=[], num_components=2, event_shape=[3])
+          batch_shape=[], num_components=2, event_shape=[3],
+          use_static_graph=self.use_static_graph)
       for x in [
           np.array(
               [[-1.0, 0.0, 1.0], [0.5, 1.0, -0.3]], dtype=np.float32), np.array(
@@ -432,7 +453,8 @@ class MixtureTest(test.TestCase):
 
   def testProbBatchUnivariate(self):
     with self.test_session() as sess:
-      dist = make_univariate_mixture(batch_shape=[2, 3], num_components=2)
+      dist = make_univariate_mixture(batch_shape=[2, 3], num_components=2,
+                                     use_static_graph=self.use_static_graph)
 
       for x in [
           np.random.randn(2, 3).astype(np.float32),
@@ -459,7 +481,8 @@ class MixtureTest(test.TestCase):
   def testProbBatchMultivariate(self):
     with self.test_session() as sess:
       dist = make_multivariate_mixture(
-          batch_shape=[2, 3], num_components=2, event_shape=[4])
+          batch_shape=[2, 3], num_components=2, event_shape=[4],
+          use_static_graph=self.use_static_graph)
 
       for x in [
           np.random.randn(2, 3, 4).astype(np.float32),
@@ -487,7 +510,8 @@ class MixtureTest(test.TestCase):
       num_components = 3
       batch_shape = []
       dist = make_univariate_mixture(
-          batch_shape=batch_shape, num_components=num_components)
+          batch_shape=batch_shape, num_components=num_components,
+          use_static_graph=self.use_static_graph)
       n = 4
       with _test_capture_normal_sample_outputs() as component_samples:
         samples = dist.sample(n, seed=123)
@@ -502,7 +526,10 @@ class MixtureTest(test.TestCase):
         which_c = np.where(cat_sample_values == c)[0]
         size_c = which_c.size
         # Scalar Batch univariate case: batch_size == 1, rank 1
-        which_dist_samples = dist_sample_values[c][:size_c]
+        if self.use_static_graph:
+          which_dist_samples = dist_sample_values[c][which_c]
+        else:
+          which_dist_samples = dist_sample_values[c][:size_c]
         self.assertAllClose(which_dist_samples, sample_values[which_c])
 
   # Test that sampling with the same seed twice gives the same results.
@@ -522,7 +549,8 @@ class MixtureTest(test.TestCase):
       ]
       cat = ds.Categorical(
           logits, dtype=dtypes.int32, name="cat1")
-      dist1 = ds.Mixture(cat, components, name="mixture1")
+      dist1 = ds.Mixture(cat, components, name="mixture1",
+                         use_static_graph=self.use_static_graph)
       samples1 = dist1.sample(n, seed=123456).eval()
 
       random_seed.set_random_seed(654321)
@@ -532,7 +560,8 @@ class MixtureTest(test.TestCase):
       ]
       cat2 = ds.Categorical(
           logits, dtype=dtypes.int32, name="cat2")
-      dist2 = ds.Mixture(cat2, components2, name="mixture2")
+      dist2 = ds.Mixture(cat2, components2, name="mixture2",
+                         use_static_graph=self.use_static_graph)
       samples2 = dist2.sample(n, seed=123456).eval()
 
       self.assertAllClose(samples1, samples2)
@@ -541,7 +570,8 @@ class MixtureTest(test.TestCase):
     with self.test_session() as sess:
       num_components = 3
       dist = make_multivariate_mixture(
-          batch_shape=[], num_components=num_components, event_shape=[2])
+          batch_shape=[], num_components=num_components, event_shape=[2],
+          use_static_graph=self.use_static_graph)
       n = 4
       with _test_capture_mvndiag_sample_outputs() as component_samples:
         samples = dist.sample(n, seed=123)
@@ -555,14 +585,18 @@ class MixtureTest(test.TestCase):
         which_c = np.where(cat_sample_values == c)[0]
         size_c = which_c.size
         # Scalar Batch multivariate case: batch_size == 1, rank 2
-        which_dist_samples = dist_sample_values[c][:size_c, :]
+        if self.use_static_graph:
+          which_dist_samples = dist_sample_values[c][which_c, :]
+        else:
+          which_dist_samples = dist_sample_values[c][:size_c, :]
         self.assertAllClose(which_dist_samples, sample_values[which_c, :])
 
   def testSampleBatchUnivariate(self):
     with self.test_session() as sess:
       num_components = 3
       dist = make_univariate_mixture(
-          batch_shape=[2, 3], num_components=num_components)
+          batch_shape=[2, 3], num_components=num_components,
+          use_static_graph=self.use_static_graph)
       n = 4
       with _test_capture_normal_sample_outputs() as component_samples:
         samples = dist.sample(n, seed=123)
@@ -576,8 +610,12 @@ class MixtureTest(test.TestCase):
         which_c_s, which_c_b0, which_c_b1 = np.where(cat_sample_values == c)
         size_c = which_c_s.size
         # Batch univariate case: batch_size == [2, 3], rank 3
-        which_dist_samples = dist_sample_values[c][range(size_c), which_c_b0,
-                                                   which_c_b1]
+        if self.use_static_graph:
+          which_dist_samples = dist_sample_values[c][which_c_s, which_c_b0,
+                                                     which_c_b1]
+        else:
+          which_dist_samples = dist_sample_values[c][range(size_c), which_c_b0,
+                                                     which_c_b1]
         self.assertAllClose(which_dist_samples,
                             sample_values[which_c_s, which_c_b0, which_c_b1])
 
@@ -594,7 +632,8 @@ class MixtureTest(test.TestCase):
       dist = make_multivariate_mixture(
           batch_shape=batch_shape,
           num_components=num_components, event_shape=[4],
-          batch_shape_tensor=batch_shape_tensor)
+          batch_shape_tensor=batch_shape_tensor,
+          use_static_graph=self.use_static_graph)
       n = 5
       with _test_capture_mvndiag_sample_outputs() as component_samples:
         samples = dist.sample(n, seed=123)
@@ -617,8 +656,12 @@ class MixtureTest(test.TestCase):
         which_c_s, which_c_b0, which_c_b1 = np.where(cat_sample_values == c)
         size_c = which_c_s.size
         # Batch univariate case: batch_size == [2, 3], rank 4 (multivariate)
-        which_dist_samples = dist_sample_values[c][range(size_c), which_c_b0,
-                                                   which_c_b1, :]
+        if self.use_static_graph:
+          which_dist_samples = dist_sample_values[c][which_c_s, which_c_b0,
+                                                     which_c_b1, :]
+        else:
+          which_dist_samples = dist_sample_values[c][range(size_c), which_c_b0,
+                                                     which_c_b1, :]
         self.assertAllClose(which_dist_samples,
                             sample_values[which_c_s, which_c_b0, which_c_b1, :])
 
@@ -632,7 +675,8 @@ class MixtureTest(test.TestCase):
     with self.test_session() as sess:
       for batch_shape in ((), (2,), (2, 3)):
         dist = make_multivariate_mixture(
-            batch_shape=batch_shape, num_components=2, event_shape=(4,))
+            batch_shape=batch_shape, num_components=2, event_shape=(4,),
+            use_static_graph=self.use_static_graph)
         entropy_lower_bound = dist.entropy_lower_bound()
         self.assertEqual(batch_shape, entropy_lower_bound.get_shape())
 
@@ -673,7 +717,8 @@ class MixtureTest(test.TestCase):
     cat_tf = ds.Categorical(probs=mixture_weights)
     components_tf = [ds.Normal(loc=mu, scale=sigma)
                      for (mu, sigma) in zip(means, sigmas)]
-    mixture_tf = ds.Mixture(cat=cat_tf, components=components_tf)
+    mixture_tf = ds.Mixture(cat=cat_tf, components=components_tf,
+                            use_static_graph=self.use_static_graph)
 
     x_tensor = array_ops.placeholder(shape=(), dtype=dtypes.float32)
 
@@ -721,7 +766,8 @@ class MixtureTest(test.TestCase):
     cat_tf = ds.Categorical(probs=mixture_weights)
     components_tf = [ds.Normal(loc=mu, scale=sigma)
                      for (mu, sigma) in zip(means, sigmas)]
-    mixture_tf = ds.Mixture(cat=cat_tf, components=components_tf)
+    mixture_tf = ds.Mixture(cat=cat_tf, components=components_tf,
+                            use_static_graph=self.use_static_graph)
 
     x_tensor = array_ops.placeholder(shape=psize, dtype=dtypes.float32)
     xs_to_check = [
@@ -760,12 +806,18 @@ class MixtureTest(test.TestCase):
       gm = ds.Mixture(
           cat=ds.Categorical(probs=[.3, .7]),
           components=[ds.Gamma(1., 2.),
-                      ds.Gamma(2., 1.)])
+                      ds.Gamma(2., 1.)],
+          use_static_graph=self.use_static_graph)
       x_ = gm.sample().eval()
       self.assertAllEqual([], x_.shape)
 
 
+class MixtureStaticSampleTest(MixtureTest):
+  use_static_graph = True
+
+
 class MixtureBenchmark(test.Benchmark):
+  use_static_graph = False
 
   def _runSamplingBenchmark(self, name, create_distribution, use_gpu,
                             num_components, batch_size, num_features,
@@ -811,7 +863,7 @@ class MixtureBenchmark(test.Benchmark):
       components = list(
           ds.MultivariateNormalDiag(
               loc=mu, scale_diag=sigma) for (mu, sigma) in zip(mus, sigmas))
-      return ds.Mixture(cat, components)
+      return ds.Mixture(cat, components, use_static_graph=self.use_static_graph)
 
     for use_gpu in False, True:
       if use_gpu and not test.is_gpu_available():
@@ -853,7 +905,7 @@ class MixtureBenchmark(test.Benchmark):
           ds.MultivariateNormalTriL(
               loc=mu, scale_tril=linalg_ops.cholesky(sigma))
           for (mu, sigma) in zip(mus, sigmas))
-      return ds.Mixture(cat, components)
+      return ds.Mixture(cat, components, use_static_graph=self.use_static_graph)
 
     for use_gpu in False, True:
       if use_gpu and not test.is_gpu_available():
@@ -872,5 +924,9 @@ class MixtureBenchmark(test.Benchmark):
                   sample_size=sample_size)
 
 
+class MixtureStaticSampleBenchmark(MixtureBenchmark):
+  use_static_graph = True
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/masked_autoregressive.py b/tensorflow/contrib/distributions/python/ops/bijectors/masked_autoregressive.py
index dc8ae1eed19eda772219287d8661f534ac242d10..5251dbcb5748f75688aa43ce6e4e9dbd76be78bb 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/masked_autoregressive.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/masked_autoregressive.py
@@ -237,6 +237,11 @@ class MaskedAutoregressiveFlow(bijector_lib.Bijector):
       return y
 
     event_size = array_ops.shape(x)[-1]
+    # If the event size is available at graph construction time, we can inform
+    # the graph compiler of the maximum number of steps. If not,
+    # static_event_size will be None, and the maximum_iterations argument will
+    # have no effect.
+    static_event_size = x.shape.with_rank_at_least(1)[-1].value
     y0 = array_ops.zeros_like(x, name="y0")
     # call the template once to ensure creation
     _ = self._shift_and_log_scale_fn(y0)
@@ -258,7 +263,8 @@ class MaskedAutoregressiveFlow(bijector_lib.Bijector):
     _, y = control_flow_ops.while_loop(
         cond=lambda index, _: index < event_size,
         body=_loop_body,
-        loop_vars=[0, y0])
+        loop_vars=(0, y0),
+        maximum_iterations=static_event_size)
     return y
 
   def _inverse(self, y):
diff --git a/tensorflow/contrib/distributions/python/ops/distribution_util.py b/tensorflow/contrib/distributions/python/ops/distribution_util.py
index a4d249d41ec9733721a3583d3708e0da56db1733..289e1d50e1146a641c0cc433ece3465aed73b1c2 100644
--- a/tensorflow/contrib/distributions/python/ops/distribution_util.py
+++ b/tensorflow/contrib/distributions/python/ops/distribution_util.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib import linalg
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
@@ -442,6 +443,44 @@ def maybe_check_scalar_distribution(
     return assertions
 
 
+def pad_mixture_dimensions(x, mixture_distribution, categorical_distribution,
+                           event_ndims):
+  """Pad dimensions of event tensors for mixture distributions.
+
+  See `Mixture._sample_n` and `MixtureSameFamily._sample_n` for usage examples.
+
+  Args:
+    x: event tensor to pad.
+    mixture_distribution: Base distribution of the mixture.
+    categorical_distribution: `Categorical` distribution that mixes the base
+      distribution.
+    event_ndims: Integer specifying the number of event dimensions in the event
+      tensor.
+
+  Returns:
+    A padded version of `x` that can broadcast with `categorical_distribution`.
+  """
+  with ops.name_scope("pad_mix_dims", values=[x]):
+    def _get_ndims(d):
+      if d.batch_shape.ndims is not None:
+        return d.batch_shape.ndims
+      return array_ops.shape(d.batch_shape_tensor())[0]
+    dist_batch_ndims = _get_ndims(mixture_distribution)
+    cat_batch_ndims = _get_ndims(categorical_distribution)
+    pad_ndims = array_ops.where(
+        categorical_distribution.is_scalar_batch(),
+        dist_batch_ndims,
+        dist_batch_ndims - cat_batch_ndims)
+    s = array_ops.shape(x)
+    x = array_ops.reshape(x, shape=array_ops.concat([
+        s[:-1],
+        array_ops.ones([pad_ndims], dtype=dtypes.int32),
+        s[-1:],
+        array_ops.ones([event_ndims], dtype=dtypes.int32),
+    ], axis=0))
+    return x
+
+
 def static_value(x):
   """Returns the static value of a `Tensor` or `None`."""
   return tensor_util.constant_value(ops.convert_to_tensor(x))
diff --git a/tensorflow/contrib/distributions/python/ops/kumaraswamy.py b/tensorflow/contrib/distributions/python/ops/kumaraswamy.py
new file mode 100644
index 0000000000000000000000000000000000000000..74d5d8773cf3e69a52554c87d656fea2835c8354
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/ops/kumaraswamy.py
@@ -0,0 +1,258 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""The Kumaraswamy distribution class."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import special_math_ops
+from tensorflow.python.ops.distributions import beta
+from tensorflow.python.ops.distributions import distribution
+from tensorflow.python.ops.distributions import util as distribution_util
+from tensorflow.python.util.tf_export import tf_export
+
+__all__ = [
+    "Kumaraswamy",
+]
+
+_kumaraswamy_sample_note = """Note: `x` must have dtype `self.dtype` and be in
+`[0, 1].` It must have a shape compatible with `self.batch_shape()`."""
+
+
+def _harmonic_number(x):
+  """Compute the harmonic number from its analytic continuation.
+
+  Derivation from [1] and Euler's constant [2].
+  [1] -
+  https://en.wikipedia.org/wiki/Digamma_function#Relation_to_harmonic_numbers
+  [2] - https://en.wikipedia.org/wiki/Euler%E2%80%93Mascheroni_constant
+
+
+  Args:
+    x: input float.
+
+  Returns:
+    z: The analytic continuation of the harmonic number for the input.
+
+  """
+  one = array_ops.ones([], dtype=x.dtype)
+  return math_ops.digamma(x + one) - math_ops.digamma(one)
+
+
+@tf_export("distributions.Kumaraswamy")
+class Kumaraswamy(beta.Beta):
+  """Kumaraswamy distribution.
+
+  The Kumaraswamy distribution is defined over the `(0, 1)` interval using
+  parameters
+  `concentration1` (aka "alpha") and `concentration0` (aka "beta").  It has a
+  shape similar to the Beta distribution, but is reparameterizeable.
+
+  #### Mathematical Details
+
+  The probability density function (pdf) is,
+
+  ```none
+  pdf(x; alpha, beta) = alpha * beta * x**(alpha - 1) * (1 - x**alpha)**(beta -
+  1)
+  ```
+
+  where:
+
+  * `concentration1 = alpha`,
+  * `concentration0 = beta`,
+
+  Distribution parameters are automatically broadcast in all functions; see
+  examples for details.
+
+  #### Examples
+
+  ```python
+  # Create a batch of three Kumaraswamy distributions.
+  alpha = [1, 2, 3]
+  beta = [1, 2, 3]
+  dist = Kumaraswamy(alpha, beta)
+
+  dist.sample([4, 5])  # Shape [4, 5, 3]
+
+  # `x` has three batch entries, each with two samples.
+  x = [[.1, .4, .5],
+       [.2, .3, .5]]
+  # Calculate the probability of each pair of samples under the corresponding
+  # distribution in `dist`.
+  dist.prob(x)         # Shape [2, 3]
+  ```
+
+  ```python
+  # Create batch_shape=[2, 3] via parameter broadcast:
+  alpha = [[1.], [2]]      # Shape [2, 1]
+  beta = [3., 4, 5]        # Shape [3]
+  dist = Kumaraswamy(alpha, beta)
+
+  # alpha broadcast as: [[1., 1, 1,],
+  #                      [2, 2, 2]]
+  # beta broadcast as:  [[3., 4, 5],
+  #                      [3, 4, 5]]
+  # batch_Shape [2, 3]
+  dist.sample([4, 5])  # Shape [4, 5, 2, 3]
+
+  x = [.2, .3, .5]
+  # x will be broadcast as [[.2, .3, .5],
+  #                         [.2, .3, .5]],
+  # thus matching batch_shape [2, 3].
+  dist.prob(x)         # Shape [2, 3]
+  ```
+
+  """
+
+  def __init__(self,
+               concentration1=None,
+               concentration0=None,
+               validate_args=False,
+               allow_nan_stats=True,
+               name="Kumaraswamy"):
+    """Initialize a batch of Kumaraswamy distributions.
+
+    Args:
+      concentration1: Positive floating-point `Tensor` indicating mean
+        number of successes; aka "alpha". Implies `self.dtype` and
+        `self.batch_shape`, i.e.,
+        `concentration1.shape = [N1, N2, ..., Nm] = self.batch_shape`.
+      concentration0: Positive floating-point `Tensor` indicating mean
+        number of failures; aka "beta". Otherwise has same semantics as
+        `concentration1`.
+      validate_args: Python `bool`, default `False`. When `True` distribution
+        parameters are checked for validity despite possibly degrading runtime
+        performance. When `False` invalid inputs may silently render incorrect
+        outputs.
+      allow_nan_stats: Python `bool`, default `True`. When `True`, statistics
+        (e.g., mean, mode, variance) use the value "`NaN`" to indicate the
+        result is undefined. When `False`, an exception is raised if one or
+        more of the statistic's batch members are undefined.
+      name: Python `str` name prefixed to Ops created by this class.
+    """
+    super(Kumaraswamy, self).__init__(
+        concentration1=concentration1,
+        concentration0=concentration0,
+        validate_args=validate_args,
+        allow_nan_stats=allow_nan_stats,
+        name=name)
+    self._reparameterization_type = distribution.FULLY_REPARAMETERIZED
+
+  def _sample_n(self, n, seed=None):
+    expanded_concentration1 = array_ops.ones_like(
+        self.total_concentration, dtype=self.dtype) * self.concentration1
+    expanded_concentration0 = array_ops.ones_like(
+        self.total_concentration, dtype=self.dtype) * self.concentration0
+    shape = array_ops.concat([[n], self.batch_shape_tensor()], 0)
+    uniform_sample = random_ops.random_uniform(
+        shape=shape, minval=0.0, maxval=1.0, dtype=self.dtype, seed=seed)
+
+    kumaraswamy_sample = (1 - uniform_sample**(1. / expanded_concentration0))**(
+        1. / expanded_concentration1)
+    return kumaraswamy_sample
+
+  @distribution_util.AppendDocstring(_kumaraswamy_sample_note)
+  def _log_cdf(self, x):
+    a = self.concentration1
+    b = self.concentration0
+    return math_ops.log1p(-(1 - x**a)**b)
+
+  @distribution_util.AppendDocstring(_kumaraswamy_sample_note)
+  def _cdf(self, x):
+    a = self.concentration1
+    b = self.concentration0
+    return 1 - (1 - x**a)**b
+
+  def _survival_function(self, x):
+    a = self.concentration1
+    b = self.concentration0
+    return (1 - x**a)**b
+
+  def _log_survival_function(self, x):
+    a = self.concentration1
+    b = self.concentration0
+    return b * math_ops.log1p(-x**a)
+
+  def _log_unnormalized_prob(self, x):
+    x = self._maybe_assert_valid_sample(x)
+    a = self.concentration1
+    b = self.concentration0
+    return (a - 1) * math_ops.log(x) + (b - 1) * math_ops.log1p(-x**a)
+
+  def _log_normalization(self):
+    a = self.concentration1
+    b = self.concentration0
+    return -(math_ops.log(a) + math_ops.log(b))
+
+  def _entropy(self):
+    a = self.concentration1
+    b = self.concentration0
+    return (1 - 1. / a) + (
+        1 - 1. / b) * _harmonic_number(b) + math_ops.log(a) + math_ops.log(b)
+
+  def _moment(self, n):
+    """Compute the n'th (uncentered) moment."""
+    expanded_concentration1 = array_ops.ones_like(
+        self.total_concentration, dtype=self.dtype) * self.concentration1
+    expanded_concentration0 = array_ops.ones_like(
+        self.total_concentration, dtype=self.dtype) * self.concentration0
+    beta_arg0 = 1 + n / expanded_concentration1
+    beta_arg = array_ops.stack([beta_arg0, expanded_concentration0], -1)
+    log_moment = math_ops.log(expanded_concentration0) + special_math_ops.lbeta(
+        beta_arg)
+    return math_ops.exp(log_moment)
+
+  def _mean(self):
+    return self._moment(1)
+
+  def _variance(self):
+    # TODO(b/72696533): Investigate a more numerically stable version.
+    return self._moment(2) - math_ops.square(self._moment(1))
+
+  @distribution_util.AppendDocstring(
+      """Note: The mode is undefined when `concentration1 <= 1` or
+      `concentration0 <= 1`. If `self.allow_nan_stats` is `True`, `NaN`
+      is used for undefined modes. If `self.allow_nan_stats` is `False` an
+      exception is raised when one or more modes are undefined.""")
+  def _mode(self):
+    a = self.concentration1
+    b = self.concentration0
+    mode = ((a - 1) / (a * b - 1))**(1. / a)
+    if self.allow_nan_stats:
+      nan = array_ops.fill(
+          self.batch_shape_tensor(),
+          np.array(np.nan, dtype=self.dtype.as_numpy_dtype),
+          name="nan")
+      is_defined = (self.concentration1 > 1.) & (self.concentration0 > 1.)
+      return array_ops.where(is_defined, mode, nan)
+    return control_flow_ops.with_dependencies([
+        check_ops.assert_less(
+            array_ops.ones([], dtype=self.dtype),
+            self.concentration1,
+            message="Mode undefined for concentration1 <= 1."),
+        check_ops.assert_less(
+            array_ops.ones([], dtype=self.dtype),
+            self.concentration0,
+            message="Mode undefined for concentration0 <= 1.")
+    ], mode)
diff --git a/tensorflow/contrib/distributions/python/ops/mixture.py b/tensorflow/contrib/distributions/python/ops/mixture.py
index f2d492f5489a197157558ae727416b51db04793e..cef6a143fc615901315a3780bf4ed53b8c7cd177 100644
--- a/tensorflow/contrib/distributions/python/ops/mixture.py
+++ b/tensorflow/contrib/distributions/python/ops/mixture.py
@@ -71,6 +71,7 @@ class Mixture(distribution.Distribution):
                components,
                validate_args=False,
                allow_nan_stats=True,
+               use_static_graph=False,
                name="Mixture"):
     """Initialize a Mixture distribution.
 
@@ -96,6 +97,11 @@ class Mixture(distribution.Distribution):
        exception if a statistic (e.g. mean/mode/etc...) is undefined for any
         batch member. If `True`, batch members with valid parameters leading to
         undefined statistics will return NaN for this statistic.
+      use_static_graph: Calls to `sample` will not rely on dynamic tensor
+        indexing, allowing for some static graph compilation optimizations, but
+        at the expense of sampling all underlying distributions in the mixture.
+        (Possibly useful when running on TPUs).
+        Default value: `False` (i.e., use dynamic indexing).
       name: A name for this distribution (optional).
 
     Raises:
@@ -178,6 +184,10 @@ class Mixture(distribution.Distribution):
       self._static_event_shape = static_event_shape
       self._static_batch_shape = static_batch_shape
 
+      self._use_static_graph = use_static_graph
+      if use_static_graph and static_num_components is None:
+        raise ValueError("Number of categories must be known statically when "
+                         "`static_sample=True`.")
     # We let the Mixture distribution access _graph_parents since its arguably
     # more like a baseclass.
     graph_parents = self._cat._graph_parents  # pylint: disable=protected-access
@@ -292,6 +302,31 @@ class Mixture(distribution.Distribution):
       return mixture_log_cdf
 
   def _sample_n(self, n, seed=None):
+    if self._use_static_graph:
+      # This sampling approach is almost the same as the approach used by
+      # `MixtureSameFamily`. The differences are due to having a list of
+      # `Distribution` objects rather than a single object, and maintaining
+      # random seed management that is consistent with the non-static code path.
+      samples = []
+      cat_samples = self.cat.sample(n, seed=seed)
+      for c in range(self.num_components):
+        seed = distribution_util.gen_new_seed(seed, "mixture")
+        samples.append(self.components[c].sample(n, seed=seed))
+      x = array_ops.stack(
+          samples, -self._static_event_shape.ndims - 1)     # [n, B, k, E]
+      npdt = x.dtype.as_numpy_dtype
+      mask = array_ops.one_hot(
+          indices=cat_samples,                              # [n, B]
+          depth=self._num_components,                       # == k
+          on_value=np.ones([], dtype=npdt),
+          off_value=np.zeros([], dtype=npdt))               # [n, B, k]
+      mask = distribution_utils.pad_mixture_dimensions(
+          mask, self, self._cat,
+          self._static_event_shape.ndims)                   # [n, B, k, [1]*e]
+      return math_ops.reduce_sum(
+          x * mask,
+          axis=-1 - self._static_event_shape.ndims)         # [n, B, E]
+
     with ops.control_dependencies(self._assertions):
       n = ops.convert_to_tensor(n, name="n")
       static_n = tensor_util.constant_value(n)
diff --git a/tensorflow/contrib/distributions/python/ops/mixture_same_family.py b/tensorflow/contrib/distributions/python/ops/mixture_same_family.py
index 49afbea7f05136674aa0c1441bd46548b7b55c8f..b93bdc5ab4010663baddda1410b302644853648b 100644
--- a/tensorflow/contrib/distributions/python/ops/mixture_same_family.py
+++ b/tensorflow/contrib/distributions/python/ops/mixture_same_family.py
@@ -20,7 +20,7 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.python.framework import dtypes
+from tensorflow.contrib.distributions.python.ops import distribution_util as distribution_utils
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
@@ -239,7 +239,9 @@ class MixtureSameFamily(distribution.Distribution):
           depth=self._num_components,                        # == k
           on_value=np.ones([], dtype=npdt),
           off_value=np.zeros([], dtype=npdt))                # [n, B, k]
-      mask = self._pad_mix_dims(mask)                        # [n, B, k, [1]*e]
+      mask = distribution_utils.pad_mixture_dimensions(
+          mask, self, self.mixture_distribution,
+          self._event_shape().ndims)                         # [n, B, k, [1]*e]
       return math_ops.reduce_sum(
           x * mask, axis=-1 - self._event_ndims)             # [n, B, E]
 
@@ -254,8 +256,9 @@ class MixtureSameFamily(distribution.Distribution):
 
   def _mean(self):
     with ops.control_dependencies(self._runtime_assertions):
-      probs = self._pad_mix_dims(
-          self.mixture_distribution.probs)                   # [B, k, [1]*e]
+      probs = distribution_utils.pad_mixture_dimensions(
+          self.mixture_distribution.probs, self, self.mixture_distribution,
+          self._event_shape().ndims)                         # [B, k, [1]*e]
       return math_ops.reduce_sum(
           probs * self.components_distribution.mean(),
           axis=-1 - self._event_ndims)                       # [B, E]
@@ -271,8 +274,9 @@ class MixtureSameFamily(distribution.Distribution):
   def _variance(self):
     with ops.control_dependencies(self._runtime_assertions):
       # Law of total variance: Var(Y) = E[Var(Y|X)] + Var(E[Y|X])
-      probs = self._pad_mix_dims(
-          self.mixture_distribution.probs)                   # [B, k, [1]*e]
+      probs = distribution_utils.pad_mixture_dimensions(
+          self.mixture_distribution.probs, self, self.mixture_distribution,
+          self._event_shape().ndims)                         # [B, k, [1]*e]
       mean_cond_var = math_ops.reduce_sum(
           probs * self.components_distribution.variance(),
           axis=-1 - self._event_ndims)                       # [B, E]
@@ -291,8 +295,12 @@ class MixtureSameFamily(distribution.Distribution):
 
     with ops.control_dependencies(self._runtime_assertions):
       # Law of total variance: Var(Y) = E[Var(Y|X)] + Var(E[Y|X])
-      probs = self._pad_mix_dims(self._pad_mix_dims(
-          self.mixture_distribution.probs))                  # [B, k, 1, 1]
+      probs = distribution_utils.pad_mixture_dimensions(
+          distribution_utils.pad_mixture_dimensions(
+              self.mixture_distribution.probs, self, self.mixture_distribution,
+              self._event_shape().ndims),
+          self, self.mixture_distribution,
+          self._event_shape().ndims)                         # [B, k, 1, 1]
       mean_cond_var = math_ops.reduce_sum(
           probs * self.components_distribution.covariance(),
           axis=-3)                                           # [B, e, e]
@@ -312,27 +320,6 @@ class MixtureSameFamily(distribution.Distribution):
           shape[:d], [1], shape[d:]], axis=0))
       return x
 
-  def _pad_mix_dims(self, x):
-    with ops.name_scope("pad_mix_dims", values=[x]):
-      def _get_ndims(d):
-        if d.batch_shape.ndims is not None:
-          return d.batch_shape.ndims
-        return array_ops.shape(d.batch_shape_tensor())[0]
-      dist_batch_ndims = _get_ndims(self)
-      cat_batch_ndims = _get_ndims(self.mixture_distribution)
-      pad_ndims = array_ops.where(
-          self.mixture_distribution.is_scalar_batch(),
-          dist_batch_ndims,
-          dist_batch_ndims - cat_batch_ndims)
-      s = array_ops.shape(x)
-      x = array_ops.reshape(x, shape=array_ops.concat([
-          s[:-1],
-          array_ops.ones([pad_ndims], dtype=dtypes.int32),
-          s[-1:],
-          array_ops.ones([self._event_ndims], dtype=dtypes.int32),
-      ], axis=0))
-      return x
-
 
 def _outer_squared_difference(x, y):
   """Convenience function analogous to tf.squared_difference."""
diff --git a/tensorflow/contrib/eager/README.md b/tensorflow/contrib/eager/README.md
index 09242ee47ddd044dfc99e22d5b7751a989c86485..9d2ca07c3a25fa7acb9b0f5806b763d9a57b51fa 100644
--- a/tensorflow/contrib/eager/README.md
+++ b/tensorflow/contrib/eager/README.md
@@ -41,28 +41,8 @@ support for distributed and multi-GPU training and CPU performance.
 
 ## Installation
 
-Since eager execution is not yet part of a TensorFlow release, using it requires
-either [building from source](https://www.tensorflow.org/install/install_sources)
-or the latest nightly builds. The nightly builds are available as:
-
-- [`pip` packages](https://github.com/tensorflow/tensorflow/blob/master/README.md#installation) and
-
-- [docker](https://hub.docker.com/r/tensorflow/tensorflow/) images.
-
-For example, to run the latest nightly docker image:
-
-```sh
-# If you have a GPU, use https://github.com/NVIDIA/nvidia-docker
-nvidia-docker pull tensorflow/tensorflow:nightly-gpu
-nvidia-docker run -it -p 8888:8888 tensorflow/tensorflow:nightly-gpu
-
-# If you do not have a GPU, use the CPU-only image
-docker pull tensorflow/tensorflow:nightly
-docker run -it -p 8888:8888 tensorflow/tensorflow:nightly
-```
-
-And then visit http://localhost:8888 in your browser for a Jupyter notebook
-environment. Try out the notebooks below.
+Eager execution is included in TensorFlow versions 1.5 and above.
+Installation instructions at https://www.tensorflow.org/install/
 
 ## Documentation
 
diff --git a/tensorflow/contrib/eager/python/datasets.py b/tensorflow/contrib/eager/python/datasets.py
index 544a3eafc08f892f6e3315f0656c97b9877cfa0e..d177bfeab2d1fdc05d7ced54df8723fae2c77fdb 100644
--- a/tensorflow/contrib/eager/python/datasets.py
+++ b/tensorflow/contrib/eager/python/datasets.py
@@ -112,7 +112,7 @@ class Iterator(object):
         remote_fn.add_to_graph(None)
         target = constant_op.constant("/device:CPU:0")
       with ops.device(self._device):
-        self._buffer_resource_handle = prefetching_ops.function_buffering_resource(
+        self._buffer_resource_handle = prefetching_ops.function_buffering_resource(  # pylint: disable=line-too-long
             string_arg=iter_string_handle,
             f=remote_fn,
             target_device=target,
@@ -120,8 +120,9 @@ class Iterator(object):
             thread_pool_size=1,
             container="",
             shared_name=_generate_shared_name("function_buffer_resource"))
-        self._buffer_resource_deleter = resource_variable_ops.EagerResourceDeleter(
-            handle=self._buffer_resource_handle, handle_device=self._device)
+        self._buffer_resource_deleter = resource_variable_ops.EagerResourceDeleter(  # pylint: disable=line-too-long
+            handle=self._buffer_resource_handle,
+            handle_device=self._device)
 
   def __iter__(self):
     return self
diff --git a/tensorflow/contrib/eager/python/examples/mnist/mnist.py b/tensorflow/contrib/eager/python/examples/mnist/mnist.py
index 2a7be95811f6fff06e2c489890703561ed879c42..772f59562ba27cce510c82681f491d005298f44c 100644
--- a/tensorflow/contrib/eager/python/examples/mnist/mnist.py
+++ b/tensorflow/contrib/eager/python/examples/mnist/mnist.py
@@ -39,7 +39,7 @@ class MNISTModel(tfe.Network):
   """MNIST Network.
 
   Network structure is equivalent to:
-  https://github.com/tensorflow/tensorflow/blob/r1.5/tensorflow/examples/tutorials/mnist/mnist_deep.py
+  https://github.com/tensorflow/tensorflow/blob/r1.6/tensorflow/examples/tutorials/mnist/mnist_deep.py
   and
   https://github.com/tensorflow/models/blob/master/tutorials/image/mnist/convolutional.py
 
@@ -95,8 +95,7 @@ class MNISTModel(tfe.Network):
     x = self.max_pool2d(x)
     x = tf.layers.flatten(x)
     x = self.fc1(x)
-    if training:
-      x = self.dropout(x)
+    x = self.dropout(x, training=training)
     x = self.fc2(x)
     return x
 
diff --git a/tensorflow/contrib/eager/python/examples/resnet50/resnet50_test.py b/tensorflow/contrib/eager/python/examples/resnet50/resnet50_test.py
index 76e06269b6bbeb3386a6346244d294b1c5167b6e..1f7beee68525e9cf338847caa0bb2dcc8bd60f62 100644
--- a/tensorflow/contrib/eager/python/examples/resnet50/resnet50_test.py
+++ b/tensorflow/contrib/eager/python/examples/resnet50/resnet50_test.py
@@ -22,6 +22,7 @@ import gc
 import tempfile
 import time
 
+from six.moves import xrange
 import tensorflow as tf
 
 import tensorflow.contrib.eager as tfe
diff --git a/tensorflow/contrib/eager/python/examples/spinn/spinn_test.py b/tensorflow/contrib/eager/python/examples/spinn/spinn_test.py
index 84e25cf81a2223800c47994b26d000caddee6b01..19b0104c807cb43b16c3cd47dbacee6e890021db 100644
--- a/tensorflow/contrib/eager/python/examples/spinn/spinn_test.py
+++ b/tensorflow/contrib/eager/python/examples/spinn/spinn_test.py
@@ -26,6 +26,7 @@ import tempfile
 import time
 
 import numpy as np
+from six.moves import xrange
 import tensorflow as tf
 
 # pylint: disable=g-bad-import-order
diff --git a/tensorflow/contrib/eager/python/g3doc/guide.md b/tensorflow/contrib/eager/python/g3doc/guide.md
index 7eea93ce1f5aefe82d73b49f57b636692818ba16..ffc1d0332eae605ce0444a225e53baa68954cae0 100644
--- a/tensorflow/contrib/eager/python/g3doc/guide.md
+++ b/tensorflow/contrib/eager/python/g3doc/guide.md
@@ -19,29 +19,34 @@ to models defined without using eager execution.
 
 ## Installation
 
-Eager execution is **not** included in the latest release (version 1.4) of
-TensorFlow. To use it, you will need to [build TensorFlow from
-source](https://www.tensorflow.org/install/install_sources) or install the
-nightly builds.
+Eager execution is included in TensorFlow versions 1.5 and above.
+Installation instructions at https://www.tensorflow.org/install/
 
-For example, the nightly builds can be installed using `pip`:
+The contents of this guide are compatible with TensorFlow 1.5.
+However, if you run into bugs that are fixed in source but not the
+release, you may want to either either [building from
+source](https://www.tensorflow.org/install/install_sources)
+or the try latest nightly builds. The nightly builds are available as:
 
--   `pip install tf-nightly` (for CPU-only TensorFlow)
--   `pip install tf-nightly-gpu` (for GPU-enabled TensorFlow)
+- [`pip` packages](https://github.com/tensorflow/tensorflow/blob/master/README.md#installation) and
 
-Or using `docker`, with [Jupyter Notebook](http://jupyter.org/) support:
+- [docker](https://hub.docker.com/r/tensorflow/tensorflow/) images.
+
+For example, to run the latest nightly docker image:
 
 ```sh
-# For CPU-only TensorFlow
+# If you have a GPU, use https://github.com/NVIDIA/nvidia-docker
+docker pull tensorflow/tensorflow:nightly-gpu
+docker run --runtime=nvidia -it -p 8888:8888 tensorflow/tensorflow:nightly-gpu
+
+# If you do not have a GPU, use the CPU-only image
 docker pull tensorflow/tensorflow:nightly
 docker run -it -p 8888:8888 tensorflow/tensorflow:nightly
-
-# For GPU-enabled TensorFlow:
-# (Requires https://github.com/NVIDIA/nvidia-docker)
-nvidia-docker pull tensorflow/tensorflow:nightly-gpu
-nvidia-docker run -it -p 8888:8888 tensorflow/tensorflow:nightly-gpu
 ```
 
+And then visit http://localhost:8888 in your browser for a Jupyter notebook
+environment.
+
 ## Getting Started
 
 With TensorFlow installed, eager execution is enabled via a single call:
diff --git a/tensorflow/contrib/eager/python/metrics_impl.py b/tensorflow/contrib/eager/python/metrics_impl.py
index bf029ca5f9dddb152274da6a1cc96bea7981d8fd..ea8dbf2b46ea4bd0e33645ae3c590c4dd13f7a52 100644
--- a/tensorflow/contrib/eager/python/metrics_impl.py
+++ b/tensorflow/contrib/eager/python/metrics_impl.py
@@ -291,6 +291,9 @@ class Mean(Metric):
     Args:
       values: Tensor with the per-example value.
       weights: Optional weighting of each example. Defaults to 1.
+
+    Returns:
+      The arguments, for easy chaining.
     """
     if weights is None:
       self.denom.assign_add(
@@ -302,6 +305,9 @@ class Mean(Metric):
       self.denom.assign_add(math_ops.reduce_sum(weights))
       values = math_ops.cast(values, self.dtype) * weights
       self.numer.assign_add(math_ops.reduce_sum(values))
+    if weights is None:
+      return values
+    return values, weights
 
   def result(self):
     t = self.numer / self.denom
@@ -329,7 +335,13 @@ class Accuracy(Mean):
         per element of the Tensor.
       predictions: Tensor with the predicted label for each example.
       weights: Optional weighting of each example. Defaults to 1.
+
+    Returns:
+      The arguments, for easy chaining.
     """
     matches = math_ops.equal(labels, predictions)
     matches = math_ops.cast(matches, dtypes.float64)
     super(Accuracy, self).call(matches, weights=weights)
+    if weights is None:
+      return labels, predictions
+    return labels, predictions, weights
diff --git a/tensorflow/contrib/eager/python/metrics_test.py b/tensorflow/contrib/eager/python/metrics_test.py
index 9cf34fd9b2dcf1b123cacc6863af817419eda007..a9ecaa3f8bced3043ea0eb0ac3aa8bfa65e9e1ff 100644
--- a/tensorflow/contrib/eager/python/metrics_test.py
+++ b/tensorflow/contrib/eager/python/metrics_test.py
@@ -180,6 +180,19 @@ class MetricsTest(test.TestCase):
         m2 = metrics.Mean()
         m2(2)
 
+  def testMetricsChain(self):
+    with context.graph_mode(), self.test_session():
+      m1 = metrics.Mean()
+      m2 = metrics.Mean(name="m2")
+      update_m2 = m2(3.0)
+      update_m2_2 = m2(m1(1.0))
+      m1.init_variables().run()
+      m2.init_variables().run()
+      update_m2.eval()
+      update_m2_2.eval()
+      self.assertAllEqual(m2.result().eval(), 2.0)
+      self.assertAllEqual(m1.result().eval(), 1.0)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/eager/python/tfe_test.py b/tensorflow/contrib/eager/python/tfe_test.py
index 0dedb2fd7c0905801cd87c239ff2ee09eecb6080..b6659c2a1797feab261d756e78b45231dbea5a02 100644
--- a/tensorflow/contrib/eager/python/tfe_test.py
+++ b/tensorflow/contrib/eager/python/tfe_test.py
@@ -102,10 +102,6 @@ class TFETest(test_util.TensorFlowTestCase):
     # Expect at least one device.
     self.assertTrue(tfe.list_devices())
 
-  def testNumGPUs(self):
-    devices = tfe.list_devices()
-    self.assertEqual(len(devices) - 1, tfe.num_gpus())
-
   def testAddCheckNumericsOpsRaisesError(self):
     with self.assertRaisesRegexp(
         RuntimeError,
diff --git a/tensorflow/contrib/estimator/BUILD b/tensorflow/contrib/estimator/BUILD
index cdbe05e4d2d7117c5acb12d679f359a9db17c9cc..6cdbed5b896577f5622b1bd0123c289c798bc0a5 100644
--- a/tensorflow/contrib/estimator/BUILD
+++ b/tensorflow/contrib/estimator/BUILD
@@ -163,7 +163,7 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         "//tensorflow/python:array_ops",
-        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:check_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:lookup_ops",
@@ -177,7 +177,6 @@ py_library(
         "//tensorflow/python/estimator:metric_keys",
         "//tensorflow/python/estimator:model_fn",
         "//tensorflow/python/estimator:prediction_keys",
-        "//tensorflow/python/estimator:util",
         "//tensorflow/python/ops/losses",
         "//tensorflow/python/saved_model:signature_constants",
     ],
diff --git a/tensorflow/contrib/estimator/python/estimator/head.py b/tensorflow/contrib/estimator/python/estimator/head.py
index fd0994490aac7b9a0ed628e0c3c624d0fefb1b81..238cf287b768eee28b20202084eb244c085c8b75 100644
--- a/tensorflow/contrib/estimator/python/estimator/head.py
+++ b/tensorflow/contrib/estimator/python/estimator/head.py
@@ -19,7 +19,6 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.estimator import model_fn
-from tensorflow.python.estimator import util
 from tensorflow.python.estimator.canned import head as head_lib
 from tensorflow.python.estimator.canned import metric_keys
 from tensorflow.python.estimator.canned import prediction_keys
@@ -29,7 +28,6 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import lookup_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import metrics as metrics_lib
@@ -45,6 +43,7 @@ def multi_class_head(n_classes,
                      weight_column=None,
                      label_vocabulary=None,
                      loss_reduction=losses.Reduction.SUM,
+                     loss_fn=None,
                      name=None):
   """Creates a `_Head` for multi class classification.
 
@@ -65,6 +64,12 @@ def multi_class_head(n_classes,
   labels have shape `[batch_size, 1]`, the loss is the weighted sum over
   `batch_size`.
 
+  Also supports custom `loss_fn`. `loss_fn` takes `(labels, logits)` or
+  `(labels, logits, features)` as arguments and returns unreduced loss with
+  shape `[D0, D1, ... DN, 1]`. `loss_fn` must support integer `labels` with
+  shape `[D0, D1, ... DN, 1]`. Namely, the head applies `label_vocabulary` to
+  the input labels before passing them to `loss_fn`.
+
   Args:
     n_classes: Number of classes, must be greater than 2 (for 2 classes, use
       `binary_classification_head`).
@@ -79,6 +84,7 @@ def multi_class_head(n_classes,
       `label_vocabulary` is not provided but labels are strings.
     loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how to
       reduce training loss over batch. Defaults to `SUM`.
+    loss_fn: Optional loss function.
     name: name of the head. If provided, summary and metrics keys will be
       suffixed by `"/" + name`. Also used as `name_scope` when creating ops.
 
@@ -94,12 +100,17 @@ def multi_class_head(n_classes,
       weight_column=weight_column,
       label_vocabulary=label_vocabulary,
       loss_reduction=loss_reduction,
+      loss_fn=loss_fn,
       name=name)
 
 
 def binary_classification_head(
-    weight_column=None, thresholds=None, label_vocabulary=None,
-    loss_reduction=losses.Reduction.SUM, name=None):
+    weight_column=None,
+    thresholds=None,
+    label_vocabulary=None,
+    loss_reduction=losses.Reduction.SUM,
+    loss_fn=None,
+    name=None):
   """Creates a `_Head` for single label binary classification.
 
   This head uses `sigmoid_cross_entropy_with_logits` loss.
@@ -119,6 +130,12 @@ def binary_classification_head(
   labels have shape `[batch_size, 1]`, the loss is the weighted sum over
   `batch_size`.
 
+  Also supports custom `loss_fn`. `loss_fn` takes `(labels, logits)` or
+  `(labels, logits, features)` as arguments and returns unreduced loss with
+  shape `[D0, D1, ... DN, 1]`. `loss_fn` must support float `labels` with
+  shape `[D0, D1, ... DN, 1]`. Namely, the head applies `label_vocabulary` to
+  the input labels before passing them to `loss_fn`.
+
   Args:
     weight_column: A string or a `_NumericColumn` created by
       `tf.feature_column.numeric_column` defining feature column representing
@@ -136,6 +153,7 @@ def binary_classification_head(
       is not provided but labels are strings.
     loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how to
       reduce training loss over batch. Defaults to `SUM`.
+    loss_fn: Optional loss function.
     name: name of the head. If provided, summary and metrics keys will be
       suffixed by `"/" + name`. Also used as `name_scope` when creating ops.
 
@@ -151,12 +169,14 @@ def binary_classification_head(
       thresholds=thresholds,
       label_vocabulary=label_vocabulary,
       loss_reduction=loss_reduction,
+      loss_fn=loss_fn,
       name=name)
 
 
 def regression_head(weight_column=None,
                     label_dimension=1,
                     loss_reduction=losses.Reduction.SUM,
+                    loss_fn=None,
                     name=None):
   """Creates a `_Head` for regression using the `mean_squared_error` loss.
 
@@ -175,6 +195,10 @@ def regression_head(weight_column=None,
   `[D0, D1, ... DN]`, `[D0, D1, ... DN, 1]` or
   `[D0, D1, ... DN, label_dimension]`.
 
+  Also supports custom `loss_fn`. `loss_fn` takes `(labels, logits)` or
+  `(labels, logits, features)` as arguments and returns unreduced loss with
+  shape `[D0, D1, ... DN, label_dimension]`.
+
   Args:
     weight_column: A string or a `_NumericColumn` created by
       `tf.feature_column.numeric_column` defining feature column representing
@@ -185,6 +209,7 @@ def regression_head(weight_column=None,
       `[batch_size, label_dimension]`).
     loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how to
       reduce training loss over batch. Defaults to `SUM`.
+    loss_fn: Optional loss function.
     name: name of the head. If provided, summary and metrics keys will be
       suffixed by `"/" + name`. Also used as `name_scope` when creating ops.
 
@@ -198,6 +223,7 @@ def regression_head(weight_column=None,
       weight_column=weight_column,
       label_dimension=label_dimension,
       loss_reduction=loss_reduction,
+      loss_fn=loss_fn,
       name=name)
 
 
@@ -287,7 +313,7 @@ def multi_label_head(n_classes,
           'Length of label_vocabulary must be n_classes ({}). '
           'Given: {}'.format(n_classes, len(label_vocabulary)))
   if loss_fn:
-    _validate_loss_fn_args(loss_fn)
+    head_lib._validate_loss_fn_args(loss_fn)  # pylint:disable=protected-access
   if (loss_reduction not in losses.Reduction.all() or
       loss_reduction == losses.Reduction.NONE):
     raise ValueError('Invalid loss_reduction: {}'.format(loss_reduction))
@@ -371,9 +397,9 @@ class _MultiLabelHead(head_lib._Head):  # pylint:disable=protected-access
         labels=processed_labels, logits=logits,
         expected_labels_dimension=self.logits_dimension)
     if self._loss_fn:
-      unweighted_loss = _call_loss_fn(
+      unweighted_loss = head_lib._call_loss_fn(  # pylint:disable=protected-access
           loss_fn=self._loss_fn, labels=processed_labels, logits=logits,
-          features=features)
+          features=features, expected_loss_dim=1)
     else:
       unweighted_loss = losses.sigmoid_cross_entropy(
           multi_class_labels=processed_labels, logits=logits,
@@ -555,52 +581,3 @@ class _MultiLabelHead(head_lib._Head):  # pylint:disable=protected-access
                 threshold=threshold,
                 name=recall_key))
     return metric_ops
-
-
-def _validate_loss_fn_args(loss_fn):
-  """Validates loss_fn arguments.
-
-  Required arguments: labels, logits.
-  Optional arguments: features.
-
-  Args:
-    loss_fn: The loss function.
-  Raises:
-    ValueError: If the signature is unexpected.
-  """
-  loss_fn_args = util.fn_args(loss_fn)
-  for required_arg in ['labels', 'logits']:
-    if required_arg not in loss_fn_args:
-      raise ValueError(
-          'loss_fn must contain argument: {}. '
-          'Given arguments: {}'.format(required_arg, loss_fn_args))
-  invalid_args = list(set(loss_fn_args) - set(['labels', 'logits', 'features']))
-  if invalid_args:
-    raise ValueError('loss_fn has unexpected args: {}'.format(invalid_args))
-
-
-def _call_loss_fn(loss_fn, labels, logits, features):
-  """Calls loss_fn and checks the returned shape.
-
-  Args:
-    loss_fn: The loss function.
-    labels: Processed labels Tensor.
-    logits: Logits Tensor of shape [batch_size, logits_dimension].
-    features: Features dict.
-  Returns:
-    Loss Tensor with shape [batch_size, 1].
-  """
-  loss_fn_args = util.fn_args(loss_fn)
-  kwargs = {}
-  if 'features' in loss_fn_args:
-    kwargs['features'] = features
-  unweighted_loss = loss_fn(labels=labels, logits=logits, **kwargs)
-  batch_size = array_ops.shape(logits)[0]
-  loss_shape = array_ops.shape(unweighted_loss)
-  check_shape_op = control_flow_ops.Assert(
-      math_ops.reduce_all(math_ops.equal(loss_shape, [batch_size, 1])),
-      data=[
-          'loss_fn must return Tensor of shape [batch_size, 1]. Given: ',
-          loss_shape])
-  with ops.control_dependencies([check_shape_op]):
-    return array_ops.identity(unweighted_loss)
diff --git a/tensorflow/contrib/estimator/python/estimator/head_test.py b/tensorflow/contrib/estimator/python/estimator/head_test.py
index 1adbd6f0fe32df4a513a2683d03fcefca07e2a42..43cdfec9689879201305385499b3b784e1593d60 100644
--- a/tensorflow/contrib/estimator/python/estimator/head_test.py
+++ b/tensorflow/contrib/estimator/python/estimator/head_test.py
@@ -381,8 +381,8 @@ class MultiLabelHead(test.TestCase):
       _initialize_variables(self, monitored_session.Scaffold())
       with self.assertRaisesRegexp(
           errors.InvalidArgumentError,
-          r'loss_fn must return Tensor of shape \[batch_size, 1\]\. '
-          r'Given: \] \[2\]'):
+          r'\[loss_fn must return Tensor of shape \[D0, D1, ... DN, 1\]\. \] '
+          r'\[logits_shape: \] \[2 2\] \[loss_shape: \] \[2\]'):
         actual_training_loss.eval()
 
   def test_eval_labels_none(self):
diff --git a/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py b/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py
index caa9dd83233b6b850385335fde96431271d85c3a..c9153c9352779631902fd17290fb0d94a5767cf0 100644
--- a/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py
+++ b/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py
@@ -457,6 +457,13 @@ def _get_local_devices(device_type):
 def _split_batch(features, labels, number_of_shards, device):
   """Split input features and labes into batches."""
 
+  def ensure_divisible_by_shards(sequence):
+    batch_size = ops_lib.convert_to_tensor(sequence).get_shape()[0]
+    if batch_size % number_of_shards != 0:
+      raise ValueError(
+          'Batch size {} needs to be divisible by the number of GPUs, which '
+          'is {}.'.format(batch_size, number_of_shards))
+
   def split_dictionary(dictionary):
     """Split a dictionary into shards."""
     shards = [{} for _ in range(number_of_shards)]
@@ -467,6 +474,7 @@ def _split_batch(features, labels, number_of_shards, device):
                 sp_input=tensor, num_split=number_of_shards, axis=0)):
           shards[i][name] = shard
       else:
+        ensure_divisible_by_shards(tensor)
         for i, shard in enumerate(array_ops.split(tensor, number_of_shards)):
           shards[i][name] = shard
     return shards
@@ -476,6 +484,7 @@ def _split_batch(features, labels, number_of_shards, device):
       if isinstance(features, dict):
         feature_shards = split_dictionary(features)
       else:
+        ensure_divisible_by_shards(features)
         feature_shards = array_ops.split(features, number_of_shards)
 
       if labels is None:
@@ -483,6 +492,7 @@ def _split_batch(features, labels, number_of_shards, device):
       elif isinstance(labels, dict):
         label_shards = split_dictionary(labels)
       else:
+        ensure_divisible_by_shards(labels)
         label_shards = array_ops.split(labels, number_of_shards)
   return feature_shards, label_shards
 
diff --git a/tensorflow/contrib/estimator/python/estimator/replicate_model_fn_test.py b/tensorflow/contrib/estimator/python/estimator/replicate_model_fn_test.py
index 03d31226af613960a19ce116b19b30153b1fdcee..6936f8a1312bc36ef9ad7a9a881e01733ec52a6e 100644
--- a/tensorflow/contrib/estimator/python/estimator/replicate_model_fn_test.py
+++ b/tensorflow/contrib/estimator/python/estimator/replicate_model_fn_test.py
@@ -37,6 +37,7 @@ from tensorflow.python.feature_column import feature_column
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops as ops_lib
+from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
@@ -433,6 +434,17 @@ class ReplicateModelTest(test_util.TensorFlowTestCase):
           'probabilities': np.array([[0.1], [0.02]])
       }, session.run(estimator_spec.predictions))
 
+  def test_batch_size_that_is_not_divisible_by_the_number_of_gpus(self):
+    features = np.array([[1.0], [2.0], [3.0]])
+    labels = np.array([[1.0], [2.0], [3.0]])
+
+    with self.assertRaisesRegexp(
+        ValueError, '.*Batch.+size.+needs.+to.+be.+divisible.+by.+GPUs.+'):
+      replicated_model_fn = replicate_model_fn.replicate_model_fn(
+          self.model_fn, devices=['/gpu:0', '/gpu:1'])
+      _ = replicated_model_fn(
+          features, labels, model_fn_lib.ModeKeys.TRAIN, self.params)
+
   def test_unsupported_loss_reduction(self):
     with self.assertRaisesRegexp(ValueError,
                                  '.+none.+reduction.+is.+specified.+'):
@@ -981,8 +993,13 @@ class SplitBatchTest(test_util.TensorFlowTestCase):
     return list(map(evaluate_items, first_list)), list(
         map(evaluate_items, second_list))
 
+  def assertSparseValuesEqual(self, a, b):
+    self.assertAllEqual(a.indices, b.indices)
+    self.assertAllEqual(a.values, b.values)
+    self.assertAllEqual(a.dense_shape, b.dense_shape)
+
   def test_simple_half_split(self):
-    with self.test_session() as session:  # pylint: disable=unused-variable
+    with self.test_session():
       features = [0.0, 1.0, 2.0, 3.0]
       labels = [10.0, 11.0, 12.0, 13.0]
       feature_shards, label_shards = replicate_model_fn._split_batch(
@@ -995,7 +1012,7 @@ class SplitBatchTest(test_util.TensorFlowTestCase):
       self.assertAllEqual([[10.0, 11.0], [12.0, 13.0]], label_shards)
 
   def test_to_each_their_own(self):
-    with self.test_session() as session:  # pylint: disable=unused-variable
+    with self.test_session():
       features = [0.0, 1.0, 2.0, 3.0]
       labels = [10.0, 11.0, 12.0, 13.0]
       feature_shards, label_shards = replicate_model_fn._split_batch(
@@ -1008,7 +1025,7 @@ class SplitBatchTest(test_util.TensorFlowTestCase):
       self.assertAllEqual([[10.0], [11.0], [12.0], [13.0]], label_shards)
 
   def test_one_batch(self):
-    with self.test_session() as session:  # pylint: disable=unused-variable
+    with self.test_session():
       features = [0.0, 1.0, 2.0, 3.0]
       labels = [10.0, 11.0, 12.0, 13.0]
       feature_shards, label_shards = replicate_model_fn._split_batch(
@@ -1021,7 +1038,7 @@ class SplitBatchTest(test_util.TensorFlowTestCase):
       self.assertAllEqual([[10.0, 11.0, 12.0, 13.0]], label_shards)
 
   def test_half_split_in_dictionary(self):
-    with self.test_session() as session:  # pylint: disable=unused-variable
+    with self.test_session():
       features = {'first': [0.0, 1.0, 2.0, 3.0], 'second': [4.0, 5.0, 6.0, 7.0]}
       labels = [10.0, 11.0, 12.0, 13.0]
 
@@ -1035,6 +1052,60 @@ class SplitBatchTest(test_util.TensorFlowTestCase):
       self.assertAllEqual([10.0, 11.0], label_shards[0].eval())
       self.assertAllEqual([12.0, 13.0], label_shards[1].eval())
 
+  def test_sparse_tensor_can_be_split_unevenly(self):
+    with self.test_session():
+      features = {
+          'x':
+              sparse_tensor.SparseTensor(
+                  indices=[[0, 0], [1, 2], [2, 2]],
+                  values=[1.0, 2.0, 3.0],
+                  dense_shape=[3, 4])
+      }
+      labels = np.array([[1.0], [2.0]])
+
+      feature_shards, label_shards = replicate_model_fn._split_batch(
+          features, labels, 2, device='/gpu:0')
+
+      self.assertSparseValuesEqual(
+          sparse_tensor.SparseTensorValue(
+              indices=[[0, 0], [1, 2]], values=[1., 2.], dense_shape=[2, 4]),
+          feature_shards[0]['x'].eval())
+      self.assertSparseValuesEqual(
+          sparse_tensor.SparseTensorValue(
+              indices=[[0, 2]], values=[3.], dense_shape=[1, 4]),
+          feature_shards[1]['x'].eval())
+      self.assertAllEqual([[1.0]], label_shards[0].eval())
+      self.assertAllEqual([[2.0]], label_shards[1].eval())
+
+  def test_sparse_tensor_can_be_split_unevenly_repeated_row(self):
+    with self.test_session():
+      features = {
+          'x':
+              sparse_tensor.SparseTensor(
+                  indices=[[0, 0], [1, 0], [1, 1]],
+                  values=[1.0, 2.0, 3.0],
+                  dense_shape=[3, 4])
+      }
+      labels = np.array([[1.0], [2.0]])
+
+      feature_shards, label_shards = replicate_model_fn._split_batch(
+          features, labels, 2, device='/gpu:0')
+
+      print(feature_shards[0]['x'].eval())
+      print(feature_shards[1]['x'].eval())
+      self.assertSparseValuesEqual(
+          sparse_tensor.SparseTensorValue(
+              indices=[[0, 0], [1, 0], [1, 1]],
+              values=[1., 2., 3.],
+              dense_shape=[2, 4]), feature_shards[0]['x'].eval())
+
+      second_batch = feature_shards[1]['x'].eval()
+      self.assertFalse(len(second_batch.indices))
+      self.assertFalse(len(second_batch.values))
+      self.assertAllEqual([1, 4], second_batch.dense_shape)
+      self.assertAllEqual([[1.0]], label_shards[0].eval())
+      self.assertAllEqual([[2.0]], label_shards[1].eval())
+
   def test_one_batch_in_dictionary(self):
     with self.test_session() as session:  # pylint: disable=unused-variable
       features = {'first': [0.0, 1.0, 2.0, 3.0], 'second': [4.0, 5.0, 6.0, 7.0]}
diff --git a/tensorflow/contrib/factorization/BUILD b/tensorflow/contrib/factorization/BUILD
index fe86a20ab1f69a0eaf9d7486142451dac6337274..180f1b68f3b56113dfbbfc100bd04efc3bb8b31f 100644
--- a/tensorflow/contrib/factorization/BUILD
+++ b/tensorflow/contrib/factorization/BUILD
@@ -221,6 +221,7 @@ py_test(
     name = "kmeans_test",
     size = "medium",
     srcs = ["python/ops/kmeans_test.py"],
+    shard_count = 4,
     srcs_version = "PY2AND3",
     tags = ["notsan"],  # b/67512932
     deps = [
diff --git a/tensorflow/contrib/factorization/kernels/masked_matmul_ops.cc b/tensorflow/contrib/factorization/kernels/masked_matmul_ops.cc
index 31d08bfb65ea49e1378ffba480771d38ce16abec..a8c5d0763c28ba2b54f217405f0da65533f26b91 100644
--- a/tensorflow/contrib/factorization/kernels/masked_matmul_ops.cc
+++ b/tensorflow/contrib/factorization/kernels/masked_matmul_ops.cc
@@ -57,11 +57,11 @@ typedef Eigen::Map<
 
 class MaskedMatmulOp : public OpKernel {
  public:
-  explicit MaskedMatmulOp(OpKernelConstruction* context)
-      : OpKernel(context) {
-    OP_REQUIRES_OK(context, context->MatchSignature(
-        {DT_FLOAT, DT_FLOAT, DT_INT64, DT_BOOL, DT_BOOL},
-        {DT_FLOAT}));
+  explicit MaskedMatmulOp(OpKernelConstruction* context) : OpKernel(context) {
+    OP_REQUIRES_OK(
+        context,
+        context->MatchSignature(
+            {DT_FLOAT, DT_FLOAT, DT_INT64, DT_BOOL, DT_BOOL}, {DT_FLOAT}));
   }
 
   void Compute(OpKernelContext* context) override {
@@ -110,12 +110,11 @@ class MaskedMatmulOp : public OpKernel {
                                       num_nonzero_elements, 2);
 
     Tensor* prod_values_tensor;
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(
-                       0, TensorShape({num_nonzero_elements}),
-                       &prod_values_tensor));
-    EigenMatFloatMap prod_values(prod_values_tensor->vec<float>().data(),
-                                 1, num_nonzero_elements);
+    OP_REQUIRES_OK(context, context->allocate_output(
+                                0, TensorShape({num_nonzero_elements}),
+                                &prod_values_tensor));
+    EigenMatFloatMap prod_values(prod_values_tensor->vec<float>().data(), 1,
+                                 num_nonzero_elements);
 
     auto get_a_index = [&indices_mat, &a_dim_0](int64 i) {
       int64 a_index = internal::SubtleMustCopy(indices_mat(i, 0));
@@ -182,8 +181,8 @@ class MaskedMatmulOp : public OpKernel {
       }
     };
     // Shard the work.
-    worker_threads.workers->ParallelFor(
-        num_nonzero_elements, cost_per_unit, work);
+    worker_threads.workers->ParallelFor(num_nonzero_elements, cost_per_unit,
+                                        work);
   }
 };
 REGISTER_KERNEL_BUILDER(Name("MaskedMatmul").Device(DEVICE_CPU),
diff --git a/tensorflow/contrib/factorization/python/ops/kmeans.py b/tensorflow/contrib/factorization/python/ops/kmeans.py
index 9a5413fc3f2642443621b33d325e3d8c893fd6ac..c861cfff544a78617aa1ace730b50c094cf16330 100644
--- a/tensorflow/contrib/factorization/python/ops/kmeans.py
+++ b/tensorflow/contrib/factorization/python/ops/kmeans.py
@@ -25,6 +25,7 @@ import time
 from tensorflow.contrib.factorization.python.ops import clustering_ops
 from tensorflow.python.estimator import estimator
 from tensorflow.python.estimator import model_fn as model_fn_lib
+from tensorflow.python.estimator.export import export_output
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
@@ -32,6 +33,7 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import metrics
 from tensorflow.python.ops import state_ops
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.saved_model import signature_constants
 from tensorflow.python.summary import summary
 from tensorflow.python.training import session_run_hook
 from tensorflow.python.training import training_util
@@ -141,7 +143,7 @@ class _ModelFn(object):
   def model_fn(self, features, mode, config):
     """Model function for the estimator.
 
-    Note that this does not take a `1abels` arg. This works, but `input_fn` must
+    Note that this does not take a `labels` arg. This works, but `input_fn` must
     return either `features` or, equivalently, `(features, None)`.
 
     Args:
@@ -207,6 +209,15 @@ class _ModelFn(object):
       training_hooks.append(
           _LossRelativeChangeHook(loss, self._relative_tolerance))
 
+    export_outputs = {
+        KMeansClustering.ALL_DISTANCES:
+            export_output.PredictOutput(all_distances[0]),
+        KMeansClustering.CLUSTER_INDEX:
+            export_output.PredictOutput(model_predictions[0]),
+        signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
+            export_output.PredictOutput(model_predictions[0])
+    }
+
     return model_fn_lib.EstimatorSpec(
         mode=mode,
         predictions={
@@ -216,7 +227,8 @@ class _ModelFn(object):
         loss=loss,
         train_op=training_op,
         eval_metric_ops={KMeansClustering.SCORE: metrics.mean(loss)},
-        training_hooks=training_hooks)
+        training_hooks=training_hooks,
+        export_outputs=export_outputs)
 
 
 # TODO(agarwal,ands): support sharded input.
diff --git a/tensorflow/contrib/ffmpeg/decode_video_op.cc b/tensorflow/contrib/ffmpeg/decode_video_op.cc
index d44032968d559bec14722902a4d47d22c46ea4aa..6f8ad486d10a825a277749157d68fa671b9f8d3a 100644
--- a/tensorflow/contrib/ffmpeg/decode_video_op.cc
+++ b/tensorflow/contrib/ffmpeg/decode_video_op.cc
@@ -102,16 +102,12 @@ REGISTER_OP("DecodeVideo")
       return Status::OK();
     })
     .Doc(R"doc(
-Processes the contents of an audio file into a tensor using FFmpeg to decode
+Processes the contents of an video file into a tensor using FFmpeg to decode
 the file.
 
-One row of the tensor is created for each channel in the audio file. Each
-channel contains audio samples starting at the beginning of the audio and
-having `1/samples_per_second` time between them. If the `channel_count` is
-different from the contents of the file, channels will be merged or created.
-
-contents: The binary audio file contents, as a string or rank-0 string
-    tensor.
+contents: The binary contents of the video file to decode. This is a
+    scalar.
+output: A rank-4 `Tensor` that has `[frames, height, width, 3]` RGB as output.
 )doc");
 
 }  // namespace ffmpeg
diff --git a/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc b/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc
index c85b1837ab5b0c1a3cea0525918f7717228d2fab..e61221a6b0d34373279a379f356c99c379488182 100644
--- a/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc
+++ b/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc
@@ -47,20 +47,19 @@ std::vector<string> FfmpegAudioCommandLine(const string& input_filename,
                                            int32 channel_count,
                                            const string& stream) {
   std::vector<string> command({
-          "-nostats",             // No additional progress display.
-          "-nostdin",             // No interactive commands accepted.
-          "-f", input_format_id,  // eg: "mp3"
-          "-probesize", StrCat(kDefaultProbeSize), "-i", input_filename,
-          "-loglevel", "error",   // Print errors only.
-          "-hide_banner",         // Skip printing build options, version, etc.
-          "-map_metadata", "-1",  // Copy global metadata from input to output.
-          "-vn",                  // No video recording.
-          "-ac:a:0", StrCat(channel_count), "-ar:a:0",
-          StrCat(samples_per_second),
-          // Output set (in several ways) to signed 16-bit little-endian ints.
-          "-codec:a:0", "pcm_s16le", "-sample_fmt", "s16", "-f", "s16le",
-          "-sn",  // No subtitle recording.
-          "-y"   // Overwrite output file.
+      "-nostats",             // No additional progress display.
+      "-nostdin",             // No interactive commands accepted.
+      "-f", input_format_id,  // eg: "mp3"
+      "-probesize", StrCat(kDefaultProbeSize), "-i", input_filename,
+      "-loglevel", "error",   // Print errors only.
+      "-hide_banner",         // Skip printing build options, version, etc.
+      "-map_metadata", "-1",  // Copy global metadata from input to output.
+      "-vn",                  // No video recording.
+      "-ac:a:0", StrCat(channel_count), "-ar:a:0", StrCat(samples_per_second),
+      // Output set (in several ways) to signed 16-bit little-endian ints.
+      "-codec:a:0", "pcm_s16le", "-sample_fmt", "s16", "-f", "s16le",
+      "-sn",  // No subtitle recording.
+      "-y"    // Overwrite output file.
   });
   if (!stream.empty()) {
     command.emplace_back("-map");
@@ -75,21 +74,13 @@ std::vector<string> FfmpegVideoCommandLine(const string& input_filename,
                                            const string& output_filename) {
   return {"-nostats",  // No additional progress display.
           "-nostdin",  // No interactive commands accepted.
-          "-i",
-          input_filename,
-          "-f",
-          "image2pipe",
-          "-probesize",
-          StrCat(kDefaultProbeSize),
-          "-loglevel",
+          "-i", input_filename, "-f", "image2pipe", "-probesize",
+          StrCat(kDefaultProbeSize), "-loglevel",
           // Info is needed to get the information about stream, etc.
           // It is generated to a separate file, not stdout/stderr.
           "info",
           "-hide_banner",  // Skip printing build options, version, etc.
-          "-vcodec",
-          "rawvideo",
-          "-pix_fmt",
-          "rgb24",
+          "-vcodec", "rawvideo", "-pix_fmt", "rgb24",
           "-y",  // Overwrite output file.
           StrCat(output_filename)};
 }
diff --git a/tensorflow/contrib/ffmpeg/default/ffmpeg_lib_test.cc b/tensorflow/contrib/ffmpeg/default/ffmpeg_lib_test.cc
index 85b61b26163d87a10d4e316720b4f633e038bbec..05728b3d37570d06f2f8af67e3b0612d21d07601 100644
--- a/tensorflow/contrib/ffmpeg/default/ffmpeg_lib_test.cc
+++ b/tensorflow/contrib/ffmpeg/default/ffmpeg_lib_test.cc
@@ -32,10 +32,8 @@ namespace tensorflow {
 namespace ffmpeg {
 namespace {
 
-const char kTestWavFilename[] =
-    "contrib/ffmpeg/testdata/mono_10khz.wav";
-const char kTestMp3Filename[] =
-    "contrib/ffmpeg/testdata/test_sound1.mp3";
+const char kTestWavFilename[] = "contrib/ffmpeg/testdata/mono_10khz.wav";
+const char kTestMp3Filename[] = "contrib/ffmpeg/testdata/test_sound1.mp3";
 
 // Set to true via a command line flag iff the test is expected to have FFmpeg
 // installed.
@@ -139,7 +137,7 @@ TEST(FfmpegLibTest, TestRoundTripWav) {
 }  // namespace ffmpeg
 }  // namespace tensorflow
 
-int main(int argc, char **argv) {
+int main(int argc, char** argv) {
   tensorflow::string usage = tensorflow::ffmpeg::ParseTestFlags(&argc, argv);
   testing::InitGoogleTest(&argc, argv);
   if (argc != 1) {
diff --git a/tensorflow/contrib/ffmpeg/default/ffmpeg_lib_utility_test.cc b/tensorflow/contrib/ffmpeg/default/ffmpeg_lib_utility_test.cc
index 36fc71794b06e0f3cb86c40b325ce50e8999c667..d6c885a32424334bfc28c830e3701f219aa244ee 100644
--- a/tensorflow/contrib/ffmpeg/default/ffmpeg_lib_utility_test.cc
+++ b/tensorflow/contrib/ffmpeg/default/ffmpeg_lib_utility_test.cc
@@ -20,8 +20,6 @@
 #include <string>
 #include <vector>
 
-
-#include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/platform/env.h"
diff --git a/tensorflow/contrib/framework/__init__.py b/tensorflow/contrib/framework/__init__.py
index 673c51784229bd88011f8b33fb851a2885566220..503b868aaa67b26faaad9eb82f6b864d896a44af 100644
--- a/tensorflow/contrib/framework/__init__.py
+++ b/tensorflow/contrib/framework/__init__.py
@@ -53,6 +53,7 @@ See the @{$python/contrib.framework} guide.
 @@assign_from_values_fn
 @@create_global_step
 @@filter_variables
+@@fuse_op
 @@get_global_step
 @@get_or_create_global_step
 @@get_local_variables
diff --git a/tensorflow/contrib/framework/kernels/zero_initializer_op.cc b/tensorflow/contrib/framework/kernels/zero_initializer_op.cc
index 6677dca752f84fc1ba7548b7739df04b7aaf14f7..5bf6b67529579e71a615c27e035111a58d5c02e0 100644
--- a/tensorflow/contrib/framework/kernels/zero_initializer_op.cc
+++ b/tensorflow/contrib/framework/kernels/zero_initializer_op.cc
@@ -21,8 +21,8 @@ limitations under the License.
 
 #include "tensorflow/contrib/framework/kernels/zero_initializer_op.h"
 
-#include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
 
 namespace tensorflow {
 
@@ -81,8 +81,8 @@ TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC);
 #define REGISTER_GPU_KERNELS(T) REGISTER_KERNELS(GPU, T);
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS);
 #undef REGISTER_GPU_KERNELS
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 
 #undef REGISTER_KERNELS
 
-} // namespace tensorflow
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/framework/kernels/zero_initializer_op.h b/tensorflow/contrib/framework/kernels/zero_initializer_op.h
index 14c9268efa869ffd48b01dd2add44990ef7a43f8..99389a5ab6aa73c2ab0e522dd0f9fbc7093c8f4a 100644
--- a/tensorflow/contrib/framework/kernels/zero_initializer_op.h
+++ b/tensorflow/contrib/framework/kernels/zero_initializer_op.h
@@ -29,5 +29,5 @@ struct TensorSetZero {
 };
 }  // namespace functor
 
-} // end namespace tensorflow
-#endif // TENSORFLOW_CONTRIB_FRAMEWORK_KERNELS_ZERO_INITIALIZER_OP_H_
+}  // end namespace tensorflow
+#endif  // TENSORFLOW_CONTRIB_FRAMEWORK_KERNELS_ZERO_INITIALIZER_OP_H_
diff --git a/tensorflow/contrib/framework/ops/variable_ops.cc b/tensorflow/contrib/framework/ops/variable_ops.cc
index 1ee8e1498cf07559fe3db78ef832e2cdf26bea1c..706134ba9a51de6253ba7463b17ff662ea740ed0 100644
--- a/tensorflow/contrib/framework/ops/variable_ops.cc
+++ b/tensorflow/contrib/framework/ops/variable_ops.cc
@@ -26,8 +26,8 @@ REGISTER_OP("ZeroInitializer")
     .Attr("T: realnumbertype")
     .SetAllowsUninitializedInput()
     .SetShapeFn([](InferenceContext* c) {
-        c->set_output(0, c->input(0));
-        return Status::OK();
+      c->set_output(0, c->input(0));
+      return Status::OK();
     })
     .Doc(R"doc(
 Initialize 'ref' with all zeros. This op requires that the tensor is not
diff --git a/tensorflow/contrib/framework/python/framework/tensor_util_test.py b/tensorflow/contrib/framework/python/framework/tensor_util_test.py
index 2effe8eb26e98caa2707315d5f2e0e530ead31d3..8cdb340f2ddd9b3a7f55c1937ef045f4627e99be 100644
--- a/tensorflow/contrib/framework/python/framework/tensor_util_test.py
+++ b/tensorflow/contrib/framework/python/framework/tensor_util_test.py
@@ -30,6 +30,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import variables as variables_lib
 from tensorflow.python.platform import test
@@ -77,6 +78,7 @@ class AssertScalarIntTest(test.TestCase):
               [3, 4], dtype=dtypes.int32))
 
 
+@test_util.with_c_api
 class WithShapeTest(test.TestCase):
 
   def _assert_with_shape(self, tensor, expected_value, expected_shape,
@@ -213,16 +215,25 @@ class WithShapeTest(test.TestCase):
       tensor_partial_shape.set_shape([None, 2])
 
       for incompatible_shape in [[0], [1]]:
+        if ops._USE_C_API:
+          error_message = "Shapes must be equal rank, but are 2 and 1"
+        else:
+          error_message = r"Shapes \(\?, 2\) and \([01],\) are not compatible"
         self.assertRaisesRegexp(
-            ValueError, r"Shapes \(\?, 2\) and \([01],\) are not compatible",
+            ValueError, error_message,
             tensor_util.with_shape, incompatible_shape, tensor_partial_shape)
       for incompatible_shape in [[1, 2, 1]]:
         self.assertRaisesRegexp(ValueError, "Dimensions must be equal",
                                 tensor_util.with_shape, incompatible_shape,
                                 tensor_partial_shape)
       for incompatible_shape in [[2, 1]]:
+        if ops._USE_C_API:
+          error_message = (r"Dimension 1 in both shapes must be equal, but are "
+                           r"2 and 1. Shapes are \[\?,2\] and \[2,1\].")
+        else:
+          error_message = r"Shapes \(\?, 2\) and \(2, 1\) are not compatible"
         self.assertRaisesRegexp(
-            ValueError, r"Shapes \(\?, 2\) and \(2, 1\) are not compatible",
+            ValueError, error_message,
             tensor_util.with_shape, incompatible_shape, tensor_partial_shape)
 
       compatible_shape = [2, 2]
diff --git a/tensorflow/contrib/framework/python/ops/accumulate_n_v2_test.py b/tensorflow/contrib/framework/python/ops/accumulate_n_v2_test.py
index b5e9f8df79262635bf579a6bf2260bc40c140c6f..6f65fe771eb77c10d0914faa90886b587adae68c 100644
--- a/tensorflow/contrib/framework/python/ops/accumulate_n_v2_test.py
+++ b/tensorflow/contrib/framework/python/ops/accumulate_n_v2_test.py
@@ -31,7 +31,6 @@ from tensorflow.python.ops import variables
 from tensorflow.python.platform import googletest
 
 
-
 class AccumulateNV2Test(test_util.TensorFlowTestCase):
   """Tests of the new, differentiable version of accumulate_n"""
 
@@ -62,8 +61,9 @@ class AccumulateNV2Test(test_util.TensorFlowTestCase):
         accum_n = av2.accumulate_n_v2(input_vars)
         sess.run(variables.global_variables_initializer())
         accum_n_grad = gradients.gradients(accum_n, input_vars)
-        self.assertAllEqual(np.repeat(1.0, num_inputs), # d/dx (x + y + ...) = 1
-                            [g.eval() for g in accum_n_grad])
+        self.assertAllEqual(
+            np.repeat(1.0, num_inputs),  # d/dx (x + y + ...) = 1
+            [g.eval() for g in accum_n_grad])
 
   # The tests below used to be in a separate class under cwise_ops_test.py,
   # which did not run in the default test target.
@@ -75,8 +75,8 @@ class AccumulateNV2Test(test_util.TensorFlowTestCase):
           np.random.rand(16, 16, 16, 16).astype(np.float32) for _ in range(20)
       ]
       random_tensors = [
-          ops.convert_to_tensor(
-              x, dtype=dtypes_lib.float32) for x in random_arrays
+          ops.convert_to_tensor(x, dtype=dtypes_lib.float32)
+          for x in random_arrays
       ]
       tf_val = av2.accumulate_n_v2(random_tensors)
       np_val = random_arrays[0]
@@ -95,21 +95,21 @@ class AccumulateNV2Test(test_util.TensorFlowTestCase):
       with self.assertRaises(ValueError):
         a = variables.Variable(0.2)
         b = variables.Variable(0.1)
-        tf_val = av2.accumulate_n_v2([a,b], shape=[2,2]) # Should be shape=[]
+        tf_val = av2.accumulate_n_v2([a, b], shape=[2, 2])  # Should be shape=[]
 
   def testIncompatibleShapes(self):
     with self.test_session():
       with self.assertRaises(ValueError):
-        a = variables.Variable(np.array([0.1,0.2]))
-        b = variables.Variable(np.array([[0.3],[0.4]]))
-        tf_val = av2.accumulate_n_v2([a,b])
+        a = variables.Variable(np.array([0.1, 0.2]))
+        b = variables.Variable(np.array([[0.3], [0.4]]))
+        tf_val = av2.accumulate_n_v2([a, b])
 
   def testWrongType(self):
     with self.test_session():
       with self.assertRaises(TypeError):
         a = variables.Variable(0.2, dtype=np.float32)
         b = variables.Variable(0.1, dtype=np.float32)
-        tf_val = av2.accumulate_n_v2([a,b], tensor_dtype=np.int32)
+        tf_val = av2.accumulate_n_v2([a, b], tensor_dtype=np.int32)
 
   def testWrongTypeOneInput(self):
     # Scenario that used to trigger a bug, even when testWrongType() worked
diff --git a/tensorflow/contrib/framework/python/ops/arg_scope.py b/tensorflow/contrib/framework/python/ops/arg_scope.py
index 2bce00fde2459878a12027bb4d98bd3818bc92a2..409657fe1da0e5540cd2ad6070d86737c039e91f 100644
--- a/tensorflow/contrib/framework/python/ops/arg_scope.py
+++ b/tensorflow/contrib/framework/python/ops/arg_scope.py
@@ -53,7 +53,8 @@
     net = layers.conv2d(net, 256, [5, 5], scope='conv2')
   ```
 
-  Example of how to use tf.contrib.framework.add_arg_scope to enable your function to be called within an arg_scope later:
+  Example of how to use tf.contrib.framework.add_arg_scope to enable your
+  function to be called within an arg_scope later:
 
   @tf.contrib.framework.add_arg_scope
   def conv2d(*args, **kwargs)
@@ -65,11 +66,10 @@ from __future__ import print_function
 from tensorflow.python.util import tf_contextlib
 from tensorflow.python.util import tf_decorator
 
-__all__ = ['arg_scope',
-           'add_arg_scope',
-           'current_arg_scope',
-           'has_arg_scope',
-           'arg_scoped_arguments']
+__all__ = [
+    'arg_scope', 'add_arg_scope', 'current_arg_scope', 'has_arg_scope',
+    'arg_scoped_arguments'
+]
 
 _ARGSTACK = [{}]
 
@@ -172,6 +172,7 @@ def add_arg_scope(func):
   Returns:
     A tuple with the decorated function func_with_args().
   """
+
   def func_with_args(*args, **kwargs):
     current_scope = current_arg_scope()
     current_args = kwargs
@@ -180,6 +181,7 @@ def add_arg_scope(func):
       current_args = current_scope[key_func].copy()
       current_args.update(kwargs)
     return func(*args, **current_args)
+
   _add_op(func)
   setattr(func_with_args, '_key_op', _key_op(func))
   return tf_decorator.make_decorator(func, func_with_args)
diff --git a/tensorflow/contrib/fused_conv/ops/fused_conv2d_bias_activation_op.cc b/tensorflow/contrib/fused_conv/ops/fused_conv2d_bias_activation_op.cc
index 6a56237f67c844a3daa546eb02d64c9e2658f639..bafd1d59418f0ba47ebbdaabbf06f8e5471fc1a1 100644
--- a/tensorflow/contrib/fused_conv/ops/fused_conv2d_bias_activation_op.cc
+++ b/tensorflow/contrib/fused_conv/ops/fused_conv2d_bias_activation_op.cc
@@ -25,13 +25,6 @@ limitations under the License.
 
 namespace tensorflow {
 
-namespace {
-// Return the string containing the list of valid activation modes, that can be
-// used as an Attr() in REGISTER_OP.
-string GetAllActivationModeAttrString() { return "activation_mode: {'Relu'}"; }
-
-}  // namespace
-
 // --------------------------------------------------------------------------
 
 // TODO(pauldonnelly): Add support for double inputs and scales to this Op,
diff --git a/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_benchmark.py b/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_benchmark.py
index a65d4bc50ff796977e8ea7f652b7cbe3fe37f673..96cdd8b1ca4d56d12d38ea961ae73f3a3aa28968 100644
--- a/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_benchmark.py
+++ b/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_benchmark.py
@@ -116,7 +116,7 @@ def build_fused_conv_bias_relu_graph(device, input_shape, filter_shape, strides,
     for _ in range(1, num_iters):
       with ops.control_dependencies([fused_out]):
         # pylint: disable=g-line-too-long
-        fused_out = fused_conv2d_bias_activation_op.fused_conv2d_bias_activation(
+        fused_out = fused_conv2d_bias_activation_op.fused_conv2d_bias_activation(  # pylint: disable=line-too-long
             inp,
             filt,
             bias,
@@ -166,10 +166,10 @@ class FusedConv2DBiasActivationBenchmark(test.Benchmark):
         duration = (time.time() - start_time) / num_iters
 
         print("%s inputshape:%s filtershape:%s strides:%s padding:%s "
-              "%d iters: %.8f sec" %
-              (device, str(input_shape).replace(" ", ""),
-               str(filter_shape).replace(" ", ""),
-               str(strides).replace(" ", ""), padding, num_iters, duration))
+              "%d iters: %.8f sec" % (device, str(input_shape).replace(" ", ""),
+                                      str(filter_shape).replace(" ", ""),
+                                      str(strides).replace(" ", ""), padding,
+                                      num_iters, duration))
     name_template = (
         "conv2d_{device}_input_shape_{inputshape}_filter_shape_{filtershape}_"
         "strides_{strides}_padding_{padding}")
diff --git a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py
index 0d51c282a8977871185fb4200082feb7868cdbae..082c42eba180917e732bb7890129dfa94bf00fec 100644
--- a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py
+++ b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py
@@ -59,7 +59,11 @@ _summary_type_map = {
 class GANEstimator(estimator.Estimator):
   """An estimator for Generative Adversarial Networks (GANs).
 
-  This Estimator is backed by TFGAN.
+  This Estimator is backed by TFGAN. The network functions follow the TFGAN API
+  except for one exception: if either `generator_fn` or `discriminator_fn` have
+  an argument called `mode`, then the tf.Estimator mode is passed in for that
+  argument. This helps with operations like batch normalization, which have
+  different train and evaluation behavior.
 
   Example:
 
@@ -233,9 +237,11 @@ def _gan_model_fn(
 def _make_gan_model(generator_fn, discriminator_fn, real_data,
                     generator_inputs, generator_scope, add_summaries, mode):
   """Make a `GANModel`, and optionally pass in `mode`."""
-  # If `generator_fn` has an argument `mode`, pass mode to it.
+  # If network functions have an argument `mode`, pass mode to it.
   if 'mode' in inspect.getargspec(generator_fn).args:
     generator_fn = functools.partial(generator_fn, mode=mode)
+  if 'mode' in inspect.getargspec(discriminator_fn).args:
+    discriminator_fn = functools.partial(discriminator_fn, mode=mode)
   gan_model = tfgan_train.gan_model(
       generator_fn,
       discriminator_fn,
diff --git a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_test.py b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_test.py
index e752f0bcccda418b79d4fdabb27807394cbbb425..387a62bd741bd42c03dc1bf70592060c29ccd7a8 100644
--- a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_test.py
+++ b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_test.py
@@ -54,7 +54,8 @@ def generator_fn(noise_dict, mode):
   return layers.fully_connected(noise, noise.shape[1].value)
 
 
-def discriminator_fn(data, _):
+def discriminator_fn(data, unused_conditioning, mode):
+  del unused_conditioning, mode
   return layers.fully_connected(data, 1)
 
 
@@ -99,7 +100,6 @@ def mock_head(testcase, expected_generator_inputs, expected_real_data,
     else:
       testcase.assertEqual(discriminator_scope_name,
                            gan_model.discriminator_scope.name)
-    testcase.assertEqual(_or_none(discriminator_fn), gan_model.discriminator_fn)
 
     with ops.control_dependencies(assertions):
       if mode == model_fn_lib.ModeKeys.TRAIN:
diff --git a/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py b/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py
index 986a5ff6dcbeb2ff996f49137adc6d34e14c979f..d9b07e62f89d61c72a34dfa844f11ad1238fb006 100644
--- a/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py
+++ b/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py
@@ -28,6 +28,7 @@ from __future__ import division
 from __future__ import print_function
 
 import functools
+import os
 import sys
 import tarfile
 
@@ -189,20 +190,31 @@ def get_graph_def_from_resource(filename):
   return graph_pb2.GraphDef.FromString(resource_loader.load_resource(filename))
 
 
-def get_graph_def_from_url_tarball(url, filename):
-  """Get a GraphDef proto from a tarball on the web."""
-  def _progress(count, block_size, total_size):
-    sys.stdout.write('\r>> Downloading %s %.1f%%' % (
-        url, float(count * block_size) / float(total_size) * 100.0))
-    sys.stdout.flush()
-  tar_filename, _ = urllib.request.urlretrieve(url, reporthook=_progress)
+def get_graph_def_from_url_tarball(url, filename, tar_filename=None):
+  """Get a GraphDef proto from a tarball on the web.
+
+  Args:
+    url: Web address of tarball
+    filename: Filename of graph definition within tarball
+    tar_filename: Temporary download filename (None = always download)
+
+  Returns:
+    A GraphDef loaded from a file in the downloaded tarball.
+  """
+  if not (tar_filename and os.path.exists(tar_filename)):
+    def _progress(count, block_size, total_size):
+      sys.stdout.write('\r>> Downloading %s %.1f%%' % (
+          url, float(count * block_size) / float(total_size) * 100.0))
+      sys.stdout.flush()
+    tar_filename, _ = urllib.request.urlretrieve(url, tar_filename, _progress)
   with tarfile.open(tar_filename, 'r:gz') as tar:
     proto_str = tar.extractfile(filename).read()
   return graph_pb2.GraphDef.FromString(proto_str)
 
 
 def _default_graph_def_fn():
-  return get_graph_def_from_url_tarball(INCEPTION_URL, INCEPTION_FROZEN_GRAPH)
+  return get_graph_def_from_url_tarball(INCEPTION_URL, INCEPTION_FROZEN_GRAPH,
+                                        os.path.basename(INCEPTION_URL))
 
 
 def run_inception(images,
diff --git a/tensorflow/contrib/gan/python/losses/python/losses_impl_test.py b/tensorflow/contrib/gan/python/losses/python/losses_impl_test.py
index 7d2a7a254f6656198e47325dbb351618d85d147c..56ac45554da3633149a61155a416fa7cb6cff553 100644
--- a/tensorflow/contrib/gan/python/losses/python/losses_impl_test.py
+++ b/tensorflow/contrib/gan/python/losses/python/losses_impl_test.py
@@ -620,7 +620,7 @@ class CombineAdversarialLossTest(test.TestCase):
     with self.test_session(use_gpu=True) as sess:
       for _ in range(10):  # spot check closeness on more than one sample.
         gnorm_np, precond_gnorm_np = sess.run([gnorm, precond_gnorm])
-        self.assertNear(gnorm_np, precond_gnorm_np, 1e-5)
+        self.assertNear(gnorm_np, precond_gnorm_np, 1e-4)
 
 
 class CycleConsistencyLossTest(test.TestCase):
diff --git a/tensorflow/contrib/gdr/gdr_memory_manager.cc b/tensorflow/contrib/gdr/gdr_memory_manager.cc
index 5c7ac744289ab7729b4cc43ab9bedc9342284e65..81e70ae30a4c72dbcedd1aabfe758ecca4c8b366 100644
--- a/tensorflow/contrib/gdr/gdr_memory_manager.cc
+++ b/tensorflow/contrib/gdr/gdr_memory_manager.cc
@@ -86,8 +86,9 @@ int TryToReadNumaNode(ibv_device* device) {
   if (strings::safe_strto32(content, &value)) {
     if (value < 0) {
       LOG(INFO) << "Successful NUMA node read from SysFS had negative value ("
-                << value << "), but there must be at least one NUMA node"
-                            ", so returning NUMA node zero";
+                << value
+                << "), but there must be at least one NUMA node"
+                   ", so returning NUMA node zero";
       return 0;
     }
     LOG(INFO) << "NUMA node for device: " << device->name << " is " << value;
@@ -290,8 +291,8 @@ Status GdrMemoryManager::Init() {
   // Host memory allocators
   for (Allocator* allocator : allocators) {
     auto* visitable_allocator = dynamic_cast<VisitableAllocator*>(allocator);
-    CHECK(visitable_allocator) << "is not visitable for instrumentation"
-                               << allocator->Name();
+    CHECK(visitable_allocator)
+        << "is not visitable for instrumentation" << allocator->Name();
     // Make sure we don't instrument the same allocator twice
     if (instrumented_.find(allocator) == std::end(instrumented_)) {
       visitable_allocator->AddAllocVisitor(alloc_visitor);
@@ -635,8 +636,8 @@ void GdrMemoryManager::TensorFromTransportOptions(
     } else {
       checksum = GPUUtil::Checksum(*tensor);
     }
-    CHECK(checksum == remote_mr.checksum()) << "Checksum mismatch: " << checksum
-                                            << "!=" << remote_mr.checksum();
+    CHECK(checksum == remote_mr.checksum())
+        << "Checksum mismatch: " << checksum << "!=" << remote_mr.checksum();
 #endif
   }
   done(Status::OK());
diff --git a/tensorflow/contrib/hvx/README.md b/tensorflow/contrib/hvx/README.md
index 5a6f2f3086d708e5264b0483c211902ac8dce5f6..cb3a1087de5b83e7737e0dac2c04fbe152a9d676 100644
--- a/tensorflow/contrib/hvx/README.md
+++ b/tensorflow/contrib/hvx/README.md
@@ -1,60 +1,67 @@
 # TensorFlow Runtime with HVX Acceleration
 
-## Description
+This README explain how to build and use the TensorFlow runtime with HVX Acceleration. HVX is an extension of Hexagon, a DSP provided by Qualcomm, which can compute vector calculations faster using less energy than ARM processors.
 
-This README explain how to build and use the TensorFlow Runtime with HVX Acceleration. HVX is an extension of Hexagon which is a DSP provided by qualcomm which can compute vector calculations faster using lower energy than ARM processors.
+## Dependencies
+
+* [Android SDK](https://developer.android.com/studio/index.html).
+* [Android NDK](https://developer.android.com/ndk/index.html). Save the path in `${NDK_ROOT}`.
+* A rooted Qualcomm-based Android device connected to the computer (preferably, a [Snapdragon Development Board](https://developer.qualcomm.com/hardware/additional-snapdragon), but it could be a rooted phone with a Qualcomm SoC, albeit this guide may not work with it). The device needs to be rooted for development and testing purposes, and shouldn't be needed in production. See [Behold, The Snapdragon MDP](https://developer.qualcomm.com/blog/behold-snapdragon-mdp) for more information.
+* [Hexagon SDK v3.0](https://developer.qualcomm.com/software/hexagon-dsp-sdk/tools). Save the path in `${QUALCOMM_SDK}`.
+* The current directory should be TensorFlow source code (`git clone https://github.com/tensorflow/tensorflow.git && cd tensorflow`), and saved into `${TF_ROOT_DIR}`.
+
+You may also need to add a test signature in the device to run HVX-based binaries. Follow the instructions in `${QUALCOMM_SDK}/docs/Tools_Signing.html`, using Python 2.
+
+Note that if the device is not rooted, you may not be able to get the serial number, push the test signature and/or run binary files that call HVX libraries.
 
 ## Quick Start Guide
 
-We provides several tools to build and run inference with this runtime quickly.
+We provide several tools to build and run inference with this runtime quickly.
 
-#### All-in-one script to run inception model with prebuild hexagon library
-If you don’t need to build your own implementation of hexagon HVX, we provide a shortcut to execute graphs by using pre-compiled binaries.
+### Run inception model with a prebuilt Hexagon library
 
+If you don’t need to build your own implementation of Hexagon HVX, we provide a shortcut to execute graphs by using pre-compiled binaries.
+
+```shell
+./tensorflow/contrib/makefile/samples/build_and_run_inception_hexagon.sh -p
 ```
-git clone https://github.com/tensorflow/tensorflow.git
-cd tensorflow
-NDK_ROOT="/path/to/ndk" ./tensorflow/contrib/makefile/build_all_android.sh -X
-```
-(-X downloads dependencies to hexagon HVX and graphs, and copy all dependencies to android and execute a test)
 
-#### All-in-one script to run inception model by building entire libraries from source code
- If you want to build your own implementation of hexagon HVX, we provide a sample all-in-one script to execute graphs which downloads source and build everything for hexagon.
+The `-p` option makes the script download dependencies (i.e., Hexagon HVX binaries and graphs models), copy them to the Android device and execute a test.
 
-```
-git clone https://github.com/tensorflow/tensorflow.git
-cd tensorflow
-QUALCOMM_SDK="/path/to/qualcomm/sdk" NDK_ROOT="/path/to/ndk" ./tensorflow/contrib/makefile/samples/build_and_run_inception_hexagon.sh
+### Run inception model by building all from the source code
+
+If you want to build your own implementation of Hexagon HVX, we provide a sample all-in-one script to execute graphs which downloads the source and builds everything that's necessary.
+
+```shell
+./tensorflow/contrib/makefile/samples/build_and_run_inception_hexagon.sh
 ```
 
 ## Building libraries
 
 If you've finished walking through the quick start guide, you may want to try building each binary manually.
 
-#### Build libhexagon_nn_skel.so
-Download hexagon nn library from codeaurora.org and build it.
+### Build libhexagon\_nn\_skel.so
 
-```
+Download Hexagon NN library from codeaurora.org and build it.
+
+```shell
 git clone https://source.codeaurora.org/quic/hexagon_nn/nnlib
 cd nnlib
 ```
 
-(Just follow instructions in README.HOW_TO_BUILD. You can find libhexagon_nn_skel.so in hexagon_Release_dynamic_toolv72_v60/ship)
-Then copy the generated binary to GEN_LIBS_DIR
+Just follow the instructions in `README.HOW_TO_BUILD`. You can find the file `libhexagon_nn_skel.so` in `hexagon_Release_dynamic_toolv72_v60/ship`.
+Then copy the generated binary to `${GEN_LIBS_DIR}`.
 
-```
+```shell
 GEN_LIBS_DIR="/path/to/a/dir/to/store/hexagon/libraries"
 cp -v "hexagon_Release_dynamic_toolv72_v60/ship/libhexagon_nn_skel.so" "${GEN_LIBS_DIR}"
 ```
 
-#### Build libhexagon_controller.so
+### Build libhexagon\_controller.so
+
 Download tensorflow and build hexagon controller.
 
-```
-git clone https://github.com/tensorflow/tensorflow.git
-cd tensorflow
-TF_ROOT_DIR="$(pwd)"
-QUALCOMM_SDK="/path/to/qualcomm/sdk"
+```shell
 GENERATED_NNLIB_DIRECTORY="/path/to/nnlib"
 GENERATED_HEXAGON_CONTROLLER_DIRECTORY="${QUALCOMM_SDK}/examples/common/generated_hexagon_controller"
 rm -rf "${GENERATED_HEXAGON_CONTROLLER_DIRECTORY}"
@@ -70,12 +77,12 @@ make tree VERBOSE=1 V=android_Release
 cp -v "${GENERATED_HEXAGON_CONTROLLER_DIRECTORY}/android_Release/ship/libhexagon_controller.so" "${GEN_LIBS_DIR}"
 ```
 
-#### Build tensorflow linking hexagon library
-Build tensorflow with the build_all_android.sh with specifying -x option.
+### Build TensorFlow linking Hexagon library
 
-```
+Build TensorFlow with `build_all_android.sh` specifying the `-x` option.
+
+```shell
 BUILD_ALL_ANDROID_PATH="${TF_ROOT_DIR}/tensorflow/contrib/makefile/build_all_android.sh"
-NDK_ROOT="/path/to/ndk/root"
 
 CC_PREFIX=${CC_PREFIX} NDK_ROOT=${NDK_ROOT} "${BUILD_ALL_ANDROID_PATH}" \
 -x "${GEN_LIBS_DIR}" \
@@ -83,11 +90,11 @@ CC_PREFIX=${CC_PREFIX} NDK_ROOT=${NDK_ROOT} "${BUILD_ALL_ANDROID_PATH}" \
 -t hexagon_graph_execution
 ```
 
-#### Push binaries to your Android device
+### Push binaries to your Android device
 
 Before running tests on your Android device, you need to push several binaries to it.
 
-```
+```shell
 adb push "${GEN_LIBS_DIR}/libhexagon_controller.so" "/data/local/tmp"
 adb push "${GEN_LIBS_DIR}/libhexagon_nn_skel.so" "/vendor/lib/rfsa/adsp"
 adb push -p \
@@ -100,40 +107,54 @@ adb shell chmod "${ANDROID_EXEC_FILE_MODE}" \
 adb wait-for-device
 ```
 
-#### Run tests on the device
+### Run tests on the device
 
 Finally, you can run the inference tests on your device.
 
-```
+```shell
 adb shell 'LD_LIBRARY_PATH=/data/local/tmp:$LD_LIBRARY_PATH' \
 "/data/local/tmp/hexagon_graph_execution"
 ```
 
-#### Troubleshooting
-If you're using the Open-Q 820 Snapdragon development kit, you may run into an issue with running the executable due to a missing testsig library. From the Hexagon SDK documentation: *Dynamic shared objects are required to be digitally signed and then authenticated at runtime before they are allowed to be loaded and executed.* Generating a testsig library is necessary to run the unsigned sample library built from this project.
+### Troubleshooting
+
+#### Testsig issue
+
+If you're using the Open-Q 820 Snapdragon Development Kit, you may run into an issue with running the executable due to a missing `testsig` library. From the Hexagon SDK documentation: *Dynamic shared objects are required to be digitally signed and then authenticated at runtime before they are allowed to be loaded and executed.* Generating a testsig library is necessary to run the unsigned sample library built from this project.
 
-If the lack of a testsig library is your problem, you will see errors of the type:
+If the lack of a `testsig` library is your problem, you will see errors of the type:
 `vendor/qcom/proprietary/adsprpc/src/fastrpc_apps_user.c:169::error: -1: 0 == (nErr = remotectl_open(name, (int*)ph, dlerrstr, sizeof(dlerrstr), &dlerr))`
-appearing in adb logcat.
-
-There are several ways to create the testsig library, the only prerequisite is Python and the correct version of the Hexagon-SDK. The following steps is one way to create this library:
-1. Run adb as root: `adb root`
-2. Run the command `adb shell cat /sys/devices/soc0/serial_number`
-3. Convert the decimal number you get as output to hex
-4. Run the python script: `python ${QUALCOMM_SDK}/tools/elfsigner/elfsigner.py -t $(SERIAL_NUMBER_HEX_VALUE)`
-5. The output of the python script is a shared library stored in ${QUALCOMM_SDK}/tools/elfsigner/output/testsig-$(SERIAL_NUMBER_HEX_VALUE).so
-6. Push the shared library to your device:
+appearing in `adb logcat` or ["Expected: (version) >= (1), actual: 0 vs 1" while running a binary from adb](https://github.com/tensorflow/tensorflow/issues/11210).
+
+You need to add a test signature, as described at the beginning of this README. After rebooting your device, you should be able to run the sample application.
+
+#### Qualcomm SDK Linux installation fails with "Malformed \uxxxx encoding"
+
+The installation file is based on LaunchAnywhere, which fails in Linux if the `PS1` env variable contains non-common Unicode chars:
+
 ```
-adb root
-adb wait-for-device
-adb remount
-adb wait-for-device
-adb shell mkdir /system/lib/rfsa
-adb shell mkdir /system/lib/rfsa/adsp
-adb push ${QUALCOMM_SDK}/tools/elfsigner/output/testsig-$(SERIAL_NUMBER_HEX_VALUE).so /system/lib/rfsa/adsp/
+Preparing to install...
+Extracting the JRE from the installer archive...
+Unpacking the JRE...
+Extracting the installation resources from the installer archive...
+Configuring the installer for this system's environment...
+
+Launching installer...
+
+An internal LaunchAnywhere application error has occured and this application cannot proceed. (LAX)
+
+Stack Trace:
+java.lang.IllegalArgumentException: Malformed \uxxxx encoding.
+	at java.util.Properties.loadConvert(Properties.java:574)
+	at java.util.Properties.load0(Properties.java:391)
+	at java.util.Properties.load(Properties.java:317)
+	at com.zerog.common.java.util.PropertiesUtil.loadProperties(Unknown Source)
+	at com.zerog.lax.LAX.<init>(Unknown Source)
+	at com.zerog.lax.LAX.main(Unknown Source)
 ```
 
-After rebooting your device, you should be able to run the sample application.
+It can be solved by temporarily assigning the `PS1` environment variable to something simple, such as '$'.
+
+## Maintainers
 
-Maintainers:
-- Satoshi Kataoka (satok@google.com, github.com/satok16)
+* Satoshi Kataoka (satok@google.com, github.com/satok16)
diff --git a/tensorflow/contrib/image/kernels/image_ops.cc b/tensorflow/contrib/image/kernels/image_ops.cc
index 6adf837ca0ab506bd18f5e2e1fc1847e31d782bf..c2e32da133b32c8fe169302668031af8bace2c22 100644
--- a/tensorflow/contrib/image/kernels/image_ops.cc
+++ b/tensorflow/contrib/image/kernels/image_ops.cc
@@ -43,9 +43,9 @@ template struct FillProjectiveTransform<CPUDevice, double>;
 typedef Eigen::ThreadPoolDevice CPUDevice;
 
 using functor::FillProjectiveTransform;
+using generator::Interpolation;
 using generator::INTERPOLATION_BILINEAR;
 using generator::INTERPOLATION_NEAREST;
-using generator::Interpolation;
 using generator::ProjectiveGenerator;
 
 template <typename Device, typename T>
@@ -72,11 +72,12 @@ class ImageProjectiveTransform : public OpKernel {
     const Tensor& transform_t = ctx->input(1);
     OP_REQUIRES(ctx, images_t.shape().dims() == 4,
                 errors::InvalidArgument("Input images must have rank 4"));
-    OP_REQUIRES(ctx, (TensorShapeUtils::IsMatrix(transform_t.shape()) &&
-                      (transform_t.dim_size(0) == images_t.dim_size(0) ||
-                       transform_t.dim_size(0) == 1) &&
-                      transform_t.dim_size(1) ==
-                          ProjectiveGenerator<Device, T>::kNumParameters),
+    OP_REQUIRES(ctx,
+                (TensorShapeUtils::IsMatrix(transform_t.shape()) &&
+                 (transform_t.dim_size(0) == images_t.dim_size(0) ||
+                  transform_t.dim_size(0) == 1) &&
+                 transform_t.dim_size(1) ==
+                     ProjectiveGenerator<Device, T>::kNumParameters),
                 errors::InvalidArgument(
                     "Input transform should be num_images x 8 or 1 x 8"));
     auto images = images_t.tensor<T, 4>();
diff --git a/tensorflow/contrib/image/kernels/single_image_random_dot_stereograms_ops.cc b/tensorflow/contrib/image/kernels/single_image_random_dot_stereograms_ops.cc
index 9f0bf37aed3fc9aeefb7602ef3fda4cfd76f1917..8f9a5c28039b74a874028826ca8a6d5a36ab7cf4 100755
--- a/tensorflow/contrib/image/kernels/single_image_random_dot_stereograms_ops.cc
+++ b/tensorflow/contrib/image/kernels/single_image_random_dot_stereograms_ops.cc
@@ -143,8 +143,8 @@ class SingleImageRandomDotStereogramsOp : public OpKernel {
     }
 
     data_box_left = deltaX_border_image / 2;  // Center DATA in X dimension
-    data_box_width = data_Xwindow;             // width of scan line
-    data_box_height = data_Ywindow;            // hight of image
+    data_box_width = data_Xwindow;            // width of scan line
+    data_box_height = data_Ywindow;           // hight of image
 
     const T* inputZ = input_tensor.flat<T>().data();  // Flatten input Z buffer
 
diff --git a/tensorflow/contrib/image/ops/single_image_random_dot_stereograms_ops.cc b/tensorflow/contrib/image/ops/single_image_random_dot_stereograms_ops.cc
index 1f41f243f2ebc0d1e884728defa160bf6d6c34ce..8139d4272d6950815bd39a64e86e0f7422e6f799 100755
--- a/tensorflow/contrib/image/ops/single_image_random_dot_stereograms_ops.cc
+++ b/tensorflow/contrib/image/ops/single_image_random_dot_stereograms_ops.cc
@@ -58,7 +58,9 @@ REGISTER_OP("SingleImageRandomDotStereograms")
       int colors;
       TF_RETURN_IF_ERROR(c->GetAttr("number_colors", &colors));
 
-      c->set_output(0, c->MakeShape({y_dim, x_dim, colors > 256? c->MakeDim(3) : c->MakeDim(1)}));
+      c->set_output(
+          0, c->MakeShape(
+                 {y_dim, x_dim, colors > 256 ? c->MakeDim(3) : c->MakeDim(1)}));
       return Status::OK();
     })
     .Doc(R"doc(
diff --git a/tensorflow/contrib/image/python/ops/single_image_random_dot_stereograms.py b/tensorflow/contrib/image/python/ops/single_image_random_dot_stereograms.py
index bb766e59d2cee648042cc08be466796d9233ad66..d4a6a5bcbb52511d4093587814100b2a0e8b2420 100755
--- a/tensorflow/contrib/image/python/ops/single_image_random_dot_stereograms.py
+++ b/tensorflow/contrib/image/python/ops/single_image_random_dot_stereograms.py
@@ -26,18 +26,20 @@ _sirds_ops = loader.load_op_library(
     resource_loader.get_path_to_datafile(
         "_single_image_random_dot_stereograms.so"))
 
-def single_image_random_dot_stereograms(
-    depth_values,
-    hidden_surface_removal=None,
-    convergence_dots_size=None,
-    dots_per_inch=None,
-    eye_separation=None, mu=None,
-    normalize=None, normalize_max=None,
-    normalize_min=None,
-    border_level=None,
-    number_colors=None,
-    output_image_shape=None,
-    output_data_window=None):
+
+def single_image_random_dot_stereograms(depth_values,
+                                        hidden_surface_removal=None,
+                                        convergence_dots_size=None,
+                                        dots_per_inch=None,
+                                        eye_separation=None,
+                                        mu=None,
+                                        normalize=None,
+                                        normalize_max=None,
+                                        normalize_min=None,
+                                        border_level=None,
+                                        number_colors=None,
+                                        output_image_shape=None,
+                                        output_data_window=None):
   """Output a RandomDotStereogram Tensor for export via encode_PNG/JPG OP.
 
   Given the 2-D tensor 'depth_values' with encoded Z values, this operation
@@ -45,7 +47,8 @@ def single_image_random_dot_stereograms(
   for the encode_PNG/JPG ops.  Be careful with image compression as this may
   corrupt the encode 3-D data witin the image.
 
-  Based upon [this paper](http://www.learningace.com/doc/4331582/b6ab058d1e206d68ab60e4e1ead2fe6e/sirds-paper).
+  Based upon [this
+  paper](http://www.learningace.com/doc/4331582/b6ab058d1e206d68ab60e4e1ead2fe6e/sirds-paper).
 
   This outputs a SIRDS image as picture_out.png:
 
@@ -113,7 +116,8 @@ def single_image_random_dot_stereograms(
       hidden_surface_removal=hidden_surface_removal,
       convergence_dots_size=convergence_dots_size,
       dots_per_inch=dots_per_inch,
-      eye_separation=eye_separation, mu=mu,
+      eye_separation=eye_separation,
+      mu=mu,
       normalize=normalize,
       normalize_max=normalize_max,
       normalize_min=normalize_min,
@@ -123,4 +127,5 @@ def single_image_random_dot_stereograms(
       output_data_window=output_data_window)
   return result
 
+
 ops.NotDifferentiable("SingleImageRandomDotStereograms")
diff --git a/tensorflow/contrib/input_pipeline/kernels/input_pipeline_kernels.cc b/tensorflow/contrib/input_pipeline/kernels/input_pipeline_kernels.cc
index ca288c1f737d25faac678f5c199d5c1e49f721cb..886f6798150c57d8066546b0919481d3878882fc 100644
--- a/tensorflow/contrib/input_pipeline/kernels/input_pipeline_kernels.cc
+++ b/tensorflow/contrib/input_pipeline/kernels/input_pipeline_kernels.cc
@@ -34,9 +34,8 @@ class ObtainNextOp : public OpKernel {
 
     // Allocate output.
     Tensor* output_tensor = nullptr;
-    OP_REQUIRES_OK(
-        ctx,
-        ctx->allocate_output("out_element", TensorShape({}), &output_tensor));
+    OP_REQUIRES_OK(ctx, ctx->allocate_output("out_element", TensorShape({}),
+                                             &output_tensor));
 
     // Obtain mutex for the "counter" tensor.
     mutex* mu;
diff --git a/tensorflow/contrib/kafka/BUILD b/tensorflow/contrib/kafka/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..f7593aa462c4ca86d0ffc4f065e1aae849364561
--- /dev/null
+++ b/tensorflow/contrib/kafka/BUILD
@@ -0,0 +1,104 @@
+package(
+    default_visibility = ["//visibility:private"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("//tensorflow:tensorflow.bzl", "tf_gen_op_libs")
+load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrapper_py")
+load("//tensorflow:tensorflow.bzl", "tf_kernel_library")
+load("//tensorflow:tensorflow.bzl", "tf_py_test")
+
+tf_kernel_library(
+    name = "kafka_kernels",
+    srcs = ["kernels/kafka_dataset_ops.cc"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core/kernels:bounds_check_lib",
+        "//tensorflow/core/kernels:dataset",
+        "//third_party/eigen3",
+        "@kafka//:kafka",
+    ],
+)
+
+tf_gen_op_libs(
+    op_lib_names = ["kafka_ops"],
+    deps = [
+        "//tensorflow/core:lib",
+    ],
+)
+
+tf_gen_op_wrapper_py(
+    name = "gen_kafka_ops",
+    out = "python/ops/gen_kafka_ops.py",
+    require_shape_functions = True,
+    deps = [":kafka_ops_op_lib"],
+)
+
+py_library(
+    name = "kafka",
+    srcs = [
+        "__init__.py",
+        "python/ops/kafka_dataset_ops.py",
+    ],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+    deps = [
+        ":gen_kafka_ops",
+        "//tensorflow/contrib/util:util_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:state_ops",
+        "//tensorflow/python:training",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/ops:iterator_ops",
+        "//tensorflow/python/data/ops:readers",
+    ],
+)
+
+# The Kafka server has to be setup before running the test.
+# The Kafka server is setup through Docker so the Docker engine
+# has to be installed.
+#
+# Once the Docker engine is ready:
+# To setup the Kafka server:
+# $ bash tensorflow/contrib/kafka/python/kernel_tests/kafka_test.sh start kafka
+#
+# After the test is complete:
+# To team down the Kafka server:
+# $ bash tensorflow/contrib/kafka/python/kernel_tests/kafka_test.sh stop kafka
+tf_py_test(
+    name = "kafka_test",
+    srcs = ["python/kernel_tests/kafka_test.py"],
+    additional_deps = [
+        ":kafka",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+    ],
+    tags = [
+        "manual",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/contrib/kafka/__init__.py b/tensorflow/contrib/kafka/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4d755c40568dfa2f7f6f617cf3180268837a5ca0
--- /dev/null
+++ b/tensorflow/contrib/kafka/__init__.py
@@ -0,0 +1,32 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Kafka Dataset.
+
+@@KafkaDataset
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.kafka.python.ops.kafka_dataset_ops import KafkaDataset
+
+from tensorflow.python.util.all_util import remove_undocumented
+
+_allowed_symbols = [
+    "KafkaDataset",
+]
+
+remove_undocumented(__name__)
diff --git a/tensorflow/contrib/kafka/kernels/kafka_dataset_ops.cc b/tensorflow/contrib/kafka/kernels/kafka_dataset_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..88ef5f357113372b0a2d0cb13382ac980a61252d
--- /dev/null
+++ b/tensorflow/contrib/kafka/kernels/kafka_dataset_ops.cc
@@ -0,0 +1,321 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/dataset.h"
+
+#include "tensorflow/core/framework/tensor.h"
+
+#include "src-cpp/rdkafkacpp.h"
+
+namespace tensorflow {
+
+class KafkaDatasetOp : public DatasetOpKernel {
+ public:
+  using DatasetOpKernel::DatasetOpKernel;
+
+  void MakeDataset(OpKernelContext* ctx, DatasetBase** output) override {
+    const Tensor* topics_tensor;
+    OP_REQUIRES_OK(ctx, ctx->input("topics", &topics_tensor));
+    OP_REQUIRES(
+        ctx, topics_tensor->dims() <= 1,
+        errors::InvalidArgument("`topics` must be a scalar or a vector."));
+
+    std::vector<string> topics;
+    topics.reserve(topics_tensor->NumElements());
+    for (int i = 0; i < topics_tensor->NumElements(); ++i) {
+      topics.push_back(topics_tensor->flat<string>()(i));
+    }
+
+    std::string servers = "";
+    OP_REQUIRES_OK(ctx,
+                   ParseScalarArgument<std::string>(ctx, "servers", &servers));
+    std::string group = "";
+    OP_REQUIRES_OK(ctx, ParseScalarArgument<std::string>(ctx, "group", &group));
+    bool eof = false;
+    OP_REQUIRES_OK(ctx, ParseScalarArgument<bool>(ctx, "eof", &eof));
+    int64 timeout = -1;
+    OP_REQUIRES_OK(ctx, ParseScalarArgument<int64>(ctx, "timeout", &timeout));
+    OP_REQUIRES(ctx, (timeout > 0),
+                errors::InvalidArgument(
+                    "Timeout value should be large than 0, got ", timeout));
+    *output = new Dataset(ctx, std::move(topics), servers, group, eof, timeout);
+  }
+
+ private:
+  class Dataset : public GraphDatasetBase {
+   public:
+    Dataset(OpKernelContext* ctx, std::vector<string> topics,
+            const string& servers, const string& group, const bool eof,
+            const int64 timeout)
+        : GraphDatasetBase(ctx),
+          topics_(std::move(topics)),
+          servers_(servers),
+          group_(group),
+          eof_(eof),
+          timeout_(timeout) {}
+
+    std::unique_ptr<IteratorBase> MakeIterator(
+        const string& prefix) const override {
+      return std::unique_ptr<IteratorBase>(
+          new Iterator({this, strings::StrCat(prefix, "::Kafka")}));
+    }
+
+    const DataTypeVector& output_dtypes() const override {
+      static DataTypeVector* dtypes = new DataTypeVector({DT_STRING});
+      return *dtypes;
+    }
+
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      static std::vector<PartialTensorShape>* shapes =
+          new std::vector<PartialTensorShape>({{}});
+      return *shapes;
+    }
+
+    string DebugString() override { return "KafkaDatasetOp::Dataset"; }
+
+   protected:
+    Status AsGraphDefInternal(DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      Node* topics = nullptr;
+      TF_RETURN_IF_ERROR(b->AddVector(topics_, &topics));
+      Node* servers = nullptr;
+      TF_RETURN_IF_ERROR(b->AddScalar(servers_, &servers));
+      Node* group = nullptr;
+      TF_RETURN_IF_ERROR(b->AddScalar(group_, &group));
+      Node* eof = nullptr;
+      TF_RETURN_IF_ERROR(b->AddScalar(eof_, &eof));
+      Node* timeout = nullptr;
+      TF_RETURN_IF_ERROR(b->AddScalar(timeout_, &timeout));
+      TF_RETURN_IF_ERROR(
+          b->AddDataset(this, {topics, servers, group, eof, timeout}, output));
+      return Status::OK();
+    }
+
+   private:
+    class Iterator : public DatasetIterator<Dataset> {
+     public:
+      explicit Iterator(const Params& params)
+          : DatasetIterator<Dataset>(params) {}
+
+      Status GetNextInternal(IteratorContext* ctx,
+                             std::vector<Tensor>* out_tensors,
+                             bool* end_of_sequence) override {
+        mutex_lock l(mu_);
+        do {
+          // We are currently processing a topic, so try to read the next line.
+          if (consumer_.get()) {
+            while (true) {
+              if (limit_ >= 0 &&
+                  (topic_partition_->offset() >= limit_ || offset_ >= limit_)) {
+                // EOF current topic
+                break;
+              }
+              std::unique_ptr<RdKafka::Message> message(
+                  consumer_->consume(dataset()->timeout_));
+              if (message->err() == RdKafka::ERR_NO_ERROR) {
+                // Produce the line as output.
+                Tensor line_tensor(cpu_allocator(), DT_STRING, {});
+                line_tensor.scalar<string>()() =
+                    std::string(static_cast<const char*>(message->payload()),
+                                message->len());
+                out_tensors->emplace_back(std::move(line_tensor));
+                *end_of_sequence = false;
+                // Sync offset
+                offset_ = message->offset();
+                return Status::OK();
+              }
+
+              if (message->err() == RdKafka::ERR__PARTITION_EOF &&
+                  dataset()->eof_) {
+                // EOF current topic
+                break;
+              }
+              if (message->err() != RdKafka::ERR__TIMED_OUT) {
+                return errors::Internal("Failed to consume:",
+                                        message->errstr());
+              }
+              message.reset(nullptr);
+              consumer_->poll(0);
+            }
+
+            // We have reached the end of the current topic, so maybe
+            // move on to next topic.
+            ResetStreamsLocked();
+            ++current_topic_index_;
+          }
+
+          // Iteration ends when there are no more topic to process.
+          if (current_topic_index_ == dataset()->topics_.size()) {
+            *end_of_sequence = true;
+            return Status::OK();
+          }
+
+          TF_RETURN_IF_ERROR(SetupStreamsLocked(ctx->env()));
+        } while (true);
+      }
+
+     protected:
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        mutex_lock l(mu_);
+        TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("current_topic_index"),
+                                               current_topic_index_));
+
+        // `consumer_` is empty if
+        // 1. GetNext has not been called even once.
+        // 2. All topics have been read and iterator has been exhausted.
+        if (consumer_.get()) {
+          TF_RETURN_IF_ERROR(
+              writer->WriteScalar(full_name("current_pos"), offset_));
+        }
+        return Status::OK();
+      }
+
+      Status RestoreInternal(IteratorContext* ctx,
+                             IteratorStateReader* reader) override {
+        mutex_lock l(mu_);
+        ResetStreamsLocked();
+        int64 current_topic_index;
+        TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("current_topic_index"),
+                                              &current_topic_index));
+        current_topic_index_ = size_t(current_topic_index);
+        // The key "current_pos" is written only if the iterator was saved
+        // with an open topic.
+        if (reader->Contains(full_name("current_pos"))) {
+          int64 current_pos;
+          TF_RETURN_IF_ERROR(
+              reader->ReadScalar(full_name("current_pos"), &current_pos));
+
+          TF_RETURN_IF_ERROR(SetupStreamsLocked(ctx->env()));
+          topic_partition_->set_offset(current_pos);
+          if (topic_partition_->offset() != current_pos) {
+            return errors::Internal("Failed to restore to offset ",
+                                    current_pos);
+          }
+          offset_ = current_pos;
+        }
+        return Status::OK();
+      }
+
+     private:
+      // Sets up Kafka streams to read from the topic at
+      // `current_topic_index_`.
+      Status SetupStreamsLocked(Env* env) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        if (current_topic_index_ >= dataset()->topics_.size()) {
+          return errors::InvalidArgument(
+              "current_topic_index_:", current_topic_index_,
+              " >= topics_.size():", dataset()->topics_.size());
+        }
+
+        // Actually move on to next topic.
+        string entry = dataset()->topics_[current_topic_index_];
+
+        std::vector<string> parts = str_util::Split(entry, ":");
+        if (parts.size() < 1) {
+          return errors::InvalidArgument("Invalid parameters: ", entry);
+        }
+        string topic = parts[0];
+        int32 partition = 0;
+        if (parts.size() > 1) {
+          if (!strings::safe_strto32(parts[1], &partition)) {
+            return errors::InvalidArgument("Invalid parameters: ", entry);
+          }
+        }
+        int64 offset = 0;
+        if (parts.size() > 2) {
+          if (!strings::safe_strto64(parts[2], &offset)) {
+            return errors::InvalidArgument("Invalid parameters: ", entry);
+          }
+        }
+
+        topic_partition_.reset(
+            RdKafka::TopicPartition::create(topic, partition, offset));
+
+        offset_ = topic_partition_->offset();
+        limit_ = -1;
+        if (parts.size() > 3) {
+          if (!strings::safe_strto64(parts[3], &limit_)) {
+            return errors::InvalidArgument("Invalid parameters: ", entry);
+          }
+        }
+
+        std::unique_ptr<RdKafka::Conf> conf(
+            RdKafka::Conf::create(RdKafka::Conf::CONF_GLOBAL));
+        std::unique_ptr<RdKafka::Conf> topic_conf(
+            RdKafka::Conf::create(RdKafka::Conf::CONF_TOPIC));
+
+        std::string errstr;
+
+        RdKafka::Conf::ConfResult result =
+            conf->set("default_topic_conf", topic_conf.get(), errstr);
+        if (result != RdKafka::Conf::CONF_OK) {
+          return errors::Internal("Failed to set default_topic_conf:", errstr);
+        }
+
+        result = conf->set("bootstrap.servers", dataset()->servers_, errstr);
+        if (result != RdKafka::Conf::CONF_OK) {
+          return errors::Internal("Failed to set bootstrap.servers ",
+                                  dataset()->servers_, ":", errstr);
+        }
+        result = conf->set("group.id", dataset()->group_, errstr);
+        if (result != RdKafka::Conf::CONF_OK) {
+          return errors::Internal("Failed to set group.id ", dataset()->group_,
+                                  ":", errstr);
+        }
+
+        consumer_.reset(RdKafka::KafkaConsumer::create(conf.get(), errstr));
+        if (!consumer_.get()) {
+          return errors::Internal("Failed to create consumer:", errstr);
+        }
+
+        std::vector<RdKafka::TopicPartition*> partitions;
+        partitions.emplace_back(topic_partition_.get());
+        RdKafka::ErrorCode err = consumer_->assign(partitions);
+        if (err != RdKafka::ERR_NO_ERROR) {
+          return errors::Internal(
+              "Failed to assign partition [", topic_partition_->topic(), ", ",
+              topic_partition_->partition(), ", ", topic_partition_->offset(),
+              "]:", RdKafka::err2str(err));
+        }
+
+        return Status::OK();
+      }
+
+      // Resets all Kafka streams.
+      void ResetStreamsLocked() EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        consumer_->unassign();
+        consumer_->close();
+        consumer_.reset(nullptr);
+      }
+
+      mutex mu_;
+      size_t current_topic_index_ GUARDED_BY(mu_) = 0;
+      int64 offset_ GUARDED_BY(mu_) = 0;
+      int64 limit_ GUARDED_BY(mu_) = -1;
+      std::unique_ptr<RdKafka::TopicPartition> topic_partition_ GUARDED_BY(mu_);
+      std::unique_ptr<RdKafka::KafkaConsumer> consumer_ GUARDED_BY(mu_);
+    };
+
+    const std::vector<string> topics_;
+    const std::string servers_;
+    const std::string group_;
+    const bool eof_;
+    const int64 timeout_;
+  };
+};
+
+REGISTER_KERNEL_BUILDER(Name("KafkaDataset").Device(DEVICE_CPU),
+                        KafkaDatasetOp);
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/kafka/ops/kafka_ops.cc b/tensorflow/contrib/kafka/ops/kafka_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8cdf16103bab2b22d51c144d21a589e1e39f2f0b
--- /dev/null
+++ b/tensorflow/contrib/kafka/ops/kafka_ops.cc
@@ -0,0 +1,44 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+namespace tensorflow {
+
+REGISTER_OP("KafkaDataset")
+    .Input("topics: string")
+    .Input("servers: string")
+    .Input("group: string")
+    .Input("eof: bool")
+    .Input("timeout: int64")
+    .Output("handle: variant")
+    .SetIsStateful()
+    .SetShapeFn(shape_inference::ScalarShape)
+    .Doc(R"doc(
+Creates a dataset that emits the messages of one or more Kafka topics.
+
+topics: A `tf.string` tensor containing one or more subscriptions,
+  in the format of [topic:partition:offset:length],
+  by default length is -1 for unlimited.
+servers: A list of bootstrap servers.
+group: The consumer group id.
+eof: If True, the kafka reader will stop on EOF.
+timeout: The timeout value for the Kafka Consumer to wait
+  (in millisecond).
+)doc");
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/kafka/python/kernel_tests/kafka_test.py b/tensorflow/contrib/kafka/python/kernel_tests/kafka_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..94cf6b5ace6a10b7c0471d7d25b5bce789ac322b
--- /dev/null
+++ b/tensorflow/contrib/kafka/python/kernel_tests/kafka_test.py
@@ -0,0 +1,117 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not
+# use this file except in compliance with the License.  You may obtain a copy of
+# the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
+# License for the specific language governing permissions and limitations under
+# the License.
+# ==============================================================================
+"""Tests for KafkaDataset."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import os
+
+from tensorflow.contrib.kafka.python.ops import kafka_dataset_ops
+from tensorflow.python.data.ops import iterator_ops
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.lib.io import python_io
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import io_ops
+from tensorflow.python.platform import test
+from tensorflow.python.util import compat
+
+class KafkaDatasetTest(test.TestCase):
+
+  def setUp(self):
+    # The Kafka server has to be setup before the test
+    # and tear down after the test manually.
+    # The docker engine has to be installed.
+    #
+    # To setup the Kafka server:
+    # $ bash kafka_test.sh start kafka
+    #
+    # To team down the Kafka server:
+    # $ bash kafka_test.sh stop kafka
+    pass
+
+  def testKafkaDataset(self):
+    topics = array_ops.placeholder(dtypes.string, shape=[None])
+    num_epochs = array_ops.placeholder(dtypes.int64, shape=[])
+    batch_size = array_ops.placeholder(dtypes.int64, shape=[])
+
+    repeat_dataset = kafka_dataset_ops.KafkaDataset(
+        topics, group="test", eof=True).repeat(num_epochs)
+    batch_dataset = repeat_dataset.batch(batch_size)
+
+    iterator = iterator_ops.Iterator.from_structure(batch_dataset.output_types)
+    init_op = iterator.make_initializer(repeat_dataset)
+    init_batch_op = iterator.make_initializer(batch_dataset)
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      # Basic test: read from topic 0.
+      sess.run(
+          init_op, feed_dict={topics: ["test:0:0:4"],
+                              num_epochs: 1})
+      for i in range(5):
+        self.assertEqual("D"+str(i), sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+      # Basic test: read from topic 1.
+      sess.run(
+          init_op, feed_dict={topics: ["test:0:5:-1"],
+                              num_epochs: 1})
+      for i in range(5):
+        self.assertEqual("D"+str(i + 5), sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+      # Basic test: read from both topics.
+      sess.run(init_op, feed_dict={topics: ["test:0:0:4", "test:0:5:-1"],
+                                   num_epochs: 1})
+      for j in range(2):
+        for i in range(5):
+          self.assertEqual("D"+str(i + j * 5), sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+      # Test repeated iteration through both files.
+      sess.run(init_op, feed_dict={topics: ["test:0:0:4", "test:0:5:-1"],
+                                   num_epochs: 10})
+      for _ in range(10):
+        for j in range(2):
+          for i in range(5):
+            self.assertEqual("D"+str(i + j * 5), sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+      # Test batched and repeated iteration through both files.
+      sess.run(
+          init_batch_op,
+          feed_dict={topics: ["test:0:0:4", "test:0:5:-1"],
+                     num_epochs: 10,
+                     batch_size: 5})
+      for _ in range(10):
+        self.assertAllEqual(["D"+str(i) for i in range(5)],
+                            sess.run(get_next))
+        self.assertAllEqual(["D"+str(i + 5) for i in range(5)],
+                            sess.run(get_next))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/kafka/python/kernel_tests/kafka_test.sh b/tensorflow/contrib/kafka/python/kernel_tests/kafka_test.sh
new file mode 100644
index 0000000000000000000000000000000000000000..7997c12731189e56fc491a0f4de8b19c39d262b2
--- /dev/null
+++ b/tensorflow/contrib/kafka/python/kernel_tests/kafka_test.sh
@@ -0,0 +1,34 @@
+#!/usr/bin/env bash
+
+set -e
+set -o pipefail
+
+if [ "$#" -ne 2 ]; then
+  echo "Usage: $0 start|stop <kafka container name>" >&2
+  exit 1
+fi
+
+container=$2
+if [ "$1" == "start" ]; then
+    docker run -d --rm --net=host --name=$container spotify/kafka
+    echo Wait 5 secs until kafka is up and running
+    sleep 5
+    echo Create test topic
+    docker exec $container bash -c '/opt/kafka_2.11-0.10.1.0/bin/kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 1 --partitions 1 --topic test'
+    echo Create test message
+    docker exec $container bash -c 'echo -e "D0\nD1\nD2\nD3\nD4\nD5\nD6\nD7\nD8\nD9" > /test'
+    echo Produce test message
+    docker exec $container bash -c '/opt/kafka_2.11-0.10.1.0/bin/kafka-console-producer.sh --topic test --broker-list 127.0.0.1:9092 < /test'
+
+    echo Container $container started successfully
+elif [ "$1" == "stop" ]; then
+    docker rm -f $container
+
+    echo Container $container stopped successfully
+else
+  echo "Usage: $0 start|stop <kafka container name>" >&2
+  exit 1
+fi
+
+
+
diff --git a/tensorflow/contrib/kafka/python/ops/kafka_dataset_ops.py b/tensorflow/contrib/kafka/python/ops/kafka_dataset_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..e561f595a405280010a54d761bdb378ec0162ac0
--- /dev/null
+++ b/tensorflow/contrib/kafka/python/ops/kafka_dataset_ops.py
@@ -0,0 +1,73 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Kafka Dataset."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.kafka.python.ops import gen_kafka_ops
+from tensorflow.contrib.util import loader
+from tensorflow.python.data.ops.readers import Dataset
+from tensorflow.python.framework import common_shapes
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.platform import resource_loader
+
+class KafkaDataset(Dataset):
+  """A Kafka Dataset that consumes the message.
+  """
+
+  def __init__(
+      self, topics, servers="localhost", group="", eof=False, timeout=1000):
+    """Create a KafkaReader.
+
+    Args:
+      topics: A `tf.string` tensor containing one or more subscriptions,
+              in the format of [topic:partition:offset:length],
+              by default length is -1 for unlimited.
+      servers: A list of bootstrap servers.
+      group: The consumer group id.
+      eof: If True, the kafka reader will stop on EOF.
+      timeout: The timeout value for the Kafka Consumer to wait
+               (in millisecond).
+    """
+    super(KafkaDataset, self).__init__()
+    self._topics = ops.convert_to_tensor(
+        topics, dtype=dtypes.string, name="topics")
+    self._servers = ops.convert_to_tensor(
+        servers, dtype=dtypes.string, name="servers")
+    self._group = ops.convert_to_tensor(
+        group, dtype=dtypes.string, name="group")
+    self._eof = ops.convert_to_tensor(
+        eof, dtype=dtypes.bool, name="eof")
+    self._timeout = ops.convert_to_tensor(
+        timeout, dtype=dtypes.int64, name="timeout")
+
+  def _as_variant_tensor(self):
+    return gen_kafka_ops.kafka_dataset(
+        self._topics, self._servers, self._group, self._eof, self._timeout)
+
+  @property
+  def output_classes(self):
+    return ops.Tensor
+
+  @property
+  def output_shapes(self):
+    return tensor_shape.scalar()
+
+  @property
+  def output_types(self):
+    return dtypes.string
diff --git a/tensorflow/contrib/kernel_methods/python/losses_test.py b/tensorflow/contrib/kernel_methods/python/losses_test.py
index d38d8041ce1216dfb5af6e93984b35e71008610a..72507539f813d14064bc58f03b6db4781abc9438 100644
--- a/tensorflow/contrib/kernel_methods/python/losses_test.py
+++ b/tensorflow/contrib/kernel_methods/python/losses_test.py
@@ -119,19 +119,20 @@ class SparseMulticlassHingeLossTest(test.TestCase):
 
   def testUnknownShape(self):
     """Result keeps same with `testZeroLossInt32Labels`"""
-    logits_np = np.array([[1.2, -1.4, -1.0],
-                          [1.4, 1.8, 4.0],
-                          [0.5, 1.8, -1.0]])
+    logits_np = np.array([[1.2, -1.4, -1.0], [1.4, 1.8, 4.0], [0.5, 1.8, -1.0]])
     labels_np = np.array([0, 2, 1], dtype=np.int32)
 
-    logits_shapes = [[3, 3],  # batch_size, num_classes
-                     [None, 3],
-                     [3, None],
-                     [None, None]]
+    logits_shapes = [
+        [3, 3],  # batch_size, num_classes
+        [None, 3],
+        [3, None],
+        [None, None]
+    ]
 
     for batch_size, num_classes in logits_shapes:
       with self.test_session():
-        logits = array_ops.placeholder(dtypes.float32, shape=(batch_size, num_classes))
+        logits = array_ops.placeholder(
+            dtypes.float32, shape=(batch_size, num_classes))
         labels = array_ops.placeholder(dtypes.int32, shape=(batch_size,))
         loss = losses.sparse_multiclass_hinge_loss(labels, logits)
         result = loss.eval(feed_dict={logits: logits_np, labels: labels_np})
diff --git a/tensorflow/contrib/kfac/examples/mlp.py b/tensorflow/contrib/kfac/examples/mlp.py
index 0f0dbb53f45dfefe69aaa9e25caf6ba0a3cf449e..87eed03888c894a04c0521d1ce5ee8975b60776b 100644
--- a/tensorflow/contrib/kfac/examples/mlp.py
+++ b/tensorflow/contrib/kfac/examples/mlp.py
@@ -317,7 +317,10 @@ def train_mnist_estimator(data_dir, num_epochs, use_fake_data=False):
     return tf.estimator.EstimatorSpec(
         mode=mode, loss=loss, train_op=train_op, training_hooks=hooks)
 
+  run_config = tf.estimator.RunConfig(
+      model_dir="/tmp/mnist", save_checkpoints_steps=1, keep_checkpoint_max=100)
+
   # Train until input_fn() is empty with Estimator. This is a prerequisite for
   # TPU compatibility.
-  estimator = tf.estimator.Estimator(model_fn=model_fn)
+  estimator = tf.estimator.Estimator(model_fn=model_fn, config=run_config)
   estimator.train(input_fn=input_fn)
diff --git a/tensorflow/contrib/kfac/python/ops/fisher_blocks.py b/tensorflow/contrib/kfac/python/ops/fisher_blocks.py
index 9436caf9618bc3d3c0dd7b3842420016b119464f..0d2fa706f5853570bb8c04a9b9ac3378e2f2386e 100644
--- a/tensorflow/contrib/kfac/python/ops/fisher_blocks.py
+++ b/tensorflow/contrib/kfac/python/ops/fisher_blocks.py
@@ -457,7 +457,9 @@ class ConvDiagonalFB(FisherBlock):
     self._num_locations = (
         inputs_shape[1] * inputs_shape[2] //
         (self._strides[1] * self._strides[2]))
-    self._damping = normalize_damping(damping, self._num_locations)
+
+    self._damping = (self._num_locations
+                     * normalize_damping(damping, self._num_locations))
 
     self._factor = self._layer_collection.make_or_get_factor(
         fisher_factors.ConvDiagonalFactor,
diff --git a/tensorflow/contrib/kfac/python/ops/fisher_factors.py b/tensorflow/contrib/kfac/python/ops/fisher_factors.py
index f59168cbc05fffd104ff5a44308eefd206beb9db..bcba18ae147c6ceca50bc9a2a17e01fc201d88c1 100644
--- a/tensorflow/contrib/kfac/python/ops/fisher_factors.py
+++ b/tensorflow/contrib/kfac/python/ops/fisher_factors.py
@@ -31,6 +31,7 @@ from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn
 from tensorflow.python.ops import special_math_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
@@ -111,6 +112,54 @@ def diagonal_covariance_initializer(shape, dtype, partition_info):  # pylint: di
   return array_ops.ones(shape, dtype)
 
 
+def extract_image_patches(image, ksizes, strides, padding, name=None):
+  """Extracts image patches for an N-dimensional convolution.
+
+  This function is a compatibility wrapper over tf.extract_image_patches(), as
+  ExtractImagePatches isn't yet implemented in XLA.
+
+  Args:
+    image: Tensor of shape [batch, in_x, in_y, ..., in_channels]. Input images.
+      All dimensions except 'batch' must be defined.
+    ksizes: [filter_x, filter_y, ...]. Spatial shape of filter in each
+      dimension.
+    strides: [stride_x, stride_y, ...]. Spatial stride for filter in each
+      dimension.
+    padding: str. "VALID" or "SAME".
+    name: str or None. name of Op.
+
+  Returns:
+    result: [batch, out_x, out_y, ..., filter_x, filter_y, ..., in_channels].
+      Contains image patches to which conv kernel would be applied for each
+      output location. [out_x, out_y, ...] depends on padding.
+  """
+  if not utils.on_tpu():
+    return array_ops.extract_image_patches(
+        image,
+        ksizes=([1] + list(ksizes) + [1]),
+        strides=([1] + list(strides) + [1]),
+        rates=[1, 1, 1, 1],
+        padding=padding,
+        name=name)
+
+  with tf_ops.name_scope(name, "extract_image_patches",
+                         [image, ksizes, strides, padding]):
+    batch = image.shape.as_list()[0]
+    in_channels = image.shape.as_list()[-1]
+
+    # Map each input feature to a location in the output.
+    out_channels = np.prod(ksizes) * in_channels
+    filters = linalg_ops.eye(out_channels),
+    filters = array_ops.reshape(filters, ksizes + [in_channels, out_channels])
+
+    result = nn.convolution(image, filters, padding, strides=strides)
+    out_spatial = result.shape.as_list()[1:-1]
+    result = array_ops.reshape(
+        result, [batch or -1] + out_spatial + ksizes + [in_channels])
+
+    return result
+
+
 def compute_cov(tensor, tensor_right=None, normalizer=None):
   """Compute the empirical second moment of the rows of a 2D Tensor.
 
@@ -668,11 +717,10 @@ class ConvDiagonalFactor(DiagonalFactor):
 
       # TODO(b/64144716): there is potential here for a big savings in terms
       # of memory use.
-      patches = array_ops.extract_image_patches(
+      patches = extract_image_patches(
           self._inputs,
-          ksizes=[1, filter_height, filter_width, 1],
-          strides=self._strides,
-          rates=[1, 1, 1, 1],
+          ksizes=[filter_height, filter_width],
+          strides=self._strides[1:-1],
           padding=self._padding)
 
       if self._has_bias:
@@ -816,11 +864,10 @@ class ConvInputKroneckerFactor(InverseProvidingFactor):
 
       # TODO(b/64144716): there is potential here for a big savings in terms of
       # memory use.
-      patches = array_ops.extract_image_patches(
+      patches = extract_image_patches(
           self._inputs,
-          ksizes=[1, filter_height, filter_width, 1],
-          strides=self._strides,
-          rates=[1, 1, 1, 1],
+          ksizes=[filter_height, filter_width],
+          strides=self._strides[1:-1],
           padding=self._padding)
 
       flatten_size = (filter_height * filter_width * in_channels)
diff --git a/tensorflow/contrib/kfac/python/ops/utils_lib.py b/tensorflow/contrib/kfac/python/ops/utils_lib.py
index cc48e3c69f24c2abd343e2e120d3589cd323fcdc..fe8e39c212c2c3381f9aa6fdb9fdf423ff958481 100644
--- a/tensorflow/contrib/kfac/python/ops/utils_lib.py
+++ b/tensorflow/contrib/kfac/python/ops/utils_lib.py
@@ -24,6 +24,7 @@ from tensorflow.python.util.all_util import remove_undocumented
 # pylint: enable=unused-import,line-too-long,wildcard-import
 
 _allowed_symbols = [
+    "set_global_constants",
     "SequenceDict",
     "tensors_to_column",
     "column_to_tensors",
diff --git a/tensorflow/contrib/layers/kernels/sparse_feature_cross_kernel.cc b/tensorflow/contrib/layers/kernels/sparse_feature_cross_kernel.cc
index 932c5ab99249feda1e3a7f2d707ce4237fe7177f..01893d60615a9b4ded2afc88c6de0168d4be0921 100644
--- a/tensorflow/contrib/layers/kernels/sparse_feature_cross_kernel.cc
+++ b/tensorflow/contrib/layers/kernels/sparse_feature_cross_kernel.cc
@@ -423,8 +423,9 @@ class SparseFeatureCrossOp : public OpKernel {
               "Input values should be a std::vector but received shape ",
               values_list_in[i].shape().DebugString(), " at position ", i));
       OP_REQUIRES(
-          context, indices_list_in[i].shape().dim_size(0) ==
-                       values_list_in[i].shape().dim_size(0),
+          context,
+          indices_list_in[i].shape().dim_size(0) ==
+              values_list_in[i].shape().dim_size(0),
           errors::InvalidArgument(
               "Expected size of values to be ",
               indices_list_in[i].shape().dim_size(0), " got ",
diff --git a/tensorflow/contrib/layers/python/layers/feature_column_test.py b/tensorflow/contrib/layers/python/layers/feature_column_test.py
index 2eaea231776bd2f5fb8bb4bd422074beacd61720..fc8f153fe3abdc83aca5abfa9a4bb5f5d5531480 100644
--- a/tensorflow/contrib/layers/python/layers/feature_column_test.py
+++ b/tensorflow/contrib/layers/python/layers/feature_column_test.py
@@ -221,8 +221,8 @@ class FeatureColumnTest(test.TestCase):
     weighted_sparse_col = fc.weighted_sparse_column(ids, "weights")
     self.assertEqual(weighted_sparse_col.name, "ids_weighted_by_weights")
 
-    b = fc.shared_embedding_columns([sparse_col, weighted_sparse_col],
-                                    dimension=4, combiner="mean")
+    b = fc.shared_embedding_columns(
+        [sparse_col, weighted_sparse_col], dimension=4, combiner="mean")
     self.assertEqual(len(b), 2)
     self.assertEqual(b[0].shared_embedding_name,
                      "a1_ids_weighted_by_weights_shared_embedding")
@@ -230,8 +230,8 @@ class FeatureColumnTest(test.TestCase):
                      "a1_ids_weighted_by_weights_shared_embedding")
 
     # Tries reversing order to check compatibility condition.
-    b = fc.shared_embedding_columns([weighted_sparse_col, sparse_col],
-                                    dimension=4, combiner="mean")
+    b = fc.shared_embedding_columns(
+        [weighted_sparse_col, sparse_col], dimension=4, combiner="mean")
     self.assertEqual(len(b), 2)
     self.assertEqual(b[0].shared_embedding_name,
                      "a1_ids_weighted_by_weights_shared_embedding")
@@ -240,18 +240,17 @@ class FeatureColumnTest(test.TestCase):
 
     # Tries adding two weighted columns to check compatibility between them.
     weighted_sparse_col_2 = fc.weighted_sparse_column(ids, "weights_2")
-    b = fc.shared_embedding_columns([weighted_sparse_col,
-                                     weighted_sparse_col_2],
-                                    dimension=4, combiner="mean")
+    b = fc.shared_embedding_columns(
+        [weighted_sparse_col, weighted_sparse_col_2],
+        dimension=4,
+        combiner="mean")
     self.assertEqual(len(b), 2)
     self.assertEqual(
         b[0].shared_embedding_name,
-        "ids_weighted_by_weights_ids_weighted_by_weights_2_shared_embedding"
-    )
+        "ids_weighted_by_weights_ids_weighted_by_weights_2_shared_embedding")
     self.assertEqual(
         b[1].shared_embedding_name,
-        "ids_weighted_by_weights_ids_weighted_by_weights_2_shared_embedding"
-    )
+        "ids_weighted_by_weights_ids_weighted_by_weights_2_shared_embedding")
 
   def testSharedEmbeddingColumnDeterminism(self):
     # Tests determinism in auto-generated shared_embedding_name.
@@ -286,10 +285,10 @@ class FeatureColumnTest(test.TestCase):
     columns = fc.shared_embedding_columns(
         [a1, a2], dimension=4, combiner="mean")
     columns_copy = copy.deepcopy(columns)
-    self.assertEqual(
-        columns_copy[0].shared_embedding_name, "a1_a2_shared_embedding")
-    self.assertEqual(
-        columns_copy[1].shared_embedding_name, "a1_a2_shared_embedding")
+    self.assertEqual(columns_copy[0].shared_embedding_name,
+                     "a1_a2_shared_embedding")
+    self.assertEqual(columns_copy[1].shared_embedding_name,
+                     "a1_a2_shared_embedding")
 
   def testOneHotColumn(self):
     a = fc.sparse_column_with_keys("a", ["a", "b", "c", "d"])
@@ -336,11 +335,11 @@ class FeatureColumnTest(test.TestCase):
     weighted_ids = fc.weighted_sparse_column(ids, "weights")
     one_hot = fc.one_hot_column(weighted_ids)
     features = {
-        'ids': constant_op.constant([['marlo', 'unknown', 'omar']]),
-        'weights': constant_op.constant([[2., 4., 6.]])
+        "ids": constant_op.constant([["marlo", "unknown", "omar"]]),
+        "weights": constant_op.constant([[2., 4., 6.]])
     }
     one_hot_tensor = feature_column_ops.input_from_feature_columns(
-      features, [one_hot])
+        features, [one_hot])
     with self.test_session() as sess:
       sess.run(variables.global_variables_initializer())
       sess.run(lookup_ops.tables_initializer())
@@ -349,11 +348,9 @@ class FeatureColumnTest(test.TestCase):
   def testMissingValueInOneHotColumnForSparseColumnWithKeys(self):
     ids = fc.sparse_column_with_keys("ids", ["marlo", "omar", "stringer"])
     one_hot = fc.one_hot_column(ids)
-    features = {
-      'ids': constant_op.constant([['marlo', 'unknown', 'omar']])
-    }
+    features = {"ids": constant_op.constant([["marlo", "unknown", "omar"]])}
     one_hot_tensor = feature_column_ops.input_from_feature_columns(
-      features, [one_hot])
+        features, [one_hot])
     with self.test_session() as sess:
       sess.run(variables.global_variables_initializer())
       sess.run(lookup_ops.tables_initializer())
@@ -379,8 +376,7 @@ class FeatureColumnTest(test.TestCase):
     self.assertEqual(d4.default_value, None)
     self.assertEqual(d4.is_sparse, True)
     # Default value is a list but dimension is None.
-    with self.assertRaisesRegexp(ValueError,
-                                 "Only scalar default value.*"):
+    with self.assertRaisesRegexp(ValueError, "Only scalar default value.*"):
       fc._real_valued_var_len_column("g5", default_value=[2., 3.])
 
   def testRealValuedVarLenColumnDtypes(self):
@@ -390,18 +386,19 @@ class FeatureColumnTest(test.TestCase):
             "rvc": parsing_ops.VarLenFeature(dtype=dtypes.float32)
         }, rvc.config)
 
-    rvc = fc._real_valued_var_len_column("rvc", default_value=0,
-                                         is_sparse=False)
-    self.assertDictEqual(
-        {
-            "rvc": parsing_ops.FixedLenSequenceFeature(shape=[],
-                                                       dtype=dtypes.float32,
-                                                       allow_missing=True,
-                                                       default_value=0.0)
-        }, rvc.config)
-
-    rvc = fc._real_valued_var_len_column("rvc", dtype=dtypes.int32,
-                                         default_value=0, is_sparse=True)
+    rvc = fc._real_valued_var_len_column(
+        "rvc", default_value=0, is_sparse=False)
+    self.assertDictEqual({
+        "rvc":
+            parsing_ops.FixedLenSequenceFeature(
+                shape=[],
+                dtype=dtypes.float32,
+                allow_missing=True,
+                default_value=0.0)
+    }, rvc.config)
+
+    rvc = fc._real_valued_var_len_column(
+        "rvc", dtype=dtypes.int32, default_value=0, is_sparse=True)
     self.assertDictEqual(
         {
             "rvc": parsing_ops.VarLenFeature(dtype=dtypes.int32)
@@ -409,8 +406,8 @@ class FeatureColumnTest(test.TestCase):
 
     with self.assertRaisesRegexp(TypeError,
                                  "dtype must be convertible to float"):
-      fc._real_valued_var_len_column("rvc", dtype=dtypes.string,
-                                     default_value="", is_sparse=True)
+      fc._real_valued_var_len_column(
+          "rvc", dtype=dtypes.string, default_value="", is_sparse=True)
 
   def testRealValuedColumn(self):
     a = fc.real_valued_column("aaa")
@@ -504,13 +501,13 @@ class FeatureColumnTest(test.TestCase):
     for output_rank in range(1, 3 + len(dimensions)):
       with variable_scope.variable_scope("output_rank_{}".format(output_rank)):
         real_valued_output = real_valued_column._to_dnn_input_layer(
-            constant_op.constant(
-                real_valued_input, dtype=dtypes.float32),
+            constant_op.constant(real_valued_input, dtype=dtypes.float32),
             output_rank=output_rank)
       with self.test_session() as sess:
         real_valued_eval = sess.run(real_valued_output)
-      expected_shape = (input_shape[:output_rank - 1] +
-                        [np.prod(input_shape[output_rank - 1:])])
+      expected_shape = (
+          input_shape[:output_rank - 1] +
+          [np.prod(input_shape[output_rank - 1:])])
       self.assertEquals(expected_shape, list(real_valued_eval.shape))
 
   def testRealValuedColumnDensification(self):
@@ -520,8 +517,7 @@ class FeatureColumnTest(test.TestCase):
         "sparse_real_valued1", is_sparse=True)
     sparse_tensor = sparse_tensor_lib.SparseTensor(
         values=[2.0, 5.0], indices=[[0, 0], [2, 0]], dense_shape=[3, 1])
-    with self.assertRaisesRegexp(
-        ValueError, "Set is_sparse to False"):
+    with self.assertRaisesRegexp(ValueError, "Set is_sparse to False"):
       real_valued_column._to_dnn_input_layer(sparse_tensor)
 
   def testRealValuedColumnDeepCopy(self):
@@ -549,9 +545,8 @@ class FeatureColumnTest(test.TestCase):
   def testBucketizedColumnRequiresRealValuedColumnDimension(self):
     with self.assertRaisesRegexp(
         TypeError, "source_column must be an instance of _RealValuedColumn.*"):
-      fc.bucketized_column(fc._real_valued_var_len_column("bbb",
-                                                          is_sparse=True),
-                           [0])
+      fc.bucketized_column(
+          fc._real_valued_var_len_column("bbb", is_sparse=True), [0])
 
   def testBucketizedColumnRequiresSortedBuckets(self):
     with self.assertRaisesRegexp(ValueError,
@@ -654,20 +649,14 @@ class FeatureColumnTest(test.TestCase):
 
   def testRealValuedColumnDtypes(self):
     rvc = fc.real_valued_column("rvc")
-    self.assertDictEqual(
-        {
-            "rvc": parsing_ops.FixedLenFeature(
-                [1], dtype=dtypes.float32)
-        },
-        rvc.config)
+    self.assertDictEqual({
+        "rvc": parsing_ops.FixedLenFeature([1], dtype=dtypes.float32)
+    }, rvc.config)
 
     rvc = fc.real_valued_column("rvc", dtype=dtypes.int32)
-    self.assertDictEqual(
-        {
-            "rvc": parsing_ops.FixedLenFeature(
-                [1], dtype=dtypes.int32)
-        },
-        rvc.config)
+    self.assertDictEqual({
+        "rvc": parsing_ops.FixedLenFeature([1], dtype=dtypes.int32)
+    }, rvc.config)
 
     with self.assertRaisesRegexp(ValueError,
                                  "dtype must be convertible to float"):
@@ -702,8 +691,9 @@ class FeatureColumnTest(test.TestCase):
     batch_size = 4
     dense_scalar_input = [1, 2, 3, 4]
     sparse_column = fc.sparse_column_with_integerized_feature("values", 10)
-    features = {"values":
-                constant_op.constant(dense_scalar_input, dtype=dtypes.int64)}
+    features = {
+        "values": constant_op.constant(dense_scalar_input, dtype=dtypes.int64)
+    }
     sparse_column.insert_transformed_feature(features)
     sparse_output = features[sparse_column]
     expected_shape = [batch_size, 1]
@@ -731,8 +721,7 @@ class FeatureColumnTest(test.TestCase):
 
   def testSparseColumnKeysDeepCopy(self):
     """Tests deepcopy of sparse_column_with_keys."""
-    column = fc.sparse_column_with_keys(
-        "a", keys=["key0", "key1", "key2"])
+    column = fc.sparse_column_with_keys("a", keys=["key0", "key1", "key2"])
     self.assertEqual("a", column.name)
     column_copy = copy.deepcopy(column)
     self.assertEqual("a", column_copy.name)
@@ -785,8 +774,9 @@ class FeatureColumnTest(test.TestCase):
     a = fc.sparse_column_with_hash_bucket("cross_aaa", hash_bucket_size=100)
     b = fc.sparse_column_with_hash_bucket("cross_bbb", hash_bucket_size=100)
     cross_col = fc.crossed_column(set([a, b]), hash_bucket_size=10000)
-    one_hot_col = fc.one_hot_column(fc.sparse_column_with_hash_bucket(
-        "sparse_column_for_one_hot", hash_bucket_size=100))
+    one_hot_col = fc.one_hot_column(
+        fc.sparse_column_with_hash_bucket(
+            "sparse_column_for_one_hot", hash_bucket_size=100))
     scattered_embedding_col = fc.scattered_embedding_column(
         "scattered_embedding_column", size=100, dimension=10, hash_key=1)
     feature_columns = set([
@@ -809,17 +799,13 @@ class FeatureColumnTest(test.TestCase):
         "str_id_weights_column":
             parsing_ops.VarLenFeature(dtypes.float32),
         "real_valued_column1":
-            parsing_ops.FixedLenFeature(
-                [1], dtype=dtypes.float32),
+            parsing_ops.FixedLenFeature([1], dtype=dtypes.float32),
         "real_valued_column2":
-            parsing_ops.FixedLenFeature(
-                [5], dtype=dtypes.float32),
+            parsing_ops.FixedLenFeature([5], dtype=dtypes.float32),
         "real_valued_column_for_bucketization1":
-            parsing_ops.FixedLenFeature(
-                [1], dtype=dtypes.float32),
+            parsing_ops.FixedLenFeature([1], dtype=dtypes.float32),
         "real_valued_column_for_bucketization2":
-            parsing_ops.FixedLenFeature(
-                [4], dtype=dtypes.float32),
+            parsing_ops.FixedLenFeature([4], dtype=dtypes.float32),
         "cross_aaa":
             parsing_ops.VarLenFeature(dtypes.string),
         "cross_bbb":
@@ -849,11 +835,14 @@ class FeatureColumnTest(test.TestCase):
     real_valued_col0 = fc._real_valued_var_len_column(
         "real_valued_column0", is_sparse=True)
     real_valued_col1 = fc._real_valued_var_len_column(
-        "real_valued_column1", dtype=dtypes.int64, default_value=0,
+        "real_valued_column1",
+        dtype=dtypes.int64,
+        default_value=0,
         is_sparse=False)
     feature_columns = set([real_valued_col0, real_valued_col1])
     expected_config = {
-        "real_valued_column0": parsing_ops.VarLenFeature(dtype=dtypes.float32),
+        "real_valued_column0":
+            parsing_ops.VarLenFeature(dtype=dtypes.float32),
         "real_valued_column1":
             parsing_ops.FixedLenSequenceFeature(
                 [], dtype=dtypes.int64, allow_missing=True, default_value=0),
@@ -874,7 +863,9 @@ class FeatureColumnTest(test.TestCase):
     real_valued_col5 = fc._real_valued_var_len_column(
         "real_valued_column5", default_value=2, is_sparse=True)
     real_valued_col6 = fc._real_valued_var_len_column(
-        "real_valued_column6", dtype=dtypes.int64, default_value=1,
+        "real_valued_column6",
+        dtype=dtypes.int64,
+        default_value=1,
         is_sparse=False)
     feature_columns = [
         real_valued_col1, real_valued_col2, real_valued_col3, real_valued_col4,
@@ -902,8 +893,7 @@ class FeatureColumnTest(test.TestCase):
                 parsing_ops.VarLenFeature(dtype=dtypes.float32),
             "real_valued_column6":
                 parsing_ops.FixedLenSequenceFeature(
-                    [], dtype=dtypes.int64, allow_missing=True,
-                    default_value=1)
+                    [], dtype=dtypes.int64, allow_missing=True, default_value=1)
         },
         config)
 
@@ -1104,8 +1094,8 @@ class FeatureColumnTest(test.TestCase):
       # This will initialize the crossed column weights from provided checkpoint
       # and return a [4, 1] tensor which is same as weights variable. Since we
       # won't modify weights, this should be same as 'saved_col_weights'.
-      _, col_weights, _ = (feature_column_ops.weighted_sum_from_feature_columns(
-          {
+      _, col_weights, _ = (
+          feature_column_ops.weighted_sum_from_feature_columns({
               sparse_col_1.name: input_tensor,
               sparse_col_2.name: input_tensor
           }, [crossed_col_initialized], 1))
diff --git a/tensorflow/contrib/layers/python/layers/layers.py b/tensorflow/contrib/layers/python/layers/layers.py
index c8e3307ee8b5ded30dc864c4e69452f58685b8f0..fb7b2e315efb773770eda8c07e52c4850e48e4da 100644
--- a/tensorflow/contrib/layers/python/layers/layers.py
+++ b/tensorflow/contrib/layers/python/layers/layers.py
@@ -60,12 +60,12 @@ __all__ = [
     'conv2d_in_plane', 'conv2d_transpose', 'conv3d_transpose', 'convolution',
     'convolution2d', 'convolution2d_in_plane', 'convolution2d_transpose',
     'convolution3d', 'convolution3d_transpose', 'dense_to_sparse',
-           'dropout', 'elu', 'flatten',
-    'fully_connected', 'GDN', 'gdn', 'layer_norm', 'linear', 'pool',
-    'max_pool2d', 'max_pool3d', 'one_hot_encoding', 'relu', 'relu6', 'repeat',
-    'scale_gradient', 'separable_conv2d', 'separable_convolution2d', 'softmax',
-    'spatial_softmax', 'stack', 'unit_norm', 'legacy_fully_connected',
-    'legacy_linear', 'legacy_relu', 'maxout'
+    'dropout', 'elu', 'flatten', 'fully_connected', 'GDN', 'gdn', 'layer_norm',
+    'linear', 'pool', 'max_pool2d', 'max_pool3d', 'one_hot_encoding', 'relu',
+    'relu6', 'repeat', 'scale_gradient', 'separable_conv2d',
+    'separable_convolution2d', 'softmax', 'spatial_softmax', 'stack',
+    'unit_norm', 'legacy_fully_connected', 'legacy_linear', 'legacy_relu',
+    'maxout'
 ]
 
 DATA_FORMAT_NCHW = 'NCHW'
@@ -1418,7 +1418,9 @@ def dense_to_sparse(tensor, eos_token=0, outputs_collections=None, scope=None):
   with variable_scope.variable_scope(
       scope, 'dense_to_sparse', [tensor]) as sc:
     tensor = ops.convert_to_tensor(tensor)
-    indices = array_ops.where(math_ops.not_equal(tensor, constant_op.constant(eos_token, tensor.dtype)))
+    indices = array_ops.where(
+        math_ops.not_equal(
+            tensor, constant_op.constant(eos_token, tensor.dtype)))
     values = array_ops.gather_nd(tensor, indices)
     shape = array_ops.shape(tensor, out_type=dtypes.int64)
     outputs = sparse_tensor.SparseTensor(indices, values, shape)
diff --git a/tensorflow/contrib/layers/python/layers/layers_test.py b/tensorflow/contrib/layers/python/layers/layers_test.py
index c5790c76221848524a106f1a218922f4e7a0b7e6..8945690db8ee233e61645c38e6e4d615c4f0da66 100644
--- a/tensorflow/contrib/layers/python/layers/layers_test.py
+++ b/tensorflow/contrib/layers/python/layers/layers_test.py
@@ -127,8 +127,8 @@ class AvgPool3DTest(test.TestCase):
   def testInvalidDataFormat(self):
     depth, height, width = 3, 6, 9
     images = np.random.uniform(size=(5, depth, height, width, 3))
-    with self.assertRaisesRegexp(ValueError,
-                                 'data_format has to be either NCDHW or NDHWC.'):
+    with self.assertRaisesRegexp(
+        ValueError, 'data_format has to be either NCDHW or NDHWC.'):
       _layers.avg_pool3d(images, [3, 3, 3], data_format='CDHWN')
 
   def testCreateAvgPool(self):
@@ -148,7 +148,8 @@ class AvgPool3DTest(test.TestCase):
   def testCollectOutputs(self):
     depth, height, width = 3, 6, 9
     images = random_ops.random_uniform((5, depth, height, width, 3), seed=1)
-    output = _layers.avg_pool3d(images, [3, 3, 3], outputs_collections='outputs')
+    output = _layers.avg_pool3d(
+        images, [3, 3, 3], outputs_collections='outputs')
     output_collected = ops.get_collection('outputs')[0]
     self.assertEqual(output_collected.aliases, ['AvgPool3D'])
     self.assertEqual(output_collected, output)
@@ -183,7 +184,8 @@ class AvgPool3DTest(test.TestCase):
     depth, height, width = 3, 6, 9
     images = random_ops.random_uniform((5, depth, height, width, 3), seed=1)
     output = _layers.avg_pool3d(images, [3, 3, 3], stride=1, padding='SAME')
-    self.assertListEqual(output.get_shape().as_list(), [5, depth, height, width, 3])
+    self.assertListEqual(output.get_shape().as_list(),
+                         [5, depth, height, width, 3])
 
   def testGlobalAvgPool(self):
     depth, height, width = 3, 6, 9
@@ -515,7 +517,9 @@ class ConvolutionTest(test.TestCase):
       with arg_scope(
           [layers_lib.convolution2d],
           normalizer_fn=_layers.batch_norm,
-          normalizer_params={'decay': 0.9}):
+          normalizer_params={
+              'decay': 0.9
+          }):
         net = layers_lib.convolution2d(images, 32, [3, 3])
         net = layers_lib.convolution2d(net, 32, [3, 3])
       self.assertEqual(len(variables.get_variables()), 8)
@@ -529,7 +533,9 @@ class ConvolutionTest(test.TestCase):
       with arg_scope(
           [layers_lib.convolution2d],
           normalizer_fn=_layers.batch_norm,
-          normalizer_params={'decay': 0.9}):
+          normalizer_params={
+              'decay': 0.9
+          }):
         net = layers_lib.convolution2d(images, 32, [3, 3], scope='Conv')
         net = layers_lib.convolution2d(
             net, 32, [3, 3], scope='Conv', reuse=True)
@@ -1031,7 +1037,8 @@ class Convolution2dTransposeTests(test.TestCase):
     for _ in range(10):
       num_filters = 1
       input_size = [
-          1, np.random.randint(1, max_image_size),
+          1,
+          np.random.randint(1, max_image_size),
           np.random.randint(1, max_image_size), 1
       ]
       filter_size = [
@@ -1185,8 +1192,10 @@ class ConvolutionInPlaneTest(test.TestCase):
 
     with self.test_session() as sess:
       sess.run(init_op)
-      result = sess.run(horz_gradients,
-                        feed_dict={image: np.ones((1, 10, 10, 1))})
+      result = sess.run(
+          horz_gradients, feed_dict={
+              image: np.ones((1, 10, 10, 1))
+          })
       expected = np.zeros((1, 10, 9, 1))
 
       self.assertAllEqual(result, expected)
@@ -1299,7 +1308,8 @@ class DenseToSparseTest(test.TestCase):
     expected_constant = np.reshape(np.arange(24, dtype=np.int64), (3, 4, 2))
     tensor = constant_op.constant(expected_constant)
     sparse = _layers.dense_to_sparse(tensor)
-    dense = sparse_ops.sparse_to_dense(sparse.indices, sparse.dense_shape, sparse.values)
+    dense = sparse_ops.sparse_to_dense(
+        sparse.indices, sparse.dense_shape, sparse.values)
     with self.test_session() as sess:
       constant = sess.run(dense)
       self.assertAllEqual(expected_constant, constant)
@@ -1418,8 +1428,7 @@ class FlattenTest(test.TestCase):
     with ops.Graph().as_default() as g, self.test_session(g):
       inputs = array_ops.placeholder(dtype=dtypes.float32)
       inputs.set_shape(tensor_shape.TensorShape((5,)))
-      with self.assertRaisesRegexp(ValueError,
-                                   'incompatible with the layer'):
+      with self.assertRaisesRegexp(ValueError, 'incompatible with the layer'):
         _layers.flatten(inputs)
 
   def testUnknownLastDim(self):
@@ -1729,7 +1738,9 @@ class FCTest(test.TestCase):
       with arg_scope(
           [_layers.fully_connected],
           normalizer_fn=_layers.batch_norm,
-          normalizer_params={'decay': 0.9}):
+          normalizer_params={
+              'decay': 0.9
+          }):
         net = _layers.fully_connected(images, 27)
         net = _layers.fully_connected(net, 27)
       self.assertEqual(len(variables.get_variables()), 8)
@@ -1745,7 +1756,9 @@ class FCTest(test.TestCase):
       with arg_scope(
           [_layers.fully_connected],
           normalizer_fn=_layers.batch_norm,
-          normalizer_params={'decay': 0.9}):
+          normalizer_params={
+              'decay': 0.9
+          }):
         net = _layers.fully_connected(images, 27, scope='fc1')
         net = _layers.fully_connected(net, 27, scope='fc1', reuse=True)
       self.assertEqual(len(variables.get_variables()), 4)
@@ -1762,8 +1775,8 @@ class BatchNormTest(test.TestCase):
   def testBatchNormCenterFalse(self):
     a = array_ops.placeholder(dtype=dtypes.float32, shape=(10, 10, 10, 10))
     # Test that center=False builds a valid graph.
-    _layers.batch_norm(a, center=False, data_format='NCHW',
-                       zero_debias_moving_mean=True)
+    _layers.batch_norm(
+        a, center=False, data_format='NCHW', zero_debias_moving_mean=True)
 
   def testUnknownShape(self):
     with ops.Graph().as_default() as g, self.test_session(g):
@@ -1800,8 +1813,8 @@ class BatchNormTest(test.TestCase):
       images = np.random.uniform(size=(5, height, width, 3)).astype(
           dtype.as_numpy_dtype)
       output = _layers.batch_norm(images, fused=fused)
-      expected_name = ('BatchNorm/FusedBatchNorm' if fused else
-                       'BatchNorm/batchnorm')
+      expected_name = ('BatchNorm/FusedBatchNorm'
+                       if fused else 'BatchNorm/batchnorm')
       self.assertTrue(output.op.name.startswith(expected_name))
       self.assertListEqual(output.get_shape().as_list(), [5, height, width, 3])
       self.assertEqual(
@@ -2020,8 +2033,8 @@ class BatchNormTest(test.TestCase):
       expected_var = np.var(image_values, axis=axis)
       if fused:
         # Add Bessel's correction
-        expected_var, _ = self._addBesselsCorrection(batch_size * height *
-                                                     width, expected_var)
+        expected_var, _ = self._addBesselsCorrection(
+            batch_size * height * width, expected_var)
       images = constant_op.constant(
           image_values, shape=image_shape, dtype=dtypes.float32)
       output = _layers.batch_norm(
@@ -2540,8 +2553,8 @@ class BatchNormTest(test.TestCase):
       expected_var = np.var(image_values, axis=axis)
       if fused:
         # Add Bessel's correction
-        expected_var, _ = self._addBesselsCorrection(batch_size * height *
-                                                     width, expected_var)
+        expected_var, _ = self._addBesselsCorrection(
+            batch_size * height * width, expected_var)
       images = constant_op.constant(
           image_values, shape=image_shape, dtype=dtypes.float32)
       output = _layers.batch_norm(
@@ -2571,8 +2584,9 @@ class BatchNormTest(test.TestCase):
         np_output, new_images_gradients = sess.run([output, images_gradients])
         # The outputs should be close to 0.0 mean and 1.0 variance
         self.assertAllClose(
-            np.mean(
-                np_output, axis=axis), [0] * channels, rtol=0.001, atol=0.001)
+            np.mean(np_output, axis=axis), [0] * channels,
+            rtol=0.001,
+            atol=0.001)
         self.assertAllClose(
             np.var(np_output, axis=axis), [1] * channels, rtol=0.01, atol=0.01)
         # The gradients should change slowly while updating moving_mean.
@@ -2600,14 +2614,14 @@ class BatchNormTest(test.TestCase):
     channels = 3
     with self.test_session() as sess:
       images = (np.ones((5, height, width, channels)) * 9.0).astype('f')
-      beta = init_ops.constant_initializer((np.ones(channels) * 5.0).astype(
-          'f'))
-      gamma = init_ops.constant_initializer((np.ones(channels) * 2.0).astype(
-          'f'))
-      mean = init_ops.constant_initializer((np.ones(channels) * 5.0).astype(
-          'f'))
-      variance = init_ops.constant_initializer((np.ones(channels) * 4.0).astype(
-          'f'))
+      beta = init_ops.constant_initializer(
+          (np.ones(channels) * 5.0).astype('f'))
+      gamma = init_ops.constant_initializer(
+          (np.ones(channels) * 2.0).astype('f'))
+      mean = init_ops.constant_initializer(
+          (np.ones(channels) * 5.0).astype('f'))
+      variance = init_ops.constant_initializer(
+          (np.ones(channels) * 4.0).astype('f'))
       output = _layers.batch_norm(
           images,
           is_training=False,
@@ -2628,21 +2642,18 @@ class BatchNormTest(test.TestCase):
     with self.test_session(use_gpu=True) as sess:
       images = np.arange(np.product(shape), dtype=np.float32).reshape(shape)
       beta = init_ops.constant_initializer(
-          np.arange(
-              2, channels + 2, dtype=np.float32))
+          np.arange(2, channels + 2, dtype=np.float32))
       gamma = init_ops.constant_initializer(
-          np.arange(
-              10, channels + 10, dtype=np.float32) * 2.0)
+          np.arange(10, channels + 10, dtype=np.float32) * 2.0)
       mean = init_ops.constant_initializer(
-          np.arange(
-              3, channels + 3, dtype=np.float32) * 5.0)
+          np.arange(3, channels + 3, dtype=np.float32) * 5.0)
       variance = init_ops.constant_initializer(
-          np.arange(
-              1, channels + 1, dtype=np.float32) * 4.0)
+          np.arange(1, channels + 1, dtype=np.float32) * 4.0)
       if data_format == 'NCHW':
         # Reshape inputs from NHWC to NCHW format.
         images = array_ops.transpose(
-            images, [0, len(shape) - 1] + list(range(1, len(shape) - 1)))
+            images, [0, len(shape) - 1] + list(range(1,
+                                                     len(shape) - 1)))
       output = _layers.batch_norm(
           images,
           is_training=is_training,
@@ -2745,16 +2756,16 @@ class BatchNormTest(test.TestCase):
     # Tests that the adjustment is appropriately passed to and used by the core
     # BN layer.
     all_adjustments = []
+
     def _create_adjustment(shape):
       adjustments = [array_ops.ones(shape[-1:]), array_ops.zeros(shape[-1:])]
       all_adjustments.extend(adjustments)
       return adjustments
+
     depth = 8
     images = array_ops.zeros([10, 5, 5, depth])
     output = _layers.batch_norm(
-        images,
-        is_training=True,
-        adjustment=_create_adjustment)
+        images, is_training=True, adjustment=_create_adjustment)
     self.assertListEqual(output.shape.as_list(), images.shape.as_list())
     self.assertEqual(len(all_adjustments), 2)
     self.assertListEqual(all_adjustments[0].shape.as_list(), [depth])
@@ -2819,7 +2830,10 @@ class LayerNormTest(test.TestCase):
       # output_train and output_eval should be the same.
       self.assertAllClose(sess.run([output_train]), sess.run([output_eval]))
 
-  def doOutputTest(self, input_shape, tol=1e-5, begin_norm_axis=1,
+  def doOutputTest(self,
+                   input_shape,
+                   tol=1e-5,
+                   begin_norm_axis=1,
                    dtype=dtypes.float64):
     expected_mean = np.zeros(input_shape[:begin_norm_axis])
     expected_var = np.ones(input_shape[:begin_norm_axis])
@@ -2850,13 +2864,10 @@ class LayerNormTest(test.TestCase):
             # Layer-norm implemented in numpy
             eps = 1e-12
             expected_out = (
-                (gamma * (
-                    input_values
-                    - np.mean(input_values, axis=moments_axis, keepdims=True))
-                 / np.sqrt(
-                     eps
-                     + np.var(input_values, axis=moments_axis, keepdims=True)))
-                + beta)
+                (gamma * (input_values - np.mean(
+                    input_values, axis=moments_axis, keepdims=True)) /
+                 np.sqrt(eps + np.var(
+                     input_values, axis=moments_axis, keepdims=True))) + beta)
             self.assertAllClose(expected_mean, mean, atol=tol, rtol=tol)
             self.assertAllClose(expected_var, var, atol=tol)
             # The full computation gets a bigger tolerance
@@ -2874,10 +2885,10 @@ class LayerNormTest(test.TestCase):
 
   def testOutput4DInputNormOnInnermostAxis(self):
     # Equivalent tests
-    self.doOutputTest((100, 10, 10, 3), begin_norm_axis=3, tol=1e-4,
-                      dtype=dtypes.float64)
-    self.doOutputTest((100, 10, 10, 3), begin_norm_axis=-1, tol=1e-4,
-                      dtype=dtypes.float64)
+    self.doOutputTest(
+        (100, 10, 10, 3), begin_norm_axis=3, tol=1e-4, dtype=dtypes.float64)
+    self.doOutputTest(
+        (100, 10, 10, 3), begin_norm_axis=-1, tol=1e-4, dtype=dtypes.float64)
 
   def testOutputSmallInput(self):
     self.doOutputTest((10, 10, 10, 30))
@@ -2914,7 +2925,7 @@ class GDNTest(test.TestCase):
       x = np.random.uniform(size=(1, 2, 3, 4)[:ndim])
       y = self._runGDN(x, x.shape, False, 'channels_last')
       self.assertEqual(x.shape, y.shape)
-      self.assertAllClose(y, x / np.sqrt(1 + .1 * (x ** 2)), rtol=0, atol=1e-6)
+      self.assertAllClose(y, x / np.sqrt(1 + .1 * (x**2)), rtol=0, atol=1e-6)
 
   def testChannelsFirst(self):
     # `bias_add` doesn't support NCHW on CPU.
@@ -2923,8 +2934,7 @@ class GDNTest(test.TestCase):
         x = np.random.uniform(size=(4, 3, 2, 1)[:ndim])
         y = self._runGDN(x, x.shape, False, 'channels_first')
         self.assertEqual(x.shape, y.shape)
-        self.assertAllClose(
-            y, x / np.sqrt(1 + .1 * (x ** 2)), rtol=0, atol=1e-6)
+        self.assertAllClose(y, x / np.sqrt(1 + .1 * (x**2)), rtol=0, atol=1e-6)
 
   def testWrongDims(self):
     for ndim in [1, 2, 6]:
@@ -2936,7 +2946,7 @@ class GDNTest(test.TestCase):
     x = np.random.uniform(size=(1, 2, 3, 4))
     y = self._runGDN(x, x.shape, True, 'channels_last')
     self.assertEqual(x.shape, y.shape)
-    self.assertAllClose(y, x * np.sqrt(1 + .1 * (x ** 2)), rtol=0, atol=1e-6)
+    self.assertAllClose(y, x * np.sqrt(1 + .1 * (x**2)), rtol=0, atol=1e-6)
 
 
 class MaxPool2DTest(test.TestCase):
@@ -3013,20 +3023,22 @@ class MaxPool3DTest(test.TestCase):
   def testInvalidDataFormat(self):
     depth, height, width = 3, 6, 9
     images = np.random.uniform(size=(5, depth, height, width, 3))
-    with self.assertRaisesRegexp(ValueError,
-                                 'data_format has to be either NCDHW or NDHWC.'):
+    with self.assertRaisesRegexp(
+        ValueError, 'data_format has to be either NCDHW or NDHWC.'):
       _layers.max_pool3d(images, [3, 3, 3], data_format='CDHWN')
 
   def testCreateMaxPool(self):
     depth, height, width = 3, 6, 9
-    images = np.random.uniform(size=(5, depth, height, width, 3)).astype(np.float32)
+    images = np.random.uniform(size=(5, depth, height, width, 3)).astype(
+        np.float32)
     output = _layers.max_pool3d(images, [3, 3, 3])
     self.assertEqual(output.op.name, 'MaxPool3D/MaxPool3D')
     self.assertListEqual(output.get_shape().as_list(), [5, 1, 2, 4, 3])
 
   def testCreateMaxPoolNCDHW(self):
     depth, height, width = 3, 6, 9
-    images = np.random.uniform(size=(5, 3, depth, height, width)).astype(np.float32)
+    images = np.random.uniform(size=(5, 3, depth, height, width)).astype(
+        np.float32)
     output = _layers.max_pool3d(images, [3, 3, 3], data_format='NCDHW')
     self.assertEquals(output.op.name, 'MaxPool3D/transpose_1')
     self.assertListEqual(output.get_shape().as_list(), [5, 3, 1, 2, 4])
@@ -3034,7 +3046,8 @@ class MaxPool3DTest(test.TestCase):
   def testCollectOutputs(self):
     depth, height, width = 3, 6, 9
     images = random_ops.random_uniform((5, depth, height, width, 3), seed=1)
-    output = _layers.max_pool3d(images, [3, 3, 3], outputs_collections='outputs')
+    output = _layers.max_pool3d(
+        images, [3, 3, 3], outputs_collections='outputs')
     output_collected = ops.get_collection('outputs')[0]
     self.assertEqual(output_collected.aliases, ['MaxPool3D'])
     self.assertEqual(output_collected, output)
@@ -3069,7 +3082,8 @@ class MaxPool3DTest(test.TestCase):
     depth, height, width = 3, 6, 9
     images = random_ops.random_uniform((5, depth, height, width, 3), seed=1)
     output = _layers.max_pool3d(images, [3, 3, 3], stride=1, padding='SAME')
-    self.assertListEqual(output.get_shape().as_list(), [5, depth, height, width, 3])
+    self.assertListEqual(output.get_shape().as_list(),
+                         [5, depth, height, width, 3])
 
   def testGlobalMaxPool(self):
     depth, height, width = 3, 6, 9
@@ -3481,8 +3495,7 @@ class SpatialSoftmaxTests(test.TestCase):
       sess.run(variables_lib.global_variables_initializer())
       feed_dict = {features: np_features}
       keypoints = sess.run(spatial_softmax, feed_dict)
-      self.assertAllEqual(keypoints.shape,
-                          (batch_shape[0], batch_shape[3] * 2))
+      self.assertAllEqual(keypoints.shape, (batch_shape[0], batch_shape[3] * 2))
 
   def testSpatialSoftmaxShapeNCHW(self):
     batch_shape = (2, 2, 35, 35)
@@ -3493,8 +3506,7 @@ class SpatialSoftmaxTests(test.TestCase):
       sess.run(variables_lib.global_variables_initializer())
       feed_dict = {features: np_features}
       keypoints = sess.run(spatial_softmax, feed_dict)
-      self.assertAllEqual(keypoints.shape,
-                          (batch_shape[0], batch_shape[1] * 2))
+      self.assertAllEqual(keypoints.shape, (batch_shape[0], batch_shape[1] * 2))
 
   def testTwoMaxActivationsSameChannel(self):
     batch_size, height, width, nchannels = (2, 35, 35, 1)
@@ -3513,8 +3525,8 @@ class SpatialSoftmaxTests(test.TestCase):
     x_loc = [avg_x]
     y_loc = [avg_y]
 
-    np_keypoints = self._SpatialSoftmax(
-        x_loc, y_loc, height, width, batch_size, nchannels)
+    np_keypoints = self._SpatialSoftmax(x_loc, y_loc, height, width, batch_size,
+                                        nchannels)
 
     # Make sure expected location keypoints matches actual location keypoints.
     with self.test_session() as sess:
@@ -3532,13 +3544,13 @@ class SpatialSoftmaxTests(test.TestCase):
     spatial_softmax = _layers.spatial_softmax(features)
     np_features = np.zeros(batch_shape, dtype=np.float32)
 
-    edges = [(0, 0), (0, width-1), (height-1, 0), (height-1, width-1)]
+    edges = [(0, 0), (0, width - 1), (height - 1, 0), (height - 1, width - 1)]
     x_loc, y_loc = zip(*edges)
     for c in range(nchannels):
       np_features[:, x_loc[c], y_loc[c], c] = 100.
 
-    np_keypoints = self._SpatialSoftmax(
-        x_loc, y_loc, height, width, batch_size, nchannels)
+    np_keypoints = self._SpatialSoftmax(x_loc, y_loc, height, width, batch_size,
+                                        nchannels)
 
     # Make sure expected location keypoints matches actual location keypoints.
     with self.test_session() as sess:
@@ -3567,10 +3579,10 @@ class SpatialSoftmaxTests(test.TestCase):
       np_features1[:, x_loc[c], y_loc[c], c] = 100.
       np_features2[:, x_loc[c], y_loc[c], c] = 100.
 
-    np_keypoints1 = self._SpatialSoftmax(
-        x_loc, y_loc, height1, width1, batch_size, nchannels)
-    np_keypoints2 = self._SpatialSoftmax(
-        x_loc, y_loc, height2, width2, batch_size, nchannels)
+    np_keypoints1 = self._SpatialSoftmax(x_loc, y_loc, height1, width1,
+                                         batch_size, nchannels)
+    np_keypoints2 = self._SpatialSoftmax(x_loc, y_loc, height2, width2,
+                                         batch_size, nchannels)
 
     # Make sure expected location keypoints matches actual location keypoints.
     with self.test_session() as sess:
@@ -3596,8 +3608,8 @@ class SpatialSoftmaxTests(test.TestCase):
     for c in range(nchannels):
       np_features[:, x_loc[c], y_loc[c], c] = 100.
 
-    np_keypoints = self._SpatialSoftmax(
-        x_loc, y_loc, height, width, batch_size, nchannels)
+    np_keypoints = self._SpatialSoftmax(x_loc, y_loc, height, width, batch_size,
+                                        nchannels)
 
     # Make sure expected location keypoints matches actual location keypoints.
     with self.test_session() as sess:
@@ -3619,8 +3631,8 @@ class SpatialSoftmaxTests(test.TestCase):
     for c in range(nchannels):
       np_features[:, c, x_loc[c], y_loc[c]] = 100.
 
-    np_keypoints = self._SpatialSoftmax(
-        x_loc, y_loc, height, width, batch_size, nchannels)
+    np_keypoints = self._SpatialSoftmax(x_loc, y_loc, height, width, batch_size,
+                                        nchannels)
 
     # Make sure expected location keypoints matches actual location keypoints.
     with self.test_session() as sess:
@@ -3715,8 +3727,7 @@ class UnitNormTests(test.TestCase):
       image = random_ops.random_uniform((height, width, 3))
       output = _layers.unit_norm(image, dim=dim, epsilon=1e-6)
       norms = math_ops.sqrt(
-          math_ops.reduce_sum(
-              math_ops.square(output), reduction_indices=dim))
+          math_ops.reduce_sum(math_ops.square(output), reduction_indices=dim))
 
       shape = [height, width, 3]
       del shape[dim]
@@ -3752,8 +3763,7 @@ class UnitNormTests(test.TestCase):
       image = array_ops.placeholder(dtypes.float32, (None, None, 3))
       output = _layers.unit_norm(image, dim=dim, epsilon=1e-6)
       norms = math_ops.sqrt(
-          math_ops.reduce_sum(
-              math_ops.square(output), reduction_indices=dim))
+          math_ops.reduce_sum(math_ops.square(output), reduction_indices=dim))
 
       with self.test_session():
         actual = norms.eval({image: placeholder_value})
@@ -3817,8 +3827,8 @@ class PoincareNormalizeTest(test.TestCase):
       with self.test_session():
         x_tf = constant_op.constant(x_np, name='x')
         y_tf = _layers.poincare_normalize(x_tf, dim)
-        err = gradient_checker.compute_gradient_error(x_tf, x_shape,
-                                                      y_tf, x_shape)
+        err = gradient_checker.compute_gradient_error(x_tf, x_shape, y_tf,
+                                                      x_shape)
       print('PoinCareNormalize gradient err = %g ' % err)
       self.assertLess(err, 1e-4)
 
@@ -3830,14 +3840,9 @@ class LegacyFullyConnectedTest(test.TestCase):
     test.TestCase.setUp(self)
     random_seed.set_random_seed(1234)
     self.input = constant_op.constant([[1., 2., 3.], [-4., 15., -6.]])
-    self.input_3_dim_arr = [[[1., 1.1, 1.2],
-                             [2., 2.1, 2.2],
-                             [3., 3.1, 3.2],
-                             [4., 4.1, 4.2]],
-                            [[5., 5.1, 5.2],
-                             [6., 6.1, 6.2],
-                             [7., 7.1, 7.2],
-                             [8., 8.1, 8.2]]]
+    self.input_3_dim_arr = [[[1., 1.1, 1.2], [2., 2.1, 2.2], [3., 3.1, 3.2],
+                             [4., 4.1, 4.2]], [[5., 5.1, 5.2], [6., 6.1, 6.2],
+                                               [7., 7.1, 7.2], [8., 8.1, 8.2]]]
     self.input_3_dim = constant_op.constant(self.input_3_dim_arr)
 
     assert not ops.get_collection(ops.GraphKeys.SUMMARIES)
@@ -3932,15 +3937,10 @@ class LegacyFullyConnectedTest(test.TestCase):
     self._custom_initializers(self.input, 2, [[13.0, 13.0], [11.0, 11.0]])
 
   def test_custom_initializers_multi_dim(self):
-    self._custom_initializers(self.input_3_dim, 2,
-                              [[[7.6, 7.6],
-                                [13.6, 13.6],
-                                [19.6, 19.6],
-                                [25.6, 25.6]],
-                               [[31.6, 31.6],
-                                [37.6, 37.6],
-                                [43.6, 43.6],
-                                [49.6, 49.6]]])
+    self._custom_initializers(
+        self.input_3_dim, 2,
+        [[[7.6, 7.6], [13.6, 13.6], [19.6, 19.6], [25.6, 25.6]],
+         [[31.6, 31.6], [37.6, 37.6], [43.6, 43.6], [49.6, 49.6]]])
 
   def test_custom_collections(self):
     layers_lib.legacy_relu(
@@ -4050,12 +4050,16 @@ class LegacyFullyConnectedTest(test.TestCase):
     with self.test_session() as sess:
       variables_lib.global_variables_initializer().run()
       # we can feed in input with first dimension 2
-      shape_value = sess.run(array_ops.shape(y),
-                             feed_dict={x: self.input_3_dim_arr})
+      shape_value = sess.run(
+          array_ops.shape(y), feed_dict={
+              x: self.input_3_dim_arr
+          })
       self.assertAllClose(shape_value, [2, 4, 1])
       # we can feed in input with first dimension 1
-      shape_value = sess.run(array_ops.shape(y),
-                             feed_dict={x: [self.input_3_dim_arr[0]]})
+      shape_value = sess.run(
+          array_ops.shape(y), feed_dict={
+              x: [self.input_3_dim_arr[0]]
+          })
       self.assertAllClose(shape_value, [1, 4, 1])
       # we cannot feed in input with inconsistent dimensions
       with self.assertRaises(ValueError):
diff --git a/tensorflow/contrib/learn/python/learn/datasets/__init__.py b/tensorflow/contrib/learn/python/learn/datasets/__init__.py
index a3521b4109ab40d8478f20afc317cf5154da2b43..7240b0de149051afa045a8113f9e9b212840c311 100644
--- a/tensorflow/contrib/learn/python/learn/datasets/__init__.py
+++ b/tensorflow/contrib/learn/python/learn/datasets/__init__.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Dataset utilities and synthetic/reference datasets."""
 
 from __future__ import absolute_import
@@ -46,11 +45,12 @@ DATASETS = {
 
 # List of all synthetic datasets
 SYNTHETIC = {
-  # All of these will return ['data', 'target'] -> base.Dataset
-  'circles': synthetic.circles,
-  'spirals': synthetic.spirals
+    # All of these will return ['data', 'target'] -> base.Dataset
+    'circles': synthetic.circles,
+    'spirals': synthetic.spirals
 }
 
+
 def load_dataset(name, size='small', test_with_fake_data=False):
   """Loads dataset by name.
 
@@ -83,23 +83,28 @@ def make_dataset(name, n_samples=100, noise=None, seed=42, *args, **kwargs):
     seed: int or None, seed for noise
 
   Returns:
-    Shuffled features and labels for given synthetic dataset of type `base.Dataset`
+    Shuffled features and labels for given synthetic dataset of type
+    `base.Dataset`
 
   Raises:
     ValueError: Raised if `name` not found
 
   Note:
-    - This is a generic synthetic data generator - individual generators might have more parameters!
+    - This is a generic synthetic data generator - individual generators might
+    have more parameters!
       See documentation for individual parameters
-    - Note that the `noise` parameter uses `numpy.random.normal` and depends on `numpy`'s seed
+    - Note that the `noise` parameter uses `numpy.random.normal` and depends on
+    `numpy`'s seed
 
   TODO:
     - Support multiclass datasets
-    - Need shuffling routine. Currently synthetic datasets are reshuffled to avoid train/test correlation,
+    - Need shuffling routine. Currently synthetic datasets are reshuffled to
+    avoid train/test correlation,
       but that hurts reprodusability
   """
   # seed = kwargs.pop('seed', None)
   if name not in SYNTHETIC:
     raise ValueError('Synthetic dataset not found or not implemeted: %s' % name)
   else:
-    return SYNTHETIC[name](n_samples=n_samples, noise=noise, seed=seed, *args, **kwargs)
+    return SYNTHETIC[name](
+        n_samples=n_samples, noise=noise, seed=seed, *args, **kwargs)
diff --git a/tensorflow/contrib/learn/python/learn/datasets/base.py b/tensorflow/contrib/learn/python/learn/datasets/base.py
index 71978d439449e29c7cb907b18bab5d6659a972b6..18bf16e246bcb6c0a6a4ce75bc5c28d4e0d045e5 100644
--- a/tensorflow/contrib/learn/python/learn/datasets/base.py
+++ b/tensorflow/contrib/learn/python/learn/datasets/base.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Base utilities for loading datasets."""
 
 from __future__ import absolute_import
@@ -100,9 +99,7 @@ def load_iris(data_path=None):
     module_path = path.dirname(__file__)
     data_path = path.join(module_path, 'data', 'iris.csv')
   return load_csv_with_header(
-      data_path,
-      target_dtype=np.int,
-      features_dtype=np.float)
+      data_path, target_dtype=np.int, features_dtype=np.float)
 
 
 def load_boston(data_path=None):
@@ -118,16 +115,10 @@ def load_boston(data_path=None):
     module_path = path.dirname(__file__)
     data_path = path.join(module_path, 'data', 'boston_house_prices.csv')
   return load_csv_with_header(
-      data_path,
-      target_dtype=np.float,
-      features_dtype=np.float)
+      data_path, target_dtype=np.float, features_dtype=np.float)
 
 
-def retry(initial_delay,
-          max_delay,
-          factor=2.0,
-          jitter=0.25,
-          is_retriable=None):
+def retry(initial_delay, max_delay, factor=2.0, jitter=0.25, is_retriable=None):
   """Simple decorator for wrapping retriable functions.
 
   Args:
@@ -152,7 +143,7 @@ def retry(initial_delay,
   def delays():
     delay = initial_delay
     while delay <= max_delay:
-      yield delay * random.uniform(1 - jitter,  1 + jitter)
+      yield delay * random.uniform(1 - jitter, 1 + jitter)
       delay *= factor
 
   def wrap(fn):
@@ -172,7 +163,9 @@ def retry(initial_delay,
           else:
             raise
       return fn(*args, **kwargs)
+
     return wrapped_fn
+
   return wrap
 
 
diff --git a/tensorflow/contrib/learn/python/learn/datasets/mnist.py b/tensorflow/contrib/learn/python/learn/datasets/mnist.py
index 1f3295747e141760445b021bf4f59cc47b88b8b2..37f9175015a239f763c7721cf36ab8063c0a3e32 100644
--- a/tensorflow/contrib/learn/python/learn/datasets/mnist.py
+++ b/tensorflow/contrib/learn/python/learn/datasets/mnist.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Functions for downloading and reading MNIST data."""
 
 from __future__ import absolute_import
@@ -123,8 +122,8 @@ class DataSet(object):
     numpy.random.seed(seed1 if seed is None else seed2)
     dtype = dtypes.as_dtype(dtype).base_dtype
     if dtype not in (dtypes.uint8, dtypes.float32):
-      raise TypeError('Invalid image dtype %r, expected uint8 or float32' %
-                      dtype)
+      raise TypeError(
+          'Invalid image dtype %r, expected uint8 or float32' % dtype)
     if fake_data:
       self._num_examples = 10000
       self.one_hot = one_hot
@@ -202,7 +201,9 @@ class DataSet(object):
       end = self._index_in_epoch
       images_new_part = self._images[start:end]
       labels_new_part = self._labels[start:end]
-      return numpy.concatenate((images_rest_part, images_new_part), axis=0) , numpy.concatenate((labels_rest_part, labels_new_part), axis=0)
+      return numpy.concatenate(
+          (images_rest_part, images_new_part), axis=0), numpy.concatenate(
+              (labels_rest_part, labels_new_part), axis=0)
     else:
       self._index_in_epoch += batch_size
       end = self._index_in_epoch
@@ -257,16 +258,14 @@ def read_data_sets(train_dir,
     test_labels = extract_labels(f, one_hot=one_hot)
 
   if not 0 <= validation_size <= len(train_images):
-    raise ValueError(
-        'Validation size should be between 0 and {}. Received: {}.'
-        .format(len(train_images), validation_size))
+    raise ValueError('Validation size should be between 0 and {}. Received: {}.'
+                     .format(len(train_images), validation_size))
 
   validation_images = train_images[:validation_size]
   validation_labels = train_labels[:validation_size]
   train_images = train_images[validation_size:]
   train_labels = train_labels[validation_size:]
 
-
   options = dict(dtype=dtype, reshape=reshape, seed=seed)
 
   train = DataSet(train_images, train_labels, **options)
diff --git a/tensorflow/contrib/learn/python/learn/datasets/synthetic.py b/tensorflow/contrib/learn/python/learn/datasets/synthetic.py
index 907dc0f3dfced7e55c5f46711fbe93f6400e1de7..9a843168c27d9cae3f55efe4fe4c688d86c745f3 100644
--- a/tensorflow/contrib/learn/python/learn/datasets/synthetic.py
+++ b/tensorflow/contrib/learn/python/learn/datasets/synthetic.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Synthetic dataset generators."""
 
 from __future__ import absolute_import
@@ -23,18 +22,27 @@ import numpy as np
 
 from tensorflow.contrib.learn.python.learn.datasets.base import Dataset
 
-def circles(n_samples=100, noise=None, seed=None, factor=0.8, n_classes=2, *args, **kwargs):
+
+def circles(n_samples=100,
+            noise=None,
+            seed=None,
+            factor=0.8,
+            n_classes=2,
+            *args,
+            **kwargs):
   """Create circles separated by some value
 
   Args:
     n_samples: int, number of datapoints to generate
     noise: float or None, standard deviation of the Gaussian noise added
     seed: int or None, seed for the noise
-    factor: float, size factor of the inner circles with respect to the outer ones
+    factor: float, size factor of the inner circles with respect to the outer
+      ones
     n_classes: int, number of classes to generate
 
   Returns:
-    Shuffled features and labels for 'circles' synthetic dataset of type `base.Dataset`
+    Shuffled features and labels for 'circles' synthetic dataset of type
+    `base.Dataset`
 
   Note:
     The multi-class support might not work as expected if `noise` is enabled
@@ -54,7 +62,7 @@ def circles(n_samples=100, noise=None, seed=None, factor=0.8, n_classes=2, *args
   if seed is not None:
     np.random.seed(seed)
   # Algo: 1) Generate initial circle, 2) For ever class generate a smaller radius circle
-  linspace = np.linspace(0, 2*np.pi, n_samples // n_classes)
+  linspace = np.linspace(0, 2 * np.pi, n_samples // n_classes)
   circ_x = np.empty(0, dtype=np.int32)
   circ_y = np.empty(0, dtype=np.int32)
   base_cos = np.cos(linspace)
@@ -66,12 +74,12 @@ def circles(n_samples=100, noise=None, seed=None, factor=0.8, n_classes=2, *args
     circ_y = np.append(circ_y, base_sin)
     base_cos *= factor
     base_sin *= factor
-    y = np.append(y, label*np.ones(n_samples // n_classes, dtype=np.int32))
+    y = np.append(y, label * np.ones(n_samples // n_classes, dtype=np.int32))
 
   # Add more points if n_samples is not divisible by n_classes (unbalanced!)
   extras = n_samples % n_classes
-  circ_x = np.append(circ_x, np.cos(np.random.rand(extras)*2*np.pi))
-  circ_y = np.append(circ_y, np.sin(np.random.rand(extras)*2*np.pi))
+  circ_x = np.append(circ_x, np.cos(np.random.rand(extras) * 2 * np.pi))
+  circ_y = np.append(circ_y, np.sin(np.random.rand(extras) * 2 * np.pi))
   y = np.append(y, np.zeros(extras, dtype=np.int32))
 
   # Reshape the features/labels
@@ -85,10 +93,13 @@ def circles(n_samples=100, noise=None, seed=None, factor=0.8, n_classes=2, *args
   return Dataset(data=X[indices], target=y[indices])
 
 
-def spirals(n_samples=100, noise=None, seed=None,
-            mode = 'archimedes',
-            n_loops = 2,
-            *args, **kwargs):
+def spirals(n_samples=100,
+            noise=None,
+            seed=None,
+            mode='archimedes',
+            n_loops=2,
+            *args,
+            **kwargs):
   """Create spirals
 
   Currently only binary classification is supported for spiral generation
@@ -104,7 +115,8 @@ def spirals(n_samples=100, noise=None, seed=None,
       'fermat': a spiral with branch distances decreasing (sqrt)
 
   Returns:
-    Shuffled features and labels for 'spirals' synthetic dataset of type `base.Dataset`
+    Shuffled features and labels for 'spirals' synthetic dataset of type
+    `base.Dataset`
 
   Raises:
     ValueError: If the generation `mode` is not valid
@@ -112,34 +124,35 @@ def spirals(n_samples=100, noise=None, seed=None,
   TODO:
     - Generation of unbalanced data
   """
-  n_classes = 2 # I am not sure how to make it multiclass
+  n_classes = 2  # I am not sure how to make it multiclass
 
   _modes = {
-    'archimedes': _archimedes_spiral,
-    'bernoulli': _bernoulli_spiral,
-    'fermat': _fermat_spiral
+      'archimedes': _archimedes_spiral,
+      'bernoulli': _bernoulli_spiral,
+      'fermat': _fermat_spiral
   }
 
   if mode is None or mode not in _modes:
-    raise ValueError("Cannot generate spiral with mode %s"%mode)
+    raise ValueError('Cannot generate spiral with mode %s' % mode)
 
   if seed is not None:
     np.random.seed(seed)
-  linspace = np.linspace(0, 2*n_loops*np.pi, n_samples // n_classes)
+  linspace = np.linspace(0, 2 * n_loops * np.pi, n_samples // n_classes)
   spir_x = np.empty(0, dtype=np.int32)
   spir_y = np.empty(0, dtype=np.int32)
 
   y = np.empty(0, dtype=np.int32)
   for label in range(n_classes):
-    base_cos, base_sin = _modes[mode](linspace, label*np.pi, *args, **kwargs)
+    base_cos, base_sin = _modes[mode](linspace, label * np.pi, *args, **kwargs)
     spir_x = np.append(spir_x, base_cos)
     spir_y = np.append(spir_y, base_sin)
-    y = np.append(y, label*np.ones(n_samples // n_classes, dtype=np.int32))
+    y = np.append(y, label * np.ones(n_samples // n_classes, dtype=np.int32))
 
   # Add more points if n_samples is not divisible by n_classes (unbalanced!)
   extras = n_samples % n_classes
   if extras > 0:
-    x_exrta, y_extra = _modes[mode](np.random.rand(extras)*2*np.pi, *args, **kwargs)
+    x_extra, y_extra = _modes[mode](np.random.rand(extras) * 2 * np.pi, *args,
+                                    **kwargs)
     spir_x = np.append(spir_x, x_extra)
     spir_y = np.append(spir_y, y_extra)
     y = np.append(y, np.zeros(extras, dtype=np.int32))
@@ -162,7 +175,8 @@ def _archimedes_spiral(theta, theta_offset=0., *args, **kwargs):
     theta: array-like, angles from polar coordinates to be converted
     theta_offset: float, angle offset in radians (2*pi = 0)
   """
-  x, y = theta*np.cos(theta + theta_offset), theta*np.sin(theta + theta_offset)
+  x, y = theta * np.cos(theta + theta_offset), theta * np.sin(
+      theta + theta_offset)
   x_norm = np.max(np.abs(x))
   y_norm = np.max(np.abs(y))
   x, y = x / x_norm, y / y_norm
@@ -181,7 +195,8 @@ def _bernoulli_spiral(theta, theta_offset=0., *args, **kwargs):
   """
   exp_scale = kwargs.pop('exp_scale', 0.1)
 
-  x, y = np.exp(exp_scale*theta)*np.cos(theta + theta_offset), np.exp(exp_scale*theta)*np.sin(theta + theta_offset)
+  x, y = np.exp(exp_scale * theta) * np.cos(theta + theta_offset), np.exp(
+      exp_scale * theta) * np.sin(theta + theta_offset)
   x_norm = np.max(np.abs(x))
   y_norm = np.max(np.abs(y))
   x, y = x / x_norm, y / y_norm
@@ -195,7 +210,8 @@ def _fermat_spiral(theta, theta_offset=0., *args, **kwargs):
     theta: array-like, angles from polar coordinates to be converted
     theta_offset: float, angle offset in radians (2*pi = 0)
   """
-  x, y = np.sqrt(theta)*np.cos(theta + theta_offset), np.sqrt(theta)*np.sin(theta + theta_offset)
+  x, y = np.sqrt(theta) * np.cos(theta + theta_offset), np.sqrt(theta) * np.sin(
+      theta + theta_offset)
   x_norm = np.max(np.abs(x))
   y_norm = np.max(np.abs(y))
   x, y = x / x_norm, y / y_norm
diff --git a/tensorflow/contrib/learn/python/learn/datasets/synthetic_test.py b/tensorflow/contrib/learn/python/learn/datasets/synthetic_test.py
index 5340afab46eba957d6d612bb583983b627537547..5809995c8c7d8e72eb47ee88a72547bae7fd3594 100644
--- a/tensorflow/contrib/learn/python/learn/datasets/synthetic_test.py
+++ b/tensorflow/contrib/learn/python/learn/datasets/synthetic_test.py
@@ -24,12 +24,14 @@ from tensorflow.python.platform import test
 from tensorflow.contrib.learn.python.learn import datasets
 from tensorflow.contrib.learn.python.learn.datasets import synthetic
 
+
 class SyntheticTest(test.TestCase):
   """Test synthetic dataset generation"""
 
   def test_make_dataset(self):
     """Test if the synthetic routine wrapper complains about the name"""
-    self.assertRaises(ValueError, datasets.make_dataset, name='_non_existing_name')
+    self.assertRaises(
+        ValueError, datasets.make_dataset, name='_non_existing_name')
 
   def test_all_datasets_callable(self):
     """Test if all methods inside the `SYNTHETIC` are callable"""
@@ -52,9 +54,10 @@ class SyntheticTest(test.TestCase):
     """
     n_samples = 100
     n_classes = 2
-    circ = synthetic.circles(n_samples = n_samples, noise = None, n_classes = n_classes)
+    circ = synthetic.circles(
+        n_samples=n_samples, noise=None, n_classes=n_classes)
     self.assertIsInstance(circ, datasets.base.Dataset)
-    self.assertTupleEqual(circ.data.shape, (n_samples,2))
+    self.assertTupleEqual(circ.data.shape, (n_samples, 2))
     self.assertTupleEqual(circ.target.shape, (n_samples,))
     self.assertSetEqual(set(circ.target), set(range(n_classes)))
 
@@ -67,17 +70,24 @@ class SyntheticTest(test.TestCase):
     """
     seed = 42
     noise = 0.1
-    circ0 = synthetic.circles(n_samples = 100, noise = noise, n_classes = 2, seed = seed)
-    circ1 = synthetic.circles(n_samples = 100, noise = noise, n_classes = 2, seed = seed)
+    circ0 = synthetic.circles(
+        n_samples=100, noise=noise, n_classes=2, seed=seed)
+    circ1 = synthetic.circles(
+        n_samples=100, noise=noise, n_classes=2, seed=seed)
     np.testing.assert_array_equal(circ0.data, circ1.data)
     np.testing.assert_array_equal(circ0.target, circ1.target)
 
-    circ1 = synthetic.circles(n_samples = 100, noise = noise, n_classes = 2, seed = seed+1)
-    self.assertRaises(AssertionError, np.testing.assert_array_equal, circ0.data, circ1.data)
-    self.assertRaises(AssertionError, np.testing.assert_array_equal, circ0.target, circ1.target)
+    circ1 = synthetic.circles(
+        n_samples=100, noise=noise, n_classes=2, seed=seed + 1)
+    self.assertRaises(AssertionError, np.testing.assert_array_equal, circ0.data,
+                      circ1.data)
+    self.assertRaises(AssertionError, np.testing.assert_array_equal,
+                      circ0.target, circ1.target)
 
-    circ1 = synthetic.circles(n_samples = 100, noise = noise/2., n_classes = 2, seed = seed)
-    self.assertRaises(AssertionError, np.testing.assert_array_equal, circ0.data, circ1.data)
+    circ1 = synthetic.circles(
+        n_samples=100, noise=noise / 2., n_classes=2, seed=seed)
+    self.assertRaises(AssertionError, np.testing.assert_array_equal, circ0.data,
+                      circ1.data)
 
   def test_spirals(self):
     """Test if the circles are generated correctly
@@ -89,13 +99,14 @@ class SyntheticTest(test.TestCase):
       - returned `target` shape is (n_samples,)
       - set of unique classes range is [0, n_classes)
     """
-    self.assertRaises(ValueError, synthetic.spirals, mode='_unknown_mode_spiral_')
+    self.assertRaises(
+        ValueError, synthetic.spirals, mode='_unknown_mode_spiral_')
     n_samples = 100
     modes = ('archimedes', 'bernoulli', 'fermat')
     for mode in modes:
-      spir = synthetic.spirals(n_samples = n_samples, noise = None, mode = mode)
+      spir = synthetic.spirals(n_samples=n_samples, noise=None, mode=mode)
       self.assertIsInstance(spir, datasets.base.Dataset)
-      self.assertTupleEqual(spir.data.shape, (n_samples,2))
+      self.assertTupleEqual(spir.data.shape, (n_samples, 2))
       self.assertTupleEqual(spir.target.shape, (n_samples,))
       self.assertSetEqual(set(spir.target), set(range(2)))
 
@@ -110,18 +121,24 @@ class SyntheticTest(test.TestCase):
     noise = 0.1
     modes = ('archimedes', 'bernoulli', 'fermat')
     for mode in modes:
-      spir0 = synthetic.spirals(n_samples = 1000, noise = noise, seed = seed)
-      spir1 = synthetic.spirals(n_samples = 1000, noise = noise, seed = seed)
+      spir0 = synthetic.spirals(n_samples=1000, noise=noise, seed=seed)
+      spir1 = synthetic.spirals(n_samples=1000, noise=noise, seed=seed)
       np.testing.assert_array_equal(spir0.data, spir1.data)
       np.testing.assert_array_equal(spir0.target, spir1.target)
 
-      spir1 = synthetic.spirals(n_samples = 1000, noise = noise, seed = seed+1)
-      self.assertRaises(AssertionError, np.testing.assert_array_equal, spir0.data, spir1.data)
-      self.assertRaises(AssertionError, np.testing.assert_array_equal, spir0.target, spir1.target)
+      spir1 = synthetic.spirals(n_samples=1000, noise=noise, seed=seed + 1)
+      self.assertRaises(AssertionError, np.testing.assert_array_equal,
+                        spir0.data, spir1.data)
+      self.assertRaises(AssertionError, np.testing.assert_array_equal,
+                        spir0.target, spir1.target)
+
+      spir1 = synthetic.spirals(n_samples=1000, noise=noise / 2., seed=seed)
+      self.assertRaises(AssertionError, np.testing.assert_array_equal,
+                        spir0.data, spir1.data)
 
-      spir1 = synthetic.spirals(n_samples = 1000, noise = noise/2., seed = seed)
-      self.assertRaises(AssertionError, np.testing.assert_array_equal, spir0.data, spir1.data)
+  def test_spirals_synthetic(self):
+    synthetic.spirals(3)
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/learn/python/learn/estimators/debug_test.py b/tensorflow/contrib/learn/python/learn/estimators/debug_test.py
index 6b125534a42c5cdde69773d99cefd6e7b2d60c9c..b968aeed1b7a11d522b531783f04f0104b37904f 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/debug_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/debug_test.py
@@ -44,7 +44,6 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 from tensorflow.python.training import input as input_lib
 
-
 NUM_EXAMPLES = 100
 N_CLASSES = 5  #  Cardinality of multiclass labels.
 LABEL_DIMENSION = 3  #  Dimensionality of regression labels.
@@ -52,8 +51,10 @@ LABEL_DIMENSION = 3  #  Dimensionality of regression labels.
 
 def _train_test_split(features_and_labels):
   features, labels = features_and_labels
-  train_set = (features[:int(len(features) / 2)], labels[:int(len(features) / 2)])
-  test_set = (features[int(len(features) / 2):], labels[int(len(features) / 2):])
+  train_set = (features[:int(len(features) / 2)],
+               labels[:int(len(features) / 2)])
+  test_set = (features[int(len(features) / 2):],
+              labels[int(len(features) / 2):])
   return train_set, test_set
 
 
@@ -86,17 +87,17 @@ class DebugClassifierTest(test.TestCase):
     (train_features, train_labels), (test_features,
                                      test_labels) = _train_test_split(
                                          [self.features, self.labels])
-    majority_class, _ = max(collections.Counter(train_labels).items(),
-                            key=operator.itemgetter(1))
+    majority_class, _ = max(
+        collections.Counter(train_labels).items(), key=operator.itemgetter(1))
     expected_prediction = np.vstack(
         [[majority_class] for _ in range(test_labels.shape[0])])
 
     classifier = debug.DebugClassifier(n_classes=N_CLASSES)
-    classifier.fit(input_fn=_input_fn_builder(train_features, train_labels),
-                   steps=50)
+    classifier.fit(
+        input_fn=_input_fn_builder(train_features, train_labels), steps=50)
 
-    pred = classifier.predict_classes(input_fn=_input_fn_builder(test_features,
-                                                                 None))
+    pred = classifier.predict_classes(
+        input_fn=_input_fn_builder(test_features, None))
     self.assertAllEqual(expected_prediction, np.vstack(pred))
 
   def testPredictBinary(self):
@@ -105,34 +106,34 @@ class DebugClassifierTest(test.TestCase):
                                      test_labels) = _train_test_split(
                                          [self.features, self.binary_labels])
 
-    majority_class, _ = max(collections.Counter(train_labels).items(),
-                            key=operator.itemgetter(1))
+    majority_class, _ = max(
+        collections.Counter(train_labels).items(), key=operator.itemgetter(1))
     expected_prediction = np.vstack(
         [[majority_class] for _ in range(test_labels.shape[0])])
 
     classifier = debug.DebugClassifier(n_classes=2)
-    classifier.fit(input_fn=_input_fn_builder(train_features, train_labels),
-                   steps=50)
+    classifier.fit(
+        input_fn=_input_fn_builder(train_features, train_labels), steps=50)
 
-    pred = classifier.predict_classes(input_fn=_input_fn_builder(test_features,
-                                                                 None))
+    pred = classifier.predict_classes(
+        input_fn=_input_fn_builder(test_features, None))
     self.assertAllEqual(expected_prediction, np.vstack(pred))
 
-    (train_features, train_labels), (
-        test_features, test_labels) = _train_test_split(
-            [self.features, self.binary_float_labels])
+    (train_features,
+     train_labels), (test_features, test_labels) = _train_test_split(
+         [self.features, self.binary_float_labels])
 
-    majority_class, _ = max(collections.Counter(train_labels).items(),
-                            key=operator.itemgetter(1))
+    majority_class, _ = max(
+        collections.Counter(train_labels).items(), key=operator.itemgetter(1))
     expected_prediction = np.vstack(
         [[majority_class] for _ in range(test_labels.shape[0])])
 
     classifier = debug.DebugClassifier(n_classes=2)
-    classifier.fit(input_fn=_input_fn_builder(train_features, train_labels),
-                   steps=50)
+    classifier.fit(
+        input_fn=_input_fn_builder(train_features, train_labels), steps=50)
 
-    pred = classifier.predict_classes(input_fn=_input_fn_builder(test_features,
-                                                                 None))
+    pred = classifier.predict_classes(
+        input_fn=_input_fn_builder(test_features, None))
     self.assertAllEqual(expected_prediction, np.vstack(pred))
 
   def testPredictProba(self):
@@ -150,8 +151,8 @@ class DebugClassifierTest(test.TestCase):
         [class_distribution for _ in range(test_labels.shape[0])])
 
     classifier = debug.DebugClassifier(n_classes=N_CLASSES)
-    classifier.fit(input_fn=_input_fn_builder(train_features, train_labels),
-                   steps=50)
+    classifier.fit(
+        input_fn=_input_fn_builder(train_features, train_labels), steps=50)
 
     pred = classifier.predict_proba(
         input_fn=_input_fn_builder(test_features, None))
@@ -173,17 +174,17 @@ class DebugClassifierTest(test.TestCase):
         [class_distribution for _ in range(test_labels.shape[0])])
 
     classifier = debug.DebugClassifier(n_classes=2)
-    classifier.fit(input_fn=_input_fn_builder(train_features, train_labels),
-                   steps=50)
+    classifier.fit(
+        input_fn=_input_fn_builder(train_features, train_labels), steps=50)
 
     pred = classifier.predict_proba(
         input_fn=_input_fn_builder(test_features, None))
 
     self.assertAllClose(expected_prediction, np.vstack(pred), atol=0.1)
 
-    (train_features, train_labels), (
-        test_features, test_labels) = _train_test_split(
-            [self.features, self.binary_float_labels])
+    (train_features,
+     train_labels), (test_features, test_labels) = _train_test_split(
+         [self.features, self.binary_float_labels])
 
     class_distribution = np.zeros((1, 2))
     for label in train_labels:
@@ -194,8 +195,8 @@ class DebugClassifierTest(test.TestCase):
         [class_distribution for _ in range(test_labels.shape[0])])
 
     classifier = debug.DebugClassifier(n_classes=2)
-    classifier.fit(input_fn=_input_fn_builder(train_features, train_labels),
-                   steps=50)
+    classifier.fit(
+        input_fn=_input_fn_builder(train_features, train_labels), steps=50)
 
     pred = classifier.predict_proba(
         input_fn=_input_fn_builder(test_features, None))
@@ -232,13 +233,12 @@ class DebugClassifierTest(test.TestCase):
     def _input_fn():
       iris = test_data.prepare_iris_data_for_logistic_regression()
       return {
-          'feature': constant_op.constant(
-              iris.data, dtype=dtypes.float32)
+          'feature': constant_op.constant(iris.data, dtype=dtypes.float32)
       }, constant_op.constant(
           iris.target, shape=[100], dtype=dtypes.int32)
 
-    classifier = debug.DebugClassifier(config=run_config.RunConfig(
-        tf_random_seed=1))
+    classifier = debug.DebugClassifier(
+        config=run_config.RunConfig(tf_random_seed=1))
     classifier.fit(input_fn=_input_fn, steps=5)
     scores = classifier.evaluate(input_fn=_input_fn, steps=1)
     self.assertIn('loss', scores)
@@ -342,8 +342,7 @@ class DebugClassifierTest(test.TestCase):
     def _input_fn():
       iris = base.load_iris()
       return {
-          'feature': constant_op.constant(
-              iris.data, dtype=dtypes.float32)
+          'feature': constant_op.constant(iris.data, dtype=dtypes.float32)
       }, constant_op.constant(
           iris.target, shape=[150], dtype=dtypes.int32)
 
@@ -387,7 +386,9 @@ class DebugClassifierTest(test.TestCase):
       # Create 4 rows, one of them (y = x), three of them (y=Not(x))
       # The logistic prediction should be (y = 0.25).
       labels = constant_op.constant([[1], [0], [0], [0]])
-      features = {'x': array_ops.ones(shape=[4, 1], dtype=dtypes.float32),}
+      features = {
+          'x': array_ops.ones(shape=[4, 1], dtype=dtypes.float32),
+      }
       return features, labels
 
     classifier = debug.DebugClassifier(n_classes=2)
@@ -404,8 +405,7 @@ class DebugClassifierTest(test.TestCase):
       # The logistic prediction should be (y = 0.25).
       labels = constant_op.constant([[1.], [0.], [0.], [0.]])
       features = {
-          'x': array_ops.ones(
-              shape=[4, 1], dtype=dtypes.float32),
+          'x': array_ops.ones(shape=[4, 1], dtype=dtypes.float32),
           'w': constant_op.constant([[1.], [1.], [1.], [1.]])
       }
       return features, labels
@@ -414,8 +414,7 @@ class DebugClassifierTest(test.TestCase):
       # 4 rows, with different weights.
       labels = constant_op.constant([[1.], [0.], [0.], [0.]])
       features = {
-          'x': array_ops.ones(
-              shape=[4, 1], dtype=dtypes.float32),
+          'x': array_ops.ones(shape=[4, 1], dtype=dtypes.float32),
           'w': constant_op.constant([[7.], [1.], [1.], [1.]])
       }
       return features, labels
@@ -438,8 +437,7 @@ class DebugClassifierTest(test.TestCase):
       # than (y=Not(x)) due to the relative higher weight of the first row.
       labels = constant_op.constant([[1], [0], [0], [0]])
       features = {
-          'x': array_ops.ones(
-              shape=[4, 1], dtype=dtypes.float32),
+          'x': array_ops.ones(shape=[4, 1], dtype=dtypes.float32),
           'w': constant_op.constant([[100.], [3.], [2.], [2.]])
       }
       return features, labels
@@ -448,8 +446,7 @@ class DebugClassifierTest(test.TestCase):
       # Create 4 rows (y = x)
       labels = constant_op.constant([[1], [1], [1], [1]])
       features = {
-          'x': array_ops.ones(
-              shape=[4, 1], dtype=dtypes.float32),
+          'x': array_ops.ones(shape=[4, 1], dtype=dtypes.float32),
           'w': constant_op.constant([[1.], [1.], [1.], [1.]])
       }
       return features, labels
@@ -469,8 +466,7 @@ class DebugClassifierTest(test.TestCase):
       features = {
           'x':
               input_lib.limit_epochs(
-                  array_ops.ones(
-                      shape=[4, 1], dtype=dtypes.float32),
+                  array_ops.ones(shape=[4, 1], dtype=dtypes.float32),
                   num_epochs=num_epochs),
       }
       return features, labels
@@ -578,12 +574,11 @@ class DebugClassifierTest(test.TestCase):
     language = feature_column.sparse_column_with_hash_bucket('language', 100)
     feature_columns = [
         feature_column.real_valued_column('age'),
-        feature_column.embedding_column(
-            language, dimension=1)
+        feature_column.embedding_column(language, dimension=1)
     ]
 
-    classifier = debug.DebugClassifier(config=run_config.RunConfig(
-        tf_random_seed=1))
+    classifier = debug.DebugClassifier(
+        config=run_config.RunConfig(tf_random_seed=1))
     classifier.fit(input_fn=input_fn, steps=5)
 
     def default_input_fn(unused_estimator, examples):
@@ -614,8 +609,8 @@ class DebugRegressorTest(test.TestCase):
     classifier.fit(
         input_fn=_input_fn_builder(train_features, train_labels), steps=50)
 
-    pred = classifier.predict_scores(input_fn=_input_fn_builder(test_features,
-                                                                None))
+    pred = classifier.predict_scores(
+        input_fn=_input_fn_builder(test_features, None))
     self.assertAllClose(expected_prediction, np.vstack(pred), atol=0.1)
 
   def testExperimentIntegration(self):
@@ -698,7 +693,9 @@ class DebugRegressorTest(test.TestCase):
       # Create 4 rows, one of them (y = x), three of them (y=Not(x))
       # The algorithm should learn (y = 0.25).
       labels = constant_op.constant([[1.], [0.], [0.], [0.]])
-      features = {'x': array_ops.ones(shape=[4, 1], dtype=dtypes.float32),}
+      features = {
+          'x': array_ops.ones(shape=[4, 1], dtype=dtypes.float32),
+      }
       return features, labels
 
     regressor = debug.DebugRegressor(
@@ -853,5 +850,6 @@ class DebugRegressorTest(test.TestCase):
     predictions2 = list(regressor2.predict_scores(input_fn=predict_input_fn))
     self.assertAllClose(predictions, predictions2)
 
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/learn/python/learn/estimators/dnn_test.py b/tensorflow/contrib/learn/python/learn/estimators/dnn_test.py
index 12f9bba531a296a00d17956b8ce32e5d7dead380..2bd57597c2e9444b51b1dacfbe4180b443c95a3d 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/dnn_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/dnn_test.py
@@ -1224,7 +1224,7 @@ class DNNRegressorTest(test.TestCase):
       self, predictions, expected_shape):
     predictions_nparray = np.array(predictions)
     self.assertAllEqual(expected_shape, predictions_nparray.shape)
-    self.assertTrue(np.issubdtype(predictions_nparray.dtype, np.float))
+    self.assertTrue(np.issubdtype(predictions_nparray.dtype, np.floating))
 
   def testPredict_AsIterableFalse(self):
     """Tests predict method with as_iterable=False."""
diff --git a/tensorflow/contrib/learn/python/learn/estimators/estimator.py b/tensorflow/contrib/learn/python/learn/estimators/estimator.py
index 50c74add86fcf62c738e81426bfaf842fbac2b4e..63d0f1e1d454354948654e8ad4208a8852d356ca 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/estimator.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/estimator.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Base Estimator class."""
 
 from __future__ import absolute_import
@@ -76,7 +75,6 @@ from tensorflow.python.util import compat
 from tensorflow.python.util import tf_decorator
 from tensorflow.python.util import tf_inspect
 
-
 AS_ITERABLE_DATE = '2016-09-15'
 AS_ITERABLE_INSTRUCTIONS = (
     'The default behavior of predict() is changing. The default value for\n'
@@ -223,8 +221,11 @@ def _get_replica_device_setter(config):
 
   if config.num_ps_replicas > 0:
     return device_setter.replica_device_setter(
-        ps_tasks=config.num_ps_replicas, worker_device=worker_device,
-        merge_devices=True, ps_ops=ps_ops, cluster=config.cluster_spec)
+        ps_tasks=config.num_ps_replicas,
+        worker_device=worker_device,
+        merge_devices=True,
+        ps_ops=ps_ops,
+        cluster=config.cluster_spec)
   else:
     return None
 
@@ -284,10 +285,10 @@ def _make_metrics_ops(metrics, features, labels, predictions):
         raise ValueError('Invalid metric for {}. It returned a tuple with '
                          'len {}, expected 2.'.format(name, len(name)))
       if not isinstance(predictions, dict):
-        raise ValueError(
-            'Metrics passed provide (name, prediction), '
-            'but predictions are not dict. '
-            'Metrics: %s, Predictions: %s.' % (metrics, predictions))
+        raise ValueError('Metrics passed provide (name, prediction), '
+                         'but predictions are not dict. '
+                         'Metrics: %s, Predictions: %s.' % (metrics,
+                                                            predictions))
       # Here are two options: labels are single Tensor or a dict.
       if isinstance(labels, dict) and name[1] in labels:
         # If labels are dict and the prediction name is in it, apply metric.
@@ -298,10 +299,10 @@ def _make_metrics_ops(metrics, features, labels, predictions):
     else:
       # Single head metrics.
       if isinstance(predictions, dict):
-        raise ValueError(
-            'Metrics passed provide only name, no prediction, '
-            'but predictions are dict. '
-            'Metrics: %s, Labels: %s.' % (metrics, labels_tensor_or_dict))
+        raise ValueError('Metrics passed provide only name, no prediction, '
+                         'but predictions are dict. '
+                         'Metrics: %s, Labels: %s.' % (metrics,
+                                                       labels_tensor_or_dict))
       result[name] = metric(predictions, labels_tensor_or_dict)
   return result
 
@@ -369,9 +370,8 @@ def _write_dict_to_summary(output_dir, dictionary, current_global_step):
       logging.info(
           'Summary for np.ndarray is not visible in Tensorboard by default. '
           'Consider using a Tensorboard plugin for visualization (see '
-          'https://github.com/tensorflow/tensorboard-plugin-example/blob/master/README.md '  # pylint:disable=line-too-long
-          'for more information).'
-      )
+          'https://github.com/tensorflow/tensorboard-plugin-example/blob/master/README.md'
+          ' for more information).')
     else:
       logging.warn(
           'Skipping summary for %s, must be a float, np.float32, np.int64, '
@@ -385,8 +385,8 @@ GraphRewriteSpec = collections.namedtuple('GraphRewriteSpec',
                                           ['tags', 'transforms'])
 
 
-class BaseEstimator(
-    sklearn.BaseEstimator, evaluable.Evaluable, trainable.Trainable):
+class BaseEstimator(sklearn.BaseEstimator, evaluable.Evaluable,
+                    trainable.Trainable):
   """Abstract BaseEstimator class to train and evaluate TensorFlow models.
 
   Users should not instantiate or subclass this class. Instead, use an
@@ -428,7 +428,7 @@ class BaseEstimator(
         #                  necessary.
         # pylint: disable=g-doc-exception
         raise ValueError(
-            "model_dir are set both in constructor and RunConfig, but with "
+            'model_dir are set both in constructor and RunConfig, but with '
             "different values. In constructor: '{}', in RunConfig: "
             "'{}' ".format(model_dir, self._config.model_dir))
         # pylint: enable=g-doc-exception
@@ -457,12 +457,16 @@ class BaseEstimator(
     # TODO(wicke): make RunConfig immutable, and then return it without a copy.
     return copy.deepcopy(self._config)
 
-  @deprecated_args(
-      SCIKIT_DECOUPLE_DATE, SCIKIT_DECOUPLE_INSTRUCTIONS, ('x', None),
-      ('y', None), ('batch_size', None)
-  )
-  def fit(self, x=None, y=None, input_fn=None, steps=None, batch_size=None,
-          monitors=None, max_steps=None):
+  @deprecated_args(SCIKIT_DECOUPLE_DATE, SCIKIT_DECOUPLE_INSTRUCTIONS,
+                   ('x', None), ('y', None), ('batch_size', None))
+  def fit(self,
+          x=None,
+          y=None,
+          input_fn=None,
+          steps=None,
+          batch_size=None,
+          monitors=None,
+          max_steps=None):
     # pylint: disable=g-doc-args,g-doc-return-or-yield
     """See `Trainable`.
 
@@ -494,13 +498,15 @@ class BaseEstimator(
     logging.info('Loss for final step: %s.', loss)
     return self
 
-  @deprecated_args(
-      SCIKIT_DECOUPLE_DATE, SCIKIT_DECOUPLE_INSTRUCTIONS, ('x', None),
-      ('y', None), ('batch_size', None)
-  )
-  def partial_fit(
-      self, x=None, y=None, input_fn=None, steps=1, batch_size=None,
-      monitors=None):
+  @deprecated_args(SCIKIT_DECOUPLE_DATE, SCIKIT_DECOUPLE_INSTRUCTIONS,
+                   ('x', None), ('y', None), ('batch_size', None))
+  def partial_fit(self,
+                  x=None,
+                  y=None,
+                  input_fn=None,
+                  steps=1,
+                  batch_size=None,
+                  monitors=None):
     """Incremental fit on a batch of samples.
 
     This method is expected to be called several times consecutively
@@ -536,13 +542,16 @@ class BaseEstimator(
     """
     logging.warning('The current implementation of partial_fit is not optimized'
                     ' for use in a loop. Consider using fit() instead.')
-    return self.fit(x=x, y=y, input_fn=input_fn, steps=steps,
-                    batch_size=batch_size, monitors=monitors)
+    return self.fit(
+        x=x,
+        y=y,
+        input_fn=input_fn,
+        steps=steps,
+        batch_size=batch_size,
+        monitors=monitors)
 
-  @deprecated_args(
-      SCIKIT_DECOUPLE_DATE, SCIKIT_DECOUPLE_INSTRUCTIONS, ('x', None),
-      ('y', None), ('batch_size', None)
-  )
+  @deprecated_args(SCIKIT_DECOUPLE_DATE, SCIKIT_DECOUPLE_INSTRUCTIONS,
+                   ('x', None), ('y', None), ('batch_size', None))
   def evaluate(self,
                x=None,
                y=None,
@@ -584,13 +593,15 @@ class BaseEstimator(
       eval_results.update({'global_step': global_step})
     return eval_results
 
-  @deprecated_args(
-      SCIKIT_DECOUPLE_DATE, SCIKIT_DECOUPLE_INSTRUCTIONS, ('x', None),
-      ('batch_size', None), ('as_iterable', True)
-  )
-  def predict(
-      self, x=None, input_fn=None, batch_size=None, outputs=None,
-      as_iterable=True):
+  @deprecated_args(SCIKIT_DECOUPLE_DATE, SCIKIT_DECOUPLE_INSTRUCTIONS,
+                   ('x', None), ('batch_size', None), ('as_iterable', True))
+  def predict(self,
+              x=None,
+              input_fn=None,
+              batch_size=None,
+              outputs=None,
+              as_iterable=True,
+              iterate_batches=False):
     """Returns predictions for given features.
 
     Args:
@@ -606,6 +617,9 @@ class BaseEstimator(
         for each example until inputs are exhausted. Note: The inputs must
         terminate if you want the iterable to terminate (e.g. be sure to pass
         num_epochs=1 if you are using something like read_batch_features).
+      iterate_batches: If True, yield the whole batch at once instead of
+        decomposing the batch into individual samples. Only relevant when
+        as_iterable is True.
 
     Returns:
       A numpy array of predicted classes or regression values if the
@@ -625,7 +639,8 @@ class BaseEstimator(
         input_fn=input_fn,
         feed_fn=feed_fn,
         outputs=outputs,
-        as_iterable=as_iterable)
+        as_iterable=as_iterable,
+        iterate_batches=iterate_batches)
 
   def get_variable_value(self, name):
     """Returns value of the variable given by name.
@@ -651,16 +666,17 @@ class BaseEstimator(
     return self._model_dir
 
   @deprecated('2017-03-25', 'Please use Estimator.export_savedmodel() instead.')
-  def export(self,
-             export_dir,
-             input_fn=export._default_input_fn,  # pylint: disable=protected-access
-             input_feature_key=None,
-             use_deprecated_input_fn=True,
-             signature_fn=None,
-             prediction_key=None,
-             default_batch_size=1,
-             exports_to_keep=None,
-             checkpoint_path=None):
+  def export(
+      self,
+      export_dir,
+      input_fn=export._default_input_fn,  # pylint: disable=protected-access
+      input_feature_key=None,
+      use_deprecated_input_fn=True,
+      signature_fn=None,
+      prediction_key=None,
+      default_batch_size=1,
+      exports_to_keep=None,
+      checkpoint_path=None):
     """Exports inference graph into given dir.
 
     Args:
@@ -798,8 +814,8 @@ class BaseEstimator(
       logging.debug('Setting feature info to %s.', str(self._features_info))
     if labels is not None:
       if self._labels_info is not None:
-        logging.debug('Given labels: %s, required signatures: %s.',
-                      str(labels), str(self._labels_info))
+        logging.debug('Given labels: %s, required signatures: %s.', str(labels),
+                      str(self._labels_info))
         if not tensor_signature.tensors_compatible(labels, self._labels_info):
           raise ValueError('Labels are incompatible with given information. '
                            'Given labels: %s, required signatures: %s.' %
@@ -850,13 +866,13 @@ class BaseEstimator(
     if not checkpoint_path:
       latest_path = saver.latest_checkpoint(self._model_dir)
       if not latest_path:
-        raise NotFittedError("Couldn't find trained model at %s."
-                             % self._model_dir)
+        raise NotFittedError(
+            "Couldn't find trained model at %s." % self._model_dir)
       checkpoint_path = latest_path
 
     # Setup output directory.
-    eval_dir = os.path.join(self._model_dir, 'eval' if not name else
-                            'eval_' + name)
+    eval_dir = os.path.join(self._model_dir, 'eval'
+                            if not name else 'eval_' + name)
 
     with ops.Graph().as_default() as g:
       random_seed.set_random_seed(self._config.tf_random_seed)
@@ -879,8 +895,7 @@ class BaseEstimator(
                         'Use steps=None if intended.')
       if steps:
         hooks.append(
-            evaluation.StopAfterNEvalsHook(
-                steps, log_progress=log_progress))
+            evaluation.StopAfterNEvalsHook(steps, log_progress=log_progress))
 
       global_step_key = 'global_step'
       while global_step_key in eval_dict:
@@ -916,8 +931,8 @@ class BaseEstimator(
     # Check that model has been trained.
     checkpoint_path = saver.latest_checkpoint(self._model_dir)
     if not checkpoint_path:
-      raise NotFittedError("Couldn't find trained model at %s."
-                           % self._model_dir)
+      raise NotFittedError(
+          "Couldn't find trained model at %s." % self._model_dir)
 
     with ops.Graph().as_default() as g:
       random_seed.set_random_seed(self._config.tf_random_seed)
@@ -979,7 +994,8 @@ class BaseEstimator(
     existing_keys = predictions.keys()
     predictions = {
         key: value
-        for key, value in six.iteritems(predictions) if key in outputs
+        for key, value in six.iteritems(predictions)
+        if key in outputs
     }
     if not predictions:
       raise ValueError('Expected to run at least one output from %s, '
@@ -1045,8 +1061,7 @@ class BaseEstimator(
           chief_only_hooks=chief_hooks + model_fn_ops.training_chief_hooks,
           save_checkpoint_secs=0,  # Saving is handled by a hook.
           save_summaries_steps=self._config.save_summary_steps,
-          config=self._session_config
-      ) as mon_sess:
+          config=self._session_config) as mon_sess:
         loss = None
         while not mon_sess.should_stop():
           _, loss = mon_sess.run([model_fn_ops.train_op, model_fn_ops.loss])
@@ -1137,8 +1152,7 @@ class Estimator(BaseEstimator):
       if params is not None and 'params' not in model_fn_args:
         raise ValueError('Estimator\'s model_fn (%s) does not have a params '
                          'argument, but params (%s) were passed to the '
-                         'Estimator\'s constructor.' %
-                         (model_fn, params))
+                         'Estimator\'s constructor.' % (model_fn, params))
       if params is None and 'params' in model_fn_args:
         logging.warning('Estimator\'s model_fn (%s) includes params '
                         'argument, but params are not passed to Estimator.',
@@ -1192,8 +1206,9 @@ class Estimator(BaseEstimator):
 
     # Custom metrics should overwrite defaults.
     if metrics:
-      model_fn_ops.eval_metric_ops.update(_make_metrics_ops(
-        metrics, features, labels, model_fn_ops.predictions))
+      model_fn_ops.eval_metric_ops.update(
+          _make_metrics_ops(metrics, features, labels,
+                            model_fn_ops.predictions))
 
     return model_fn_ops
 
@@ -1238,8 +1253,8 @@ class Estimator(BaseEstimator):
     Raises:
       ValueError: if `metrics` don't match `labels`.
     """
-    model_fn_ops = self._call_model_fn(
-        features, labels, model_fn_lib.ModeKeys.EVAL, metrics)
+    model_fn_ops = self._call_model_fn(features, labels,
+                                       model_fn_lib.ModeKeys.EVAL, metrics)
 
     if metric_key.MetricKey.LOSS not in model_fn_ops.eval_metric_ops:
       model_fn_ops.eval_metric_ops[metric_key.MetricKey.LOSS] = (
@@ -1263,14 +1278,16 @@ class Estimator(BaseEstimator):
         self._labels_info)
     return self._call_model_fn(features, labels, model_fn_lib.ModeKeys.INFER)
 
-  def export_savedmodel(
-      self, export_dir_base, serving_input_fn,
-      default_output_alternative_key=None,
-      assets_extra=None,
-      as_text=False,
-      checkpoint_path=None,
-      graph_rewrite_specs=(GraphRewriteSpec((tag_constants.SERVING,), ()),),
-      strip_default_attrs=False):
+  def export_savedmodel(self,
+                        export_dir_base,
+                        serving_input_fn,
+                        default_output_alternative_key=None,
+                        assets_extra=None,
+                        as_text=False,
+                        checkpoint_path=None,
+                        graph_rewrite_specs=(GraphRewriteSpec(
+                            (tag_constants.SERVING,), ()),),
+                        strip_default_attrs=False):
     # pylint: disable=line-too-long
     """Exports inference graph as a SavedModel into given dir.
 
@@ -1297,7 +1314,8 @@ class Estimator(BaseEstimator):
         default serving tag ("serve") and no rewriting.
       strip_default_attrs: Boolean. If `True`, default-valued attributes will be
         removed from the NodeDefs. For a detailed guide, see
-        [Stripping Default-Valued Attributes](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md#stripping-default-valued-attributes).
+        [Stripping Default-Valued
+          Attributes](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md#stripping-default-valued-attributes).
 
     Returns:
       The string path to the exported directory.
@@ -1313,8 +1331,8 @@ class Estimator(BaseEstimator):
       # Locate the latest checkpoint
       checkpoint_path = saver.latest_checkpoint(self._model_dir)
     if not checkpoint_path:
-      raise NotFittedError("Couldn't find trained model at %s."
-                           % self._model_dir)
+      raise NotFittedError(
+          "Couldn't find trained model at %s." % self._model_dir)
 
     export_dir = saved_model_export_utils.get_timestamped_export_dir(
         export_dir_base)
@@ -1348,10 +1366,10 @@ class Estimator(BaseEstimator):
           saved_model_export_utils.get_output_alternatives(
               model_fn_ops, default_output_alternative_key))
 
-      init_op = control_flow_ops.group(
-          variables.local_variables_initializer(),
-          resources.initialize_resources(resources.shared_resources()),
-          lookup_ops.tables_initializer())
+      init_op = control_flow_ops.group(variables.local_variables_initializer(),
+                                       resources.initialize_resources(
+                                           resources.shared_resources()),
+                                       lookup_ops.tables_initializer())
 
       # Build the SignatureDefs from all pairs of input and output alternatives
       signature_def_map = saved_model_export_utils.build_all_signature_defs(
@@ -1381,10 +1399,10 @@ class Estimator(BaseEstimator):
 
         # TODO(soergel): switch to main_op or otherwise update when dust settles
         builder.add_meta_graph_and_variables(
-            session, untransformed_tags,
+            session,
+            untransformed_tags,
             signature_def_map=signature_def_map,
-            assets_collection=ops.get_collection(
-                ops.GraphKeys.ASSET_FILEPATHS),
+            assets_collection=ops.get_collection(ops.GraphKeys.ASSET_FILEPATHS),
             legacy_init_op=init_op,
             strip_default_attrs=strip_default_attrs)
 
@@ -1395,12 +1413,16 @@ class Estimator(BaseEstimator):
     if graph_rewrite_specs[1:]:
       # Prepare the input_names and output_names needed for the
       # meta_graph_transform call below.
-      input_names = [tensor.name
-                     for input_dict in input_alternatives.values()
-                     for tensor in input_dict.values()]
-      output_names = [tensor.name
-                      for output_alternative in output_alternatives.values()
-                      for tensor in output_alternative[1].values()]
+      input_names = [
+          tensor.name
+          for input_dict in input_alternatives.values()
+          for tensor in input_dict.values()
+      ]
+      output_names = [
+          tensor.name
+          for output_alternative in output_alternatives.values()
+          for tensor in output_alternative[1].values()
+      ]
 
     # Write the additional MetaGraphDefs
     for graph_rewrite_spec in graph_rewrite_specs[1:]:
@@ -1419,11 +1441,11 @@ class Estimator(BaseEstimator):
 
     # Add the extra assets
     if assets_extra:
-      assets_extra_path = os.path.join(compat.as_bytes(temp_export_dir),
-                                       compat.as_bytes('assets.extra'))
+      assets_extra_path = os.path.join(
+          compat.as_bytes(temp_export_dir), compat.as_bytes('assets.extra'))
       for dest_relative, source in assets_extra.items():
-        dest_absolute = os.path.join(compat.as_bytes(assets_extra_path),
-                                     compat.as_bytes(dest_relative))
+        dest_absolute = os.path.join(
+            compat.as_bytes(assets_extra_path), compat.as_bytes(dest_relative))
         dest_path = os.path.dirname(dest_absolute)
         gfile.MakeDirs(dest_path)
         gfile.Copy(source, dest_absolute)
@@ -1443,25 +1465,36 @@ class SKCompat(sklearn.BaseEstimator):
 
   def fit(self, x, y, batch_size=128, steps=None, max_steps=None,
           monitors=None):
-    input_fn, feed_fn = _get_input_fn(x, y, input_fn=None, feed_fn=None,
-                                      batch_size=batch_size, shuffle=True,
-                                      epochs=None)
+    input_fn, feed_fn = _get_input_fn(
+        x,
+        y,
+        input_fn=None,
+        feed_fn=None,
+        batch_size=batch_size,
+        shuffle=True,
+        epochs=None)
     all_monitors = []
     if feed_fn:
       all_monitors = [basic_session_run_hooks.FeedFnHook(feed_fn)]
     if monitors:
       all_monitors.extend(monitors)
 
-    self._estimator.fit(input_fn=input_fn,
-                        steps=steps,
-                        max_steps=max_steps,
-                        monitors=all_monitors)
+    self._estimator.fit(
+        input_fn=input_fn,
+        steps=steps,
+        max_steps=max_steps,
+        monitors=all_monitors)
     return self
 
   def score(self, x, y, batch_size=128, steps=None, metrics=None, name=None):
-    input_fn, feed_fn = _get_input_fn(x, y, input_fn=None,
-                                      feed_fn=None, batch_size=batch_size,
-                                      shuffle=False, epochs=1)
+    input_fn, feed_fn = _get_input_fn(
+        x,
+        y,
+        input_fn=None,
+        feed_fn=None,
+        batch_size=batch_size,
+        shuffle=False,
+        epochs=1)
     if metrics is not None and not isinstance(metrics, dict):
       raise ValueError('Metrics argument should be None or dict. '
                        'Got %s.' % metrics)
@@ -1477,8 +1510,13 @@ class SKCompat(sklearn.BaseEstimator):
 
   def predict(self, x, batch_size=128, outputs=None):
     input_fn, feed_fn = _get_input_fn(
-        x, None, input_fn=None, feed_fn=None, batch_size=batch_size,
-        shuffle=False, epochs=1)
+        x,
+        None,
+        input_fn=None,
+        feed_fn=None,
+        batch_size=batch_size,
+        shuffle=False,
+        epochs=1)
     results = list(
         self._estimator._infer_model(
             input_fn=input_fn,
@@ -1489,7 +1527,6 @@ class SKCompat(sklearn.BaseEstimator):
     if not isinstance(results[0], dict):
       return np.concatenate([output for output in results], axis=0)
     return {
-        key: np.concatenate(
-            [output[key] for output in results], axis=0)
+        key: np.concatenate([output[key] for output in results], axis=0)
         for key in results[0]
     }
diff --git a/tensorflow/contrib/learn/python/learn/estimators/estimator_input_test.py b/tensorflow/contrib/learn/python/learn/estimators/estimator_input_test.py
index 9d7c1a099aa4be64ca0296fa5b870597dabec7b4..d4a46b41d0c93ef58d5db8c433cbf348fec10f5e 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/estimator_input_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/estimator_input_test.py
@@ -41,7 +41,6 @@ from tensorflow.python.platform import test
 from tensorflow.python.training import input as input_lib
 from tensorflow.python.training import queue_runner_impl
 
-
 _BOSTON_INPUT_DIM = 13
 _IRIS_INPUT_DIM = 4
 
@@ -93,8 +92,8 @@ def boston_eval_fn():
       constant_op.constant(boston.data), [n_examples, _BOSTON_INPUT_DIM])
   labels = array_ops.reshape(
       constant_op.constant(boston.target), [n_examples, 1])
-  return array_ops.concat([features, features], 0), array_ops.concat(
-      [labels, labels], 0)
+  return array_ops.concat([features, features],
+                          0), array_ops.concat([labels, labels], 0)
 
 
 def extract(data, key):
@@ -129,7 +128,10 @@ def linear_model_fn(features, labels, mode):
     (_, features), = features.items()
   prediction, loss = (models.linear_regression_zero_init(features, labels))
   train_op = optimizers.optimize_loss(
-      loss, training_util.get_global_step(), optimizer='Adagrad', learning_rate=0.1)
+      loss,
+      training_util.get_global_step(),
+      optimizer='Adagrad',
+      learning_rate=0.1)
   return prediction, loss, train_op
 
 
@@ -139,7 +141,10 @@ def linear_model_fn_with_model_fn_ops(features, labels, mode):
                   model_fn.ModeKeys.INFER)
   prediction, loss = (models.linear_regression_zero_init(features, labels))
   train_op = optimizers.optimize_loss(
-      loss, training_util.get_global_step(), optimizer='Adagrad', learning_rate=0.1)
+      loss,
+      training_util.get_global_step(),
+      optimizer='Adagrad',
+      learning_rate=0.1)
   return model_fn.ModelFnOps(
       mode=mode, predictions=prediction, loss=loss, train_op=train_op)
 
@@ -150,7 +155,10 @@ def logistic_model_no_mode_fn(features, labels):
   labels = array_ops.one_hot(labels, 3, 1, 0)
   prediction, loss = (models.logistic_regression_zero_init(features, labels))
   train_op = optimizers.optimize_loss(
-      loss, training_util.get_global_step(), optimizer='Adagrad', learning_rate=0.1)
+      loss,
+      training_util.get_global_step(),
+      optimizer='Adagrad',
+      learning_rate=0.1)
   return {
       'class': math_ops.argmax(prediction, 1),
       'prob': prediction
@@ -173,7 +181,9 @@ class EstimatorInputTest(test.TestCase):
     scores = est.evaluate(
         x=boston_input,
         y=float64_target,
-        metrics={'MSE': metric_ops.streaming_mean_squared_error})
+        metrics={
+            'MSE': metric_ops.streaming_mean_squared_error
+        })
     del est
     # Create another estimator object with the same output dir.
     est2 = estimator.Estimator(model_fn=linear_model_fn, model_dir=output_dir)
@@ -182,7 +192,9 @@ class EstimatorInputTest(test.TestCase):
     scores2 = est2.evaluate(
         x=boston_input,
         y=float64_target,
-        metrics={'MSE': metric_ops.streaming_mean_squared_error})
+        metrics={
+            'MSE': metric_ops.streaming_mean_squared_error
+        })
     self.assertAllClose(scores2['MSE'], scores['MSE'])
     predictions = np.array(list(est2.predict(x=boston_input)))
     other_score = _sklearn.mean_squared_error(predictions,
@@ -197,7 +209,9 @@ class EstimatorInputTest(test.TestCase):
     scores = est.score(
         x=boston.data,
         y=float64_labels,
-        metrics={'MSE': metric_ops.streaming_mean_squared_error})
+        metrics={
+            'MSE': metric_ops.streaming_mean_squared_error
+        })
     predictions = np.array(list(est.predict(x=boston.data)))
     other_score = _sklearn.mean_squared_error(predictions, boston.target)
     self.assertAllClose(scores['MSE'], other_score)
@@ -213,7 +227,9 @@ class EstimatorInputTest(test.TestCase):
     scores = est.evaluate(
         x=boston_input,
         y=float64_target,
-        metrics={'MSE': metric_ops.streaming_mean_squared_error})
+        metrics={
+            'MSE': metric_ops.streaming_mean_squared_error
+        })
     predictions = np.array(list(est.predict(x=boston_input)))
     other_score = _sklearn.mean_squared_error(predictions, boston.target)
     self.assertAllClose(other_score, scores['MSE'])
@@ -228,14 +244,15 @@ class EstimatorInputTest(test.TestCase):
     scores = est.score(
         x=iris.data,
         y=iris.target,
-        metrics={('accuracy', 'class'): metric_ops.streaming_accuracy})
+        metrics={
+            ('accuracy', 'class'): metric_ops.streaming_accuracy
+        })
     predictions = est.predict(x=iris.data)
     predictions_class = est.predict(x=iris.data, outputs=['class'])['class']
     self.assertEqual(predictions['prob'].shape[0], iris.target.shape[0])
     self.assertAllClose(predictions['class'], predictions_class)
-    self.assertAllClose(
-        predictions['class'], np.argmax(
-            predictions['prob'], axis=1))
+    self.assertAllClose(predictions['class'],
+                        np.argmax(predictions['prob'], axis=1))
     other_score = _sklearn.accuracy_score(iris.target, predictions['class'])
     self.assertAllClose(scores['accuracy'], other_score)
     self.assertTrue('global_step' in scores)
@@ -250,17 +267,18 @@ class EstimatorInputTest(test.TestCase):
     scores = est.evaluate(
         x=iris_data,
         y=iris_target,
-        metrics={('accuracy', 'class'): metric_ops.streaming_accuracy})
+        metrics={
+            ('accuracy', 'class'): metric_ops.streaming_accuracy
+        })
     predictions = list(est.predict(x=iris_data))
     predictions_class = list(est.predict(x=iris_data, outputs=['class']))
     self.assertEqual(len(predictions), iris.target.shape[0])
     classes_batch = np.array([p['class'] for p in predictions])
     self.assertAllClose(classes_batch,
                         np.array([p['class'] for p in predictions_class]))
-    self.assertAllClose(
-        classes_batch,
-        np.argmax(
-            np.array([p['prob'] for p in predictions]), axis=1))
+    self.assertAllClose(classes_batch,
+                        np.argmax(
+                            np.array([p['prob'] for p in predictions]), axis=1))
     other_score = _sklearn.accuracy_score(iris.target, classes_batch)
     self.assertAllClose(other_score, scores['accuracy'])
     self.assertTrue('global_step' in scores)
diff --git a/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py b/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py
index 5f682838b7afadec7a54df782cb5b89ac6746659..d81a534b79bc90fe91ffd3cb97a7865a7cb4c2a9 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py
@@ -111,8 +111,8 @@ def boston_eval_fn():
       constant_op.constant(boston.data), [n_examples, _BOSTON_INPUT_DIM])
   labels = array_ops.reshape(
       constant_op.constant(boston.target), [n_examples, 1])
-  return array_ops.concat([features, features], 0), array_ops.concat(
-      [labels, labels], 0)
+  return array_ops.concat([features, features],
+                          0), array_ops.concat([labels, labels], 0)
 
 
 def extract(data, key):
@@ -147,7 +147,10 @@ def linear_model_fn(features, labels, mode):
     (_, features), = features.items()
   prediction, loss = (models.linear_regression_zero_init(features, labels))
   train_op = optimizers.optimize_loss(
-      loss, training_util.get_global_step(), optimizer='Adagrad', learning_rate=0.1)
+      loss,
+      training_util.get_global_step(),
+      optimizer='Adagrad',
+      learning_rate=0.1)
   return prediction, loss, train_op
 
 
@@ -157,7 +160,10 @@ def linear_model_fn_with_model_fn_ops(features, labels, mode):
                   model_fn.ModeKeys.INFER)
   prediction, loss = (models.linear_regression_zero_init(features, labels))
   train_op = optimizers.optimize_loss(
-      loss, training_util.get_global_step(), optimizer='Adagrad', learning_rate=0.1)
+      loss,
+      training_util.get_global_step(),
+      optimizer='Adagrad',
+      learning_rate=0.1)
   return model_fn.ModelFnOps(
       mode=mode, predictions=prediction, loss=loss, train_op=train_op)
 
@@ -168,7 +174,10 @@ def logistic_model_no_mode_fn(features, labels):
   labels = array_ops.one_hot(labels, 3, 1, 0)
   prediction, loss = (models.logistic_regression_zero_init(features, labels))
   train_op = optimizers.optimize_loss(
-      loss, training_util.get_global_step(), optimizer='Adagrad', learning_rate=0.1)
+      loss,
+      training_util.get_global_step(),
+      optimizer='Adagrad',
+      learning_rate=0.1)
   return {
       'class': math_ops.argmax(prediction, 1),
       'prob': prediction
@@ -184,14 +193,12 @@ def _build_estimator_for_export_tests(tmpdir):
   def _input_fn():
     iris = base.load_iris()
     return {
-        'feature': constant_op.constant(
-            iris.data, dtype=dtypes.float32)
+        'feature': constant_op.constant(iris.data, dtype=dtypes.float32)
     }, constant_op.constant(
         iris.target, shape=[150], dtype=dtypes.int32)
 
   feature_columns = [
-      feature_column_lib.real_valued_column(
-          'feature', dimension=4)
+      feature_column_lib.real_valued_column('feature', dimension=4)
   ]
 
   est = linear.LinearRegressor(feature_columns)
@@ -291,8 +298,8 @@ class CheckCallsMonitor(monitors_lib.BaseMonitor):
             self.begin_calls == self.expect_calls)
 
 
-def _model_fn_ops(
-    expected_features, expected_labels, actual_features, actual_labels, mode):
+def _model_fn_ops(expected_features, expected_labels, actual_features,
+                  actual_labels, mode):
   assert_ops = tuple([
       check_ops.assert_equal(
           expected_features[k], actual_features[k], name='assert_%s' % k)
@@ -310,11 +317,11 @@ def _model_fn_ops(
 
 
 def _make_input_fn(features, labels):
+
   def _input_fn():
-    return {
-        k: constant_op.constant(v)
-        for k, v in six.iteritems(features)
-    }, constant_op.constant(labels)
+    return {k: constant_op.constant(v)
+            for k, v in six.iteritems(features)}, constant_op.constant(labels)
+
   return _input_fn
 
 
@@ -369,11 +376,13 @@ class EstimatorModelFnTest(test.TestCase):
       self.assertEqual(expected_params, params)
       self.assertTrue(config.i_am_test)
       return _model_fn_ops(features, labels, arg0, arg1, mode)
+
     partial_model_fn = functools.partial(
         _model_fn, foo=expected_foo, bar=expected_bar)
 
     est = estimator.Estimator(
-        model_fn=partial_model_fn, params=expected_params,
+        model_fn=partial_model_fn,
+        params=expected_params,
         config=expected_config)
     self.assertEqual(0, model_fn_call_count[0])
     est.fit(input_fn=_make_input_fn(features, labels), steps=1)
@@ -382,7 +391,12 @@ class EstimatorModelFnTest(test.TestCase):
   def testModelFnWithModelDir(self):
     expected_param = {'some_param': 'some_value'}
     expected_model_dir = tempfile.mkdtemp()
-    def _argument_checker(features, labels, mode, params, config=None,
+
+    def _argument_checker(features,
+                          labels,
+                          mode,
+                          params,
+                          config=None,
                           model_dir=None):
       _, _, _ = features, labels, config
       self.assertEqual(model_fn.ModeKeys.TRAIN, mode)
@@ -390,9 +404,11 @@ class EstimatorModelFnTest(test.TestCase):
       self.assertEqual(model_dir, expected_model_dir)
       return (constant_op.constant(0.), constant_op.constant(0.),
               training_util.get_global_step().assign_add(1))
-    est = estimator.Estimator(model_fn=_argument_checker,
-                              params=expected_param,
-                              model_dir=expected_model_dir)
+
+    est = estimator.Estimator(
+        model_fn=_argument_checker,
+        params=expected_param,
+        model_dir=expected_model_dir)
     est.fit(input_fn=boston_input_fn, steps=1)
 
   def testInvalidModelFn_no_train_op(self):
@@ -447,8 +463,7 @@ class EstimatorModelFnTest(test.TestCase):
       est.predict(input_fn=boston_input_fn)
     with self.assertRaisesRegexp(ValueError, 'Missing prediction'):
       est.predict(
-          input_fn=functools.partial(
-              boston_input_fn, num_epochs=1),
+          input_fn=functools.partial(boston_input_fn, num_epochs=1),
           as_iterable=True)
 
   def testModelFnScaffoldInTraining(self):
@@ -498,15 +513,17 @@ class EstimatorModelFnTest(test.TestCase):
     self.assertTrue(self.mock_saver.restore.called)
     est.predict(input_fn=input_fn)
     self.assertTrue(self.mock_saver.restore.called)
+
     def serving_input_fn():
-      serialized_tf_example = array_ops.placeholder(dtype=dtypes.string,
-                                                    shape=[None],
-                                                    name='input_example_tensor')
+      serialized_tf_example = array_ops.placeholder(
+          dtype=dtypes.string, shape=[None], name='input_example_tensor')
       features, labels = input_fn()
-      return input_fn_utils.InputFnOps(
-          features, labels, {'examples': serialized_tf_example})
+      return input_fn_utils.InputFnOps(features, labels, {
+          'examples': serialized_tf_example
+      })
 
-    est.export_savedmodel(os.path.join(est.model_dir, 'export'), serving_input_fn)
+    est.export_savedmodel(
+        os.path.join(est.model_dir, 'export'), serving_input_fn)
     self.assertTrue(self.mock_saver.restore.called)
 
 
@@ -550,33 +567,28 @@ class EstimatorTest(test.TestCase):
 
   def testRunConfigModelDir(self):
     config = run_config.RunConfig(model_dir='test_dir')
-    est = estimator.Estimator(model_fn=linear_model_fn,
-                              config=config)
+    est = estimator.Estimator(model_fn=linear_model_fn, config=config)
     self.assertEqual('test_dir', est.config.model_dir)
     self.assertEqual('test_dir', est.model_dir)
 
   def testModelDirAndRunConfigModelDir(self):
     config = run_config.RunConfig(model_dir='test_dir')
-    est = estimator.Estimator(model_fn=linear_model_fn,
-                              config=config,
-                              model_dir='test_dir')
+    est = estimator.Estimator(
+        model_fn=linear_model_fn, config=config, model_dir='test_dir')
     self.assertEqual('test_dir', est.config.model_dir)
 
     with self.assertRaisesRegexp(
-        ValueError,
-        'model_dir are set both in constructor and RunConfig, '
+        ValueError, 'model_dir are set both in constructor and RunConfig, '
         'but with different'):
-      estimator.Estimator(model_fn=linear_model_fn,
-                          config=config,
-                          model_dir='different_dir')
+      estimator.Estimator(
+          model_fn=linear_model_fn, config=config, model_dir='different_dir')
 
   def testModelDirIsCopiedToRunConfig(self):
     config = run_config.RunConfig()
     self.assertIsNone(config.model_dir)
 
-    est = estimator.Estimator(model_fn=linear_model_fn,
-                              model_dir='test_dir',
-                              config=config)
+    est = estimator.Estimator(
+        model_fn=linear_model_fn, model_dir='test_dir', config=config)
     self.assertEqual('test_dir', est.config.model_dir)
     self.assertEqual('test_dir', est.model_dir)
 
@@ -656,25 +668,27 @@ class EstimatorTest(test.TestCase):
     boston = base.load_boston()
     output_dir = tempfile.mkdtemp()
     est = estimator.SKCompat(
-        estimator.Estimator(
-            model_fn=linear_model_fn, model_dir=output_dir))
+        estimator.Estimator(model_fn=linear_model_fn, model_dir=output_dir))
     float64_labels = boston.target.astype(np.float64)
     est.fit(x=boston.data, y=float64_labels, steps=50)
     scores = est.score(
         x=boston.data,
         y=float64_labels,
-        metrics={'MSE': metric_ops.streaming_mean_squared_error})
+        metrics={
+            'MSE': metric_ops.streaming_mean_squared_error
+        })
     del est
     # Create another estimator object with the same output dir.
     est2 = estimator.SKCompat(
-        estimator.Estimator(
-            model_fn=linear_model_fn, model_dir=output_dir))
+        estimator.Estimator(model_fn=linear_model_fn, model_dir=output_dir))
 
     # Check we can evaluate and predict.
     scores2 = est2.score(
         x=boston.data,
         y=float64_labels,
-        metrics={'MSE': metric_ops.streaming_mean_squared_error})
+        metrics={
+            'MSE': metric_ops.streaming_mean_squared_error
+        })
     self.assertAllClose(scores['MSE'], scores2['MSE'])
     predictions = np.array(list(est2.predict(x=boston.data)))
     other_score = _sklearn.mean_squared_error(predictions, float64_labels)
@@ -685,14 +699,15 @@ class EstimatorTest(test.TestCase):
     scores3 = est2.score(
         x=boston.data,
         y=float64_labels,
-        metrics={'MSE': metric_ops.streaming_mean_squared_error})
+        metrics={
+            'MSE': metric_ops.streaming_mean_squared_error
+        })
     self.assertLess(scores3['MSE'], scores['MSE'])
 
   def test_checkpoint_contains_relative_paths(self):
     tmpdir = tempfile.mkdtemp()
     est = estimator.Estimator(
-        model_dir=tmpdir,
-        model_fn=linear_model_fn_with_model_fn_ops)
+        model_dir=tmpdir, model_fn=linear_model_fn_with_model_fn_ops)
     est.fit(input_fn=boston_input_fn, steps=5)
 
     checkpoint_file_content = file_io.read_file_to_string(
@@ -700,22 +715,20 @@ class EstimatorTest(test.TestCase):
     ckpt = checkpoint_state_pb2.CheckpointState()
     text_format.Merge(checkpoint_file_content, ckpt)
     self.assertEqual(ckpt.model_checkpoint_path, 'model.ckpt-5')
-    self.assertAllEqual(
-        ['model.ckpt-1', 'model.ckpt-5'], ckpt.all_model_checkpoint_paths)
+    self.assertAllEqual(['model.ckpt-1', 'model.ckpt-5'],
+                        ckpt.all_model_checkpoint_paths)
 
   def test_train_save_copy_reload(self):
     tmpdir = tempfile.mkdtemp()
     model_dir1 = os.path.join(tmpdir, 'model_dir1')
     est1 = estimator.Estimator(
-        model_dir=model_dir1,
-        model_fn=linear_model_fn_with_model_fn_ops)
+        model_dir=model_dir1, model_fn=linear_model_fn_with_model_fn_ops)
     est1.fit(input_fn=boston_input_fn, steps=5)
 
     model_dir2 = os.path.join(tmpdir, 'model_dir2')
     os.renames(model_dir1, model_dir2)
     est2 = estimator.Estimator(
-        model_dir=model_dir2,
-        model_fn=linear_model_fn_with_model_fn_ops)
+        model_dir=model_dir2, model_fn=linear_model_fn_with_model_fn_ops)
     self.assertEqual(5, est2.get_variable_value('global_step'))
     est2.fit(input_fn=boston_input_fn, steps=5)
     self.assertEqual(10, est2.get_variable_value('global_step'))
@@ -724,7 +737,9 @@ class EstimatorTest(test.TestCase):
     boston = base.load_boston()
     est = estimator.SKCompat(
         estimator.Estimator(
-            model_fn=linear_model_params_fn, params={'learning_rate': 0.01}))
+            model_fn=linear_model_params_fn, params={
+                'learning_rate': 0.01
+            }))
     est.fit(x=boston.data, y=boston.target, steps=100)
 
   def testHooksNotChanged(self):
@@ -824,11 +839,13 @@ class EstimatorTest(test.TestCase):
 
   def testMonitorsForFit(self):
     est = estimator.Estimator(model_fn=linear_model_fn)
-    est.fit(input_fn=boston_input_fn,
-            steps=21,
-            monitors=[CheckCallsMonitor(expect_calls=21)])
+    est.fit(
+        input_fn=boston_input_fn,
+        steps=21,
+        monitors=[CheckCallsMonitor(expect_calls=21)])
 
   def testHooksForEvaluate(self):
+
     class CheckCallHook(session_run_hook.SessionRunHook):
 
       def __init__(self):
@@ -874,7 +891,9 @@ class EstimatorTest(test.TestCase):
     est.evaluate(
         input_fn=boston_input_fn,
         steps=200,
-        metrics={'MSE': _streaming_mean_squared_error_histogram})
+        metrics={
+            'MSE': _streaming_mean_squared_error_histogram
+        })
     events = util_test.latest_events(est.model_dir + '/eval')
     output_values = {}
     for e in events:
@@ -903,7 +922,9 @@ class EstimatorTest(test.TestCase):
     est.evaluate(
         input_fn=boston_input_fn,
         steps=200,
-        metrics={'PMT': _streaming_precition_mean_tensor})
+        metrics={
+            'PMT': _streaming_precition_mean_tensor
+        })
     events = util_test.latest_events(est.model_dir + '/eval')
     output_values = {}
     for e in events:
@@ -956,8 +977,8 @@ class EstimatorTest(test.TestCase):
     self.assertTrue(
         gfile.Exists(
             os.path.join(
-                compat.as_bytes(export_dir), compat.as_bytes(
-                    'saved_model.pb'))))
+                compat.as_bytes(export_dir),
+                compat.as_bytes('saved_model.pb'))))
     self.assertTrue(
         gfile.Exists(
             os.path.join(
@@ -1017,11 +1038,11 @@ class EstimatorTest(test.TestCase):
         self.assertTrue('input_example_tensor' in graph_ops)
         self.assertTrue('ParseExample/ParseExample' in graph_ops)
         self.assertTrue('linear/linear/feature/matmul' in graph_ops)
-        self.assertItemsEqual(
-          ['bogus_lookup', 'feature'],
-          [compat.as_str_any(x) for x in graph.get_collection(
-            constants.COLLECTION_DEF_KEY_FOR_INPUT_FEATURE_KEYS)])
-
+        self.assertItemsEqual(['bogus_lookup', 'feature'], [
+            compat.as_str_any(x)
+            for x in graph.get_collection(
+                constants.COLLECTION_DEF_KEY_FOR_INPUT_FEATURE_KEYS)
+        ])
 
     # cleanup
     gfile.DeleteRecursively(tmpdir)
@@ -1039,8 +1060,8 @@ class EstimatorTest(test.TestCase):
     self.assertTrue(
         gfile.Exists(
             os.path.join(
-                compat.as_bytes(export_dir), compat.as_bytes(
-                    'saved_model.pb'))))
+                compat.as_bytes(export_dir),
+                compat.as_bytes('saved_model.pb'))))
     self.assertTrue(
         gfile.Exists(
             os.path.join(
@@ -1083,19 +1104,22 @@ class EstimatorTest(test.TestCase):
     export_dir_base = os.path.join(
         compat.as_bytes(tmpdir), compat.as_bytes('export'))
     export_dir = est.export_savedmodel(
-        export_dir_base, serving_input_fn, assets_extra=assets_extra,
+        export_dir_base,
+        serving_input_fn,
+        assets_extra=assets_extra,
         graph_rewrite_specs=[
             estimator.GraphRewriteSpec(['tag_1'], []),
             estimator.GraphRewriteSpec(['tag_2', 'tag_3'],
-                                       ['strip_unused_nodes'])])
+                                       ['strip_unused_nodes'])
+        ])
 
     self.assertTrue(gfile.Exists(export_dir_base))
     self.assertTrue(gfile.Exists(export_dir))
     self.assertTrue(
         gfile.Exists(
             os.path.join(
-                compat.as_bytes(export_dir), compat.as_bytes(
-                    'saved_model.pb'))))
+                compat.as_bytes(export_dir),
+                compat.as_bytes('saved_model.pb'))))
     self.assertTrue(
         gfile.Exists(
             os.path.join(
@@ -1208,18 +1232,15 @@ class InferRealValuedColumnsTest(test.TestCase):
     self.assertEqual(1, len(feature_columns))
     feature_column = feature_columns[0]
     self.assertEqual('', feature_column.name)
-    self.assertEqual(
-        {
-            '':
-                parsing_ops.FixedLenFeature(
-                    shape=expected_shape, dtype=expected_dtype)
-        },
-        feature_column.config)
+    self.assertEqual({
+        '':
+            parsing_ops.FixedLenFeature(
+                shape=expected_shape, dtype=expected_dtype)
+    }, feature_column.config)
 
   def testInt32Input(self):
     feature_columns = estimator.infer_real_valued_columns_from_input(
-        np.ones(
-            shape=[7, 8], dtype=np.int32))
+        np.ones(shape=[7, 8], dtype=np.int32))
     self._assert_single_feature_column([8], dtypes.int32, feature_columns)
 
   def testInt32InputFn(self):
@@ -1229,8 +1250,7 @@ class InferRealValuedColumnsTest(test.TestCase):
 
   def testInt64Input(self):
     feature_columns = estimator.infer_real_valued_columns_from_input(
-        np.ones(
-            shape=[7, 8], dtype=np.int64))
+        np.ones(shape=[7, 8], dtype=np.int64))
     self._assert_single_feature_column([8], dtypes.int64, feature_columns)
 
   def testInt64InputFn(self):
@@ -1240,8 +1260,7 @@ class InferRealValuedColumnsTest(test.TestCase):
 
   def testFloat32Input(self):
     feature_columns = estimator.infer_real_valued_columns_from_input(
-        np.ones(
-            shape=[7, 8], dtype=np.float32))
+        np.ones(shape=[7, 8], dtype=np.float32))
     self._assert_single_feature_column([8], dtypes.float32, feature_columns)
 
   def testFloat32InputFn(self):
@@ -1251,8 +1270,7 @@ class InferRealValuedColumnsTest(test.TestCase):
 
   def testFloat64Input(self):
     feature_columns = estimator.infer_real_valued_columns_from_input(
-        np.ones(
-            shape=[7, 8], dtype=np.float64))
+        np.ones(shape=[7, 8], dtype=np.float64))
     self._assert_single_feature_column([8], dtypes.float64, feature_columns)
 
   def testFloat64InputFn(self):
@@ -1271,8 +1289,8 @@ class InferRealValuedColumnsTest(test.TestCase):
         ValueError, 'on integer or non floating types are not supported'):
       # pylint: disable=g-long-lambda
       estimator.infer_real_valued_columns_from_input_fn(
-          lambda: (constant_op.constant(False, shape=[7, 8], dtype=dtypes.bool),
-                   None))
+          lambda: (constant_op.constant(False, shape=[7, 8], dtype=dtypes.bool), None)
+      )
 
   def testStringInput(self):
     with self.assertRaisesRegexp(
@@ -1309,8 +1327,9 @@ class ReplicaDeviceSetterTest(test.TestCase):
 
   def testVariablesAreOnPs(self):
     tf_config = {'cluster': {run_config.TaskType.PS: ['fake_ps_0']}}
-    with test.mock.patch.dict('os.environ',
-                              {'TF_CONFIG': json.dumps(tf_config)}):
+    with test.mock.patch.dict('os.environ', {
+        'TF_CONFIG': json.dumps(tf_config)
+    }):
       config = run_config.RunConfig()
 
     with ops.device(estimator._get_replica_device_setter(config)):
@@ -1337,14 +1356,14 @@ class ReplicaDeviceSetterTest(test.TestCase):
 
   def testMutableHashTableIsOnPs(self):
     tf_config = {'cluster': {run_config.TaskType.PS: ['fake_ps_0']}}
-    with test.mock.patch.dict('os.environ',
-                              {'TF_CONFIG': json.dumps(tf_config)}):
+    with test.mock.patch.dict('os.environ', {
+        'TF_CONFIG': json.dumps(tf_config)
+    }):
       config = run_config.RunConfig()
 
     with ops.device(estimator._get_replica_device_setter(config)):
       default_val = constant_op.constant([-1, -1], dtypes.int64)
-      table = lookup.MutableHashTable(dtypes.string, dtypes.int64,
-                                      default_val)
+      table = lookup.MutableHashTable(dtypes.string, dtypes.int64, default_val)
       input_string = constant_op.constant(['brain', 'salad', 'tank'])
       output = table.lookup(input_string)
     self.assertDeviceEqual('/job:ps/task:0', table._table_ref.device)
@@ -1354,8 +1373,7 @@ class ReplicaDeviceSetterTest(test.TestCase):
     with ops.device(
         estimator._get_replica_device_setter(run_config.RunConfig())):
       default_val = constant_op.constant([-1, -1], dtypes.int64)
-      table = lookup.MutableHashTable(dtypes.string, dtypes.int64,
-                                      default_val)
+      table = lookup.MutableHashTable(dtypes.string, dtypes.int64, default_val)
       input_string = constant_op.constant(['brain', 'salad', 'tank'])
       output = table.lookup(input_string)
     self.assertDeviceEqual('', table._table_ref.device)
@@ -1371,8 +1389,9 @@ class ReplicaDeviceSetterTest(test.TestCase):
             'index': 3
         }
     }
-    with test.mock.patch.dict('os.environ',
-                              {'TF_CONFIG': json.dumps(tf_config)}):
+    with test.mock.patch.dict('os.environ', {
+        'TF_CONFIG': json.dumps(tf_config)
+    }):
       config = run_config.RunConfig()
 
     with ops.device(estimator._get_replica_device_setter(config)):
diff --git a/tensorflow/contrib/learn/python/learn/estimators/estimators_test.py b/tensorflow/contrib/learn/python/learn/estimators/estimators_test.py
index 8131e0fde6fea5501cacc4714f53ed8d867ca70f..2113fae3940f14c8ca07e5f76986408ae8a33831 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/estimators_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/estimators_test.py
@@ -72,9 +72,11 @@ class FeatureEngineeringFunctionTest(test.TestCase):
     # predictions = transformed_x (9)
     self.assertEqual(9., prediction)
     metrics = estimator.evaluate(
-        input_fn=input_fn, steps=1,
-        metrics={"label":
-                 metric_spec.MetricSpec(lambda predictions, labels: labels)})
+        input_fn=input_fn,
+        steps=1,
+        metrics={
+            "label": metric_spec.MetricSpec(lambda predictions, labels: labels)
+        })
     # labels = transformed_y (99)
     self.assertEqual(99., metrics["label"])
 
@@ -82,10 +84,10 @@ class FeatureEngineeringFunctionTest(test.TestCase):
 
     def input_fn():
       return {
-               "x": constant_op.constant(["9."])
-             }, {
-               "y": constant_op.constant(["99."])
-             }
+          "x": constant_op.constant(["9."])
+      }, {
+          "y": constant_op.constant(["99."])
+      }
 
     def feature_engineering_fn(features, labels):
       # Github #12205: raise a TypeError if called twice.
@@ -104,15 +106,17 @@ class FeatureEngineeringFunctionTest(test.TestCase):
       return predictions, loss, update_global_step
 
     estimator = estimator_lib.Estimator(
-      model_fn=model_fn, feature_engineering_fn=feature_engineering_fn)
+        model_fn=model_fn, feature_engineering_fn=feature_engineering_fn)
     estimator.fit(input_fn=input_fn, steps=1)
     prediction = next(estimator.predict(input_fn=input_fn, as_iterable=True))
     # predictions = transformed_x (9)
     self.assertEqual(9., prediction)
     metrics = estimator.evaluate(
-      input_fn=input_fn, steps=1,
-      metrics={"label":
-                 metric_spec.MetricSpec(lambda predictions, labels: labels)})
+        input_fn=input_fn,
+        steps=1,
+        metrics={
+            "label": metric_spec.MetricSpec(lambda predictions, labels: labels)
+        })
     # labels = transformed_y (99)
     self.assertEqual(99., metrics["label"])
 
@@ -150,12 +154,10 @@ class FeatureEngineeringFunctionTest(test.TestCase):
 
     # predictions = x
     prediction_with_fe_fn = next(
-        estimator_with_fe_fn.predict(
-            input_fn=input_fn, as_iterable=True))
+        estimator_with_fe_fn.predict(input_fn=input_fn, as_iterable=True))
     self.assertEqual(9., prediction_with_fe_fn)
     prediction_without_fe_fn = next(
-        estimator_without_fe_fn.predict(
-            input_fn=input_fn, as_iterable=True))
+        estimator_without_fe_fn.predict(input_fn=input_fn, as_iterable=True))
     self.assertEqual(1., prediction_without_fe_fn)
 
 
diff --git a/tensorflow/contrib/learn/python/learn/estimators/logistic_regressor_test.py b/tensorflow/contrib/learn/python/learn/estimators/logistic_regressor_test.py
index 656d68b76888d9319c0b9be481f9b0478ac4314c..ac2d10011e222eb9c534d7fbae3c0cb5f4820945 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/logistic_regressor_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/logistic_regressor_test.py
@@ -57,7 +57,10 @@ def _logistic_regression_model_fn(features, labels, mode):
   predictions = math_ops.sigmoid(logits)
   loss = losses.sigmoid_cross_entropy(labels, logits)
   train_op = optimizers.optimize_loss(
-      loss, training_util.get_global_step(), optimizer='Adagrad', learning_rate=0.1)
+      loss,
+      training_util.get_global_step(),
+      optimizer='Adagrad',
+      learning_rate=0.1)
   return predictions, loss, train_op
 
 
diff --git a/tensorflow/contrib/learn/python/learn/evaluable.py b/tensorflow/contrib/learn/python/learn/evaluable.py
index 66e15265171679dcd710fdf05bed3105de6bab99..8f6cd39864b437f163dd7c1140dc88755ce98529 100644
--- a/tensorflow/contrib/learn/python/learn/evaluable.py
+++ b/tensorflow/contrib/learn/python/learn/evaluable.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """`Evaluable` interface."""
 
 from __future__ import absolute_import
@@ -59,9 +58,12 @@ class Evaluable(object):
     for which this evaluation was performed.
 
     Args:
-      x: Matrix of shape [n_samples, n_features...] or dictionary of many matrices
-        containing the input samples for fitting the model. Can be iterator that returns
-        arrays of features or dictionary of array of features. If set, `input_fn` must
+      x: Matrix of shape [n_samples, n_features...] or dictionary of many
+        matrices
+        containing the input samples for fitting the model. Can be iterator that
+          returns
+        arrays of features or dictionary of array of features. If set,
+          `input_fn` must
         be `None`.
       y: Vector or matrix [n_samples] or [n_samples, n_outputs] containing the
         label values (class labels in classification, real numbers in
diff --git a/tensorflow/contrib/learn/python/learn/experiment.py b/tensorflow/contrib/learn/python/learn/experiment.py
index 9576ff21c243022276bb0641882dfaf0decf05c0..bec976afd2719138117976381669ca3292360480 100644
--- a/tensorflow/contrib/learn/python/learn/experiment.py
+++ b/tensorflow/contrib/learn/python/learn/experiment.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Experiment class collecting information needed for a single training run."""
 
 from __future__ import absolute_import
@@ -43,7 +42,6 @@ from tensorflow.python.training import saver
 from tensorflow.python.training import server_lib
 from tensorflow.python.util import compat
 
-
 __all__ = ["Experiment"]
 
 
@@ -278,8 +276,7 @@ class Experiment(object):
     self._train_steps_per_iteration = train_steps_per_iteration
     if (self._train_steps_per_iteration is not None and
         not isinstance(self._train_steps_per_iteration, int)):
-      raise ValueError(
-          "`train_steps_per_iteration` must be an integer.")
+      raise ValueError("`train_steps_per_iteration` must be an integer.")
 
   @property
   def estimator(self):
@@ -359,9 +356,10 @@ class Experiment(object):
           config.cluster_spec and config.master):
         self._start_server()
     elif config.cluster_spec and config.master:
-      raise ValueError('For distributed runtime, Experiment class only works with'
-                       'tf.contrib.learn.RunConfig for now, but provided {}'
-                       .format(type(config)))
+      raise ValueError(
+          "For distributed runtime, Experiment class only works with"
+          "tf.contrib.learn.RunConfig for now, but provided {}".format(
+              type(config)))
 
     extra_hooks = []
     if delay_secs is None:
@@ -414,11 +412,12 @@ class Experiment(object):
       logging.info("Waiting %d secs before starting eval.", delay_secs)
       time.sleep(delay_secs)
 
-    return self._call_evaluate(input_fn=self._eval_input_fn,
-                               steps=self._eval_steps,
-                               metrics=self._eval_metrics,
-                               name=(name or "one_pass"),
-                               hooks=self._eval_hooks)
+    return self._call_evaluate(
+        input_fn=self._eval_input_fn,
+        steps=self._eval_steps,
+        metrics=self._eval_metrics,
+        name=(name or "one_pass"),
+        hooks=self._eval_hooks)
 
   @deprecated(
       "2016-10-23",
@@ -499,15 +498,12 @@ class Experiment(object):
     previous_path = None
     eval_result = None
     last_warning_time = 0
-    while (not predicate_fn or
-           predicate_fn(
-               eval_result,
-               checkpoint_path=previous_path if eval_result else None)):
+    while (not predicate_fn or predicate_fn(
+        eval_result, checkpoint_path=previous_path if eval_result else None)):
       # Exit if we have already reached number of steps to train.
       if self._has_training_stopped(eval_result):
         logging.info("Exiting continuous eval, global_step=%s >= "
-                     "train_step=%s",
-                     eval_result[ops.GraphKeys.GLOBAL_STEP],
+                     "train_step=%s", eval_result[ops.GraphKeys.GLOBAL_STEP],
                      self._train_steps)
         return
 
@@ -528,12 +524,13 @@ class Experiment(object):
           logging.warning(error_msg)
           last_warning_time = time.time()
       else:
-        eval_result = self._call_evaluate(input_fn=input_fn,
-                                          steps=self._eval_steps,
-                                          metrics=self._eval_metrics,
-                                          name=name,
-                                          checkpoint_path=latest_path,
-                                          hooks=self._eval_hooks)
+        eval_result = self._call_evaluate(
+            input_fn=input_fn,
+            steps=self._eval_steps,
+            metrics=self._eval_metrics,
+            name=name,
+            checkpoint_path=latest_path,
+            hooks=self._eval_hooks)
         # Ensure eval result is not None for next round of evaluation.
         if not eval_result:
           eval_result = {}
@@ -558,8 +555,8 @@ class Experiment(object):
       return False
 
     global_step = eval_result.get(ops.GraphKeys.GLOBAL_STEP)
-    return global_step and self._train_steps and (
-        global_step >= self._train_steps)
+    return global_step and self._train_steps and (global_step >=
+                                                  self._train_steps)
 
   def continuous_eval(self,
                       delay_secs=None,
@@ -678,8 +675,7 @@ class Experiment(object):
       return eval_result, export_results
 
   @experimental
-  def continuous_train_and_eval(self,
-                                continuous_eval_predicate_fn=None):
+  def continuous_train_and_eval(self, continuous_eval_predicate_fn=None):
     """Interleaves training and evaluation.
 
     The frequency of evaluation is controlled by the `train_steps_per_iteration`
@@ -752,10 +748,9 @@ class Experiment(object):
     elif self._train_steps is not None:
       train_steps_per_iteration = int(self._train_steps / 10)
 
-    while (not predicate_fn or
-           predicate_fn(
-               eval_result,
-               checkpoint_path=latest_checkpoint if eval_result else None)):
+    while (not predicate_fn or predicate_fn(
+        eval_result, checkpoint_path=latest_checkpoint
+        if eval_result else None)):
 
       if self._has_training_stopped(eval_result):
         # Exits once max steps of training is satisfied.
@@ -785,8 +780,7 @@ class Experiment(object):
   def _maybe_export(self, eval_result, checkpoint_path=None):
     """Export the Estimator using export_fn, if defined."""
     export_dir_base = os.path.join(
-        compat.as_bytes(self._estimator.model_dir),
-        compat.as_bytes("export"))
+        compat.as_bytes(self._estimator.model_dir), compat.as_bytes("export"))
 
     export_results = []
     for strategy in self._export_strategies:
@@ -824,10 +818,11 @@ class Experiment(object):
         hooks=self._train_monitors,
         saving_listeners=self._saving_listeners)
 
-    eval_result = self._call_evaluate(input_fn=self._eval_input_fn,
-                                      steps=1,
-                                      metrics=self._eval_metrics,
-                                      name="one_pass")
+    eval_result = self._call_evaluate(
+        input_fn=self._eval_input_fn,
+        steps=1,
+        metrics=self._eval_metrics,
+        name="one_pass")
     _ = self._maybe_export(eval_result)
 
     return eval_result
@@ -849,9 +844,14 @@ class Experiment(object):
     server.start()
     return server
 
-  def _call_train(self, _sentinel=None,  # pylint: disable=invalid-name,
-                  input_fn=None, steps=None, hooks=None, max_steps=None,
-                  saving_listeners=None):
+  def _call_train(
+      self,
+      _sentinel=None,  # pylint: disable=invalid-name,
+      input_fn=None,
+      steps=None,
+      hooks=None,
+      max_steps=None,
+      saving_listeners=None):
     if _sentinel is not None:
       raise ValueError("_call_train should be called with keyword args only")
 
@@ -867,14 +867,18 @@ class Experiment(object):
           hooks=hooks,
           saving_listeners=saving_listeners)
     else:
-      return self._estimator.fit(input_fn=input_fn,
-                                 steps=steps,
-                                 max_steps=max_steps,
-                                 monitors=hooks)
-
-  def _call_evaluate(self, _sentinel=None,  # pylint: disable=invalid-name,
-                     input_fn=None, steps=None, metrics=None, name=None,
-                     checkpoint_path=None, hooks=None):
+      return self._estimator.fit(
+          input_fn=input_fn, steps=steps, max_steps=max_steps, monitors=hooks)
+
+  def _call_evaluate(
+      self,
+      _sentinel=None,  # pylint: disable=invalid-name,
+      input_fn=None,
+      steps=None,
+      metrics=None,
+      name=None,
+      checkpoint_path=None,
+      hooks=None):
     if _sentinel is not None:
       raise ValueError("_call_evaluate should be called with keyword args only")
 
@@ -882,18 +886,20 @@ class Experiment(object):
       if metrics is not None:
         raise ValueError(
             "`eval_metrics` must be `None` with `tf.estimator.Estimator`")
-      return self._estimator.evaluate(input_fn=input_fn,
-                                      steps=steps,
-                                      name=name,
-                                      checkpoint_path=checkpoint_path,
-                                      hooks=hooks)
+      return self._estimator.evaluate(
+          input_fn=input_fn,
+          steps=steps,
+          name=name,
+          checkpoint_path=checkpoint_path,
+          hooks=hooks)
     else:
-      return self._estimator.evaluate(input_fn=input_fn,
-                                      steps=steps,
-                                      metrics=metrics,
-                                      name=name,
-                                      checkpoint_path=checkpoint_path,
-                                      hooks=hooks)
+      return self._estimator.evaluate(
+          input_fn=input_fn,
+          steps=steps,
+          metrics=metrics,
+          name=name,
+          checkpoint_path=checkpoint_path,
+          hooks=hooks)
 
 
 @contextlib.contextmanager
diff --git a/tensorflow/contrib/learn/python/learn/learn_io/data_feeder.py b/tensorflow/contrib/learn/python/learn/learn_io/data_feeder.py
index f36a778b529a83f158241ddb060959c4b33e2e95..96be8b1bc402479d5611965f27abb197363cb939 100644
--- a/tensorflow/contrib/learn/python/learn/learn_io/data_feeder.py
+++ b/tensorflow/contrib/learn/python/learn/learn_io/data_feeder.py
@@ -35,6 +35,7 @@ from tensorflow.python.platform import tf_logging as logging
 # pylint: disable=g-multiple-import,g-bad-import-order
 from .pandas_io import HAS_PANDAS, extract_pandas_data, extract_pandas_matrix, extract_pandas_labels
 from .dask_io import HAS_DASK, extract_dask_data, extract_dask_labels
+
 # pylint: enable=g-multiple-import,g-bad-import-order
 
 
@@ -74,11 +75,11 @@ def _get_in_out_shape(x_shape, y_shape, n_classes, batch_size=None):
   if not y_is_dict:
     output_shape = out_el_shape(y_shape, n_classes)
   else:
-    output_shape = dict([
-        (k, out_el_shape(v, n_classes[k]
-                         if n_classes is not None and k in n_classes else None))
-        for k, v in list(y_shape.items())
-    ])
+    output_shape = dict([(k,
+                          out_el_shape(v, n_classes[k]
+                                       if n_classes is not None and
+                                       k in n_classes else None))
+                         for k, v in list(y_shape.items())])
 
   return input_shape, output_shape, batch_size
 
@@ -314,23 +315,23 @@ class DataFeeder(object):
       input_dtype: DType of input (or dictionary of shapes).
       output_dtype: DType of output (or dictionary of shapes.
     """
-    x_is_dict, y_is_dict = isinstance(x, dict), y is not None and isinstance(
-        y, dict)
+    x_is_dict, y_is_dict = isinstance(
+        x, dict), y is not None and isinstance(y, dict)
     if isinstance(y, list):
       y = np.array(y)
 
     self._x = dict([(k, check_array(v, v.dtype)) for k, v in list(x.items())
                    ]) if x_is_dict else check_array(x, x.dtype)
-    self._y = None if y is None else (
-        dict([(k, check_array(v, v.dtype)) for k, v in list(y.items())])
-        if y_is_dict else check_array(y, y.dtype))
+    self._y = None if y is None else (dict(
+        [(k, check_array(v, v.dtype)) for k, v in list(y.items())])
+                                      if y_is_dict else check_array(y, y.dtype))
 
     # self.n_classes is not None means we're converting raw target indices
     # to one-hot.
     if n_classes is not None:
       if not y_is_dict:
-        y_dtype = (np.int64
-                   if n_classes is not None and n_classes > 1 else np.float32)
+        y_dtype = (
+            np.int64 if n_classes is not None and n_classes > 1 else np.float32)
         self._y = (None if y is None else check_array(y, dtype=y_dtype))
 
     self.n_classes = n_classes
@@ -352,8 +353,8 @@ class DataFeeder(object):
     # self._output_dtype == np.float32 when y is None
     self._output_dtype = (
         dict([(k, _check_dtype(v.dtype)) for k, v in list(self._y.items())])
-        if y_is_dict else (
-            _check_dtype(self._y.dtype) if y is not None else np.float32))
+        if y_is_dict else (_check_dtype(self._y.dtype)
+                           if y is not None else np.float32))
 
     # self.n_classes is None means we're passing in raw target indices
     if n_classes is not None and y_is_dict:
@@ -478,8 +479,8 @@ class DataFeeder(object):
 
     # Assign input features from random indices.
     def extract(data, indices):
-      return (np.array(_access(data, indices)).reshape((indices.shape[0], 1)) if
-              len(data.shape) == 1 else _access(data, indices))
+      return (np.array(_access(data, indices)).reshape((indices.shape[0], 1))
+              if len(data.shape) == 1 else _access(data, indices))
 
     # assign labels from random indices
     def assign_label(data, shape, dtype, n_classes, indices):
@@ -511,16 +512,18 @@ class DataFeeder(object):
         feed_dict[self._epoch_placeholder.name] = [self.epoch]
 
       # Take next batch of indices.
-      x_len = list(self._x.values())[0].shape[
-          0] if x_is_dict else self._x.shape[0]
+      x_len = list(
+          self._x.values())[0].shape[0] if x_is_dict else self._x.shape[0]
       end = min(x_len, self.offset + self._batch_size)
       batch_indices = self.indices[self.offset:end]
 
       # adding input placeholder
       feed_dict.update(
           dict([(self._input_placeholder[k].name, extract(v, batch_indices))
-                for k, v in list(self._x.items())]) if x_is_dict else
-          {self._input_placeholder.name: extract(self._x, batch_indices)})
+                for k, v in list(self._x.items())]) if x_is_dict else {
+                    self._input_placeholder.name:
+                        extract(self._x, batch_indices)
+                })
 
       # move offset and reset it if necessary
       self.offset += self._batch_size
@@ -545,7 +548,8 @@ class DataFeeder(object):
                   assign_label(v, shape, dtype, n_classes, batch_indices)
           })
       else:
-        shape, dtype, n_classes = self.output_shape, self._output_dtype, self.n_classes
+        shape, dtype, n_classes = (self.output_shape, self._output_dtype,
+                                   self.n_classes)
         feed_dict.update({
             self._output_placeholder.name:
                 assign_label(self._y, shape, dtype, n_classes, batch_indices)
@@ -621,8 +625,9 @@ class StreamingDataFeeder(DataFeeder):
     elif y is None:
       y_first_el_shape = None
     else:
-      y_first_el_shape = ([1] + list(y_first_el[0].shape if isinstance(
-          y_first_el, list) else y_first_el.shape))
+      y_first_el_shape = (
+          [1] + list(y_first_el[0].shape
+                     if isinstance(y_first_el, list) else y_first_el.shape))
 
     self.input_shape, self.output_shape, self._batch_size = _get_in_out_shape(
         x_first_el_shape, y_first_el_shape, n_classes, batch_size)
@@ -683,8 +688,8 @@ class StreamingDataFeeder(DataFeeder):
         if shape is None:
           return None
         elif isinstance(shape, dict):
-          return dict([(k, np.zeros(shape[k], dtype[k]))
-                       for k in list(shape.keys())])
+          return dict(
+              [(k, np.zeros(shape[k], dtype[k])) for k in list(shape.keys())])
         else:
           return np.zeros(shape, dtype=dtype)
 
diff --git a/tensorflow/contrib/learn/python/learn/monitors.py b/tensorflow/contrib/learn/python/learn/monitors.py
index 3e0b1ad21a9a4a08fa94c8e9796f2b0dd5f8d622..0948dee7e2fa1b1b3617abd08d2d43ebc5340f63 100644
--- a/tensorflow/contrib/learn/python/learn/monitors.py
+++ b/tensorflow/contrib/learn/python/learn/monitors.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Monitors instrument the training process.
 
 @@get_default_monitors
@@ -151,8 +150,8 @@ class BaseMonitor(object):
       ValueError: if we've not begun an epoch, or `epoch` number does not match.
     """
     if self._current_epoch != epoch:
-      raise ValueError(
-          "epoch_end expected %s but got %s.", self._current_epoch, epoch)
+      raise ValueError("epoch_end expected %s but got %s.", self._current_epoch,
+                       epoch)
     self._current_epoch = None
 
   def step_begin(self, step):
@@ -171,8 +170,8 @@ class BaseMonitor(object):
       ValueError: if we've already begun a step, or `step` < 0, or
           `step` > `max_steps`.
     """
-    if (step < 0) or (
-        (self._max_steps is not None) and (step > self._max_steps)):
+    if (step < 0) or ((self._max_steps is not None) and
+                      (step > self._max_steps)):
       raise ValueError("Invalid step %s." % step)
     self._current_step = step
     return []
@@ -203,8 +202,8 @@ class BaseMonitor(object):
       ValueError: if we've not begun a step, or `step` number does not match.
     """
     if self._current_step != step:
-      raise ValueError(
-          "step_end expected %s but got %s.", self._current_step, step)
+      raise ValueError("step_end expected %s but got %s.", self._current_step,
+                       step)
     self._current_step = None
     return False
 
@@ -253,6 +252,7 @@ class EveryN(BaseMonitor):
   treatment.
 
   """
+
   # TODO(ipolosukhin): Add also every n seconds.
 
   def __init__(self, every_n_steps=100, first_n_steps=1):
@@ -475,8 +475,8 @@ class LoggingTrainable(EveryN):
     super(LoggingTrainable, self).every_n_step_begin(step)
     # Get a list of trainable variables at the beginning of every N steps.
     # We cannot get this in __init__ because train_op has not been generated.
-    trainables = ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES,
-                                    scope=self._scope)
+    trainables = ops.get_collection(
+        ops.GraphKeys.TRAINABLE_VARIABLES, scope=self._scope)
     self._names = {}
     for var in trainables:
       self._names[var.name] = var.value().name
@@ -561,12 +561,19 @@ class ValidationMonitor(EveryN):
   provided.
   """
 
-  def __init__(self, x=None, y=None, input_fn=None, batch_size=None,
+  def __init__(self,
+               x=None,
+               y=None,
+               input_fn=None,
+               batch_size=None,
                eval_steps=None,
-               every_n_steps=100, metrics=None, hooks=None,
+               every_n_steps=100,
+               metrics=None,
+               hooks=None,
                early_stopping_rounds=None,
                early_stopping_metric="loss",
-               early_stopping_metric_minimize=True, name=None):
+               early_stopping_metric_minimize=True,
+               name=None):
     """Initializes a ValidationMonitor.
 
     Args:
@@ -597,8 +604,8 @@ class ValidationMonitor(EveryN):
     Raises:
       ValueError: If both x and input_fn are provided.
     """
-    super(ValidationMonitor, self).__init__(every_n_steps=every_n_steps,
-                                            first_n_steps=-1)
+    super(ValidationMonitor, self).__init__(
+        every_n_steps=every_n_steps, first_n_steps=-1)
     # TODO(mdan): Checks like this are already done by evaluate.
     if x is None and input_fn is None:
       raise ValueError("Either x or input_fn should be provided.")
@@ -654,20 +661,27 @@ class ValidationMonitor(EveryN):
 
   def _evaluate_estimator(self):
     if isinstance(self._estimator, core_estimator.Estimator):
-      if any((x is not None for x in
-              [self.x, self.y, self.batch_size, self.metrics])):
+      if any((x is not None
+              for x in [self.x, self.y, self.batch_size, self.metrics])):
         raise ValueError(
             "tf.estimator.Estimator does not support following "
             "arguments: x, y, batch_size, metrics. Should set as `None` "
             "in ValidationMonitor")
       return self._estimator.evaluate(
-          input_fn=self.input_fn, steps=self.eval_steps, hooks=self.hooks,
+          input_fn=self.input_fn,
+          steps=self.eval_steps,
+          hooks=self.hooks,
           name=self.name)
     else:
       return self._estimator.evaluate(
-          x=self.x, y=self.y, input_fn=self.input_fn,
-          batch_size=self.batch_size, steps=self.eval_steps,
-          metrics=self.metrics, hooks=self.hooks, name=self.name)
+          x=self.x,
+          y=self.y,
+          input_fn=self.input_fn,
+          batch_size=self.batch_size,
+          steps=self.eval_steps,
+          metrics=self.metrics,
+          hooks=self.hooks,
+          name=self.name)
 
   def every_n_step_end(self, step, outputs):
     super(ValidationMonitor, self).every_n_step_end(step, outputs)
@@ -700,8 +714,9 @@ class ValidationMonitor(EveryN):
     # Early stopping logic.
     if self.early_stopping_rounds is not None:
       if self.early_stopping_metric not in validation_outputs:
-        raise ValueError("Metric %s missing from outputs %s." % (
-            self.early_stopping_metric, set(validation_outputs.keys())))
+        raise ValueError("Metric %s missing from outputs %s." %
+                         (self.early_stopping_metric,
+                          set(validation_outputs.keys())))
       current_value = validation_outputs[self.early_stopping_metric]
       if (self._best_value is None or (self.early_stopping_metric_minimize and
                                        (current_value < self._best_value)) or
@@ -712,9 +727,9 @@ class ValidationMonitor(EveryN):
         self._best_value_step = step
       stop_now = (step - self._best_value_step >= self.early_stopping_rounds)
       if stop_now:
-        logging.info("Stopping. Best step: {} with {} = {}."
-                     .format(self._best_value_step,
-                             self.early_stopping_metric, self._best_value))
+        logging.info("Stopping. Best step: {} with {} = {}.".format(
+            self._best_value_step, self.early_stopping_metric,
+            self._best_value))
         self._early_stopped = True
         return True
     return False
@@ -763,8 +778,11 @@ class CaptureVariable(EveryN):
     self._var_values[step] = _extract_output(outputs, self._var_name)
 
 
-def get_default_monitors(loss_op=None, summary_op=None, save_summary_steps=100,
-                         output_dir=None, summary_writer=None):
+def get_default_monitors(loss_op=None,
+                         summary_op=None,
+                         save_summary_steps=100,
+                         output_dir=None,
+                         summary_writer=None):
   """Returns a default set of typically-used monitors.
 
   Args:
@@ -782,9 +800,12 @@ def get_default_monitors(loss_op=None, summary_op=None, save_summary_steps=100,
   if loss_op is not None:
     monitors.append(PrintTensor(tensor_names={"loss": loss_op.name}))
   if summary_op is not None:
-    monitors.append(SummarySaver(summary_op, save_steps=save_summary_steps,
-                                 output_dir=output_dir,
-                                 summary_writer=summary_writer))
+    monitors.append(
+        SummarySaver(
+            summary_op,
+            save_steps=save_summary_steps,
+            output_dir=output_dir,
+            summary_writer=summary_writer))
   return monitors
 
 
@@ -794,8 +815,10 @@ class GraphDump(BaseMonitor):
   Note, this is very expensive, prefer `PrintTensor` in production.
   """
 
-  IGNORE_OPS = ["Const", "Assign", "Identity", "Placeholder",
-                "RandomUniform", "Cast", "RestoreSlice"]
+  IGNORE_OPS = [
+      "Const", "Assign", "Identity", "Placeholder", "RandomUniform", "Cast",
+      "RestoreSlice"
+  ]
 
   def __init__(self, ignore_ops=None):
     """Initializes GraphDump monitor.
@@ -881,8 +904,8 @@ class ExportMonitor(EveryN):
   """Monitor that exports Estimator every N steps."""
 
   @deprecation.deprecated("2017-03-25",
-              "ExportMonitor is deprecated. Please pass an "
-              "ExportStrategy to Experiment instead.")
+                          "ExportMonitor is deprecated. Please pass an "
+                          "ExportStrategy to Experiment instead.")
   def __init__(self,
                every_n_steps,
                export_dir,
@@ -1088,8 +1111,7 @@ class CheckpointSaver(BaseMonitor):
 class StepCounter(EveryN):
   """Steps per second monitor."""
 
-  def __init__(self, every_n_steps=100, output_dir=None,
-               summary_writer=None):
+  def __init__(self, every_n_steps=100, output_dir=None, summary_writer=None):
     super(StepCounter, self).__init__(every_n_steps=every_n_steps)
     self._summary_tag = "global_step/sec"
     self._last_reported_step = None
@@ -1101,7 +1123,8 @@ class StepCounter(EveryN):
   def set_estimator(self, estimator):
     super(StepCounter, self).set_estimator(estimator)
     if self._summary_writer is None:
-      self._summary_writer = core_summary.FileWriterCache.get(estimator.model_dir)
+      self._summary_writer = core_summary.FileWriterCache.get(
+          estimator.model_dir)
 
   def every_n_step_end(self, current_step, outputs):
     current_time = time.time()
@@ -1109,8 +1132,9 @@ class StepCounter(EveryN):
       added_steps = current_step - self._last_reported_step
       elapsed_time = current_time - self._last_reported_time
       steps_per_sec = added_steps / elapsed_time
-      summary = Summary(value=[Summary.Value(tag=self._summary_tag,
-                                             simple_value=steps_per_sec)])
+      summary = Summary(value=[
+          Summary.Value(tag=self._summary_tag, simple_value=steps_per_sec)
+      ])
       self._summary_writer.add_summary(summary, current_step)
     self._last_reported_step = current_step
     self._last_reported_time = current_time
diff --git a/tensorflow/contrib/learn/python/learn/trainable.py b/tensorflow/contrib/learn/python/learn/trainable.py
index 972fec026f25d39dca75e8c5bafffb57fcd323fa..429b6040be21d8cbe1f2bba58090366552fdfbe7 100644
--- a/tensorflow/contrib/learn/python/learn/trainable.py
+++ b/tensorflow/contrib/learn/python/learn/trainable.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """`Trainable` interface."""
 
 from __future__ import absolute_import
@@ -28,18 +27,31 @@ class Trainable(object):
   __metaclass__ = abc.ABCMeta
 
   @abc.abstractmethod
-  def fit(self, x=None, y=None, input_fn=None, steps=None, batch_size=None,
-          monitors=None, max_steps=None):
+  def fit(self,
+          x=None,
+          y=None,
+          input_fn=None,
+          steps=None,
+          batch_size=None,
+          monitors=None,
+          max_steps=None):
     """Trains a model given training data `x` predictions and `y` labels.
 
     Args:
-      x: Matrix of shape [n_samples, n_features...] or the dictionary of Matrices.
-         Can be iterator that returns arrays of features or dictionary of arrays of features.
-         The training input samples for fitting the model. If set, `input_fn` must be `None`.
-      y: Vector or matrix [n_samples] or [n_samples, n_outputs] or the dictionary of same.
-         Can be iterator that returns array of labels or dictionary of array of labels.
-         The training label values (class labels in classification, real numbers in regression).
-         If set, `input_fn` must be `None`. Note: For classification, label values must
+      x: Matrix of shape [n_samples, n_features...] or the dictionary of
+        Matrices.
+         Can be iterator that returns arrays of features or dictionary of arrays
+           of features.
+         The training input samples for fitting the model. If set, `input_fn`
+           must be `None`.
+      y: Vector or matrix [n_samples] or [n_samples, n_outputs] or the
+        dictionary of same.
+         Can be iterator that returns array of labels or dictionary of array of
+           labels.
+         The training label values (class labels in classification, real numbers
+           in regression).
+         If set, `input_fn` must be `None`. Note: For classification, label
+           values must
          be integers representing the class index (i.e. values from 0 to
          n_classes-1).
       input_fn: Input function returning a tuple of:
diff --git a/tensorflow/contrib/learn/python/learn/utils/export_test.py b/tensorflow/contrib/learn/python/learn/utils/export_test.py
index 95070ada3b9d3ccb00009bd9b885e8163d7fbed4..9bfb1fc952c07bd6c09d1f1074e8dc5539dc0529 100644
--- a/tensorflow/contrib/learn/python/learn/utils/export_test.py
+++ b/tensorflow/contrib/learn/python/learn/utils/export_test.py
@@ -50,6 +50,7 @@ def _training_input_fn():
 
 
 class ExportTest(test.TestCase):
+
   def _get_default_signature(self, export_meta_filename):
     """ Gets the default signature from the export.meta file. """
     with session.Session():
@@ -69,18 +70,18 @@ class ExportTest(test.TestCase):
     # Only the written checkpoints are exported.
     self.assertTrue(
         saver.checkpoint_exists(os.path.join(export_dir, '00000001', 'export')),
-        'Exported checkpoint expected but not found: %s' %
-        os.path.join(export_dir, '00000001', 'export'))
+        'Exported checkpoint expected but not found: %s' % os.path.join(
+            export_dir, '00000001', 'export'))
     self.assertTrue(
         saver.checkpoint_exists(os.path.join(export_dir, '00000010', 'export')),
-        'Exported checkpoint expected but not found: %s' %
-        os.path.join(export_dir, '00000010', 'export'))
+        'Exported checkpoint expected but not found: %s' % os.path.join(
+            export_dir, '00000010', 'export'))
     self.assertEquals(
         six.b(os.path.join(export_dir, '00000010')),
         export_monitor.last_export_dir)
     # Validate the signature
     signature = self._get_default_signature(
-      os.path.join(export_dir, '00000010', 'export.meta'))
+        os.path.join(export_dir, '00000010', 'export.meta'))
     self.assertTrue(signature.HasField(expected_signature))
 
   def testExportMonitor_EstimatorProvidesSignature(self):
@@ -116,8 +117,7 @@ class ExportTest(test.TestCase):
     def _serving_input_fn():
       return {
           _X_KEY:
-              random_ops.random_uniform(
-                  shape=(1,), minval=0.0, maxval=1000.0)
+              random_ops.random_uniform(shape=(1,), minval=0.0, maxval=1000.0)
       }, None
 
     input_feature_key = 'my_example_key'
@@ -160,8 +160,7 @@ class ExportTest(test.TestCase):
           input_feature_key:
               None,
           _X_KEY:
-              random_ops.random_uniform(
-                  shape=(1,), minval=0.0, maxval=1000.0)
+              random_ops.random_uniform(shape=(1,), minval=0.0, maxval=1000.0)
       }, None
 
     monitor = learn.monitors.ExportMonitor(
@@ -182,8 +181,7 @@ class ExportTest(test.TestCase):
     def _serving_input_fn():
       return {
           input_feature_key:
-              array_ops.placeholder(
-                  dtype=dtypes.string, shape=(1,))
+              array_ops.placeholder(dtype=dtypes.string, shape=(1,))
       }, None
 
     monitor = learn.monitors.ExportMonitor(
@@ -204,11 +202,9 @@ class ExportTest(test.TestCase):
     def _serving_input_fn():
       return {
           input_feature_key:
-              array_ops.placeholder(
-                  dtype=dtypes.string, shape=(1,)),
+              array_ops.placeholder(dtype=dtypes.string, shape=(1,)),
           _X_KEY:
-              random_ops.random_uniform(
-                  shape=(1,), minval=0.0, maxval=1000.0)
+              random_ops.random_uniform(shape=(1,), minval=0.0, maxval=1000.0)
       }, None
 
     export_dir = os.path.join(tempfile.mkdtemp(), 'export')
@@ -227,8 +223,8 @@ class ExportTest(test.TestCase):
 
     def _regression_signature(examples, unused_features, predictions):
       signatures = {}
-      signatures['regression'] = (exporter.regression_signature(examples,
-                                                                predictions))
+      signatures['regression'] = (
+          exporter.regression_signature(examples, predictions))
       return signatures['regression'], signatures
 
     random.seed(42)
@@ -248,10 +244,10 @@ class ExportTest(test.TestCase):
     with self.assertRaises(errors.NotFoundError):
       saver.checkpoint_exists(os.path.join(export_dir, '00000000', 'export'))
     self.assertTrue(
-      saver.checkpoint_exists(os.path.join(export_dir, '00000010', 'export')))
+        saver.checkpoint_exists(os.path.join(export_dir, '00000010', 'export')))
     # Validate the signature
     signature = self._get_default_signature(
-      os.path.join(export_dir, '00000010', 'export.meta'))
+        os.path.join(export_dir, '00000010', 'export.meta'))
     self.assertTrue(signature.HasField('regression_signature'))
 
 
diff --git a/tensorflow/contrib/learn/python/learn/utils/gc_test.py b/tensorflow/contrib/learn/python/learn/utils/gc_test.py
index 76cfd88e1d68856907131f7e2bae65d4c9fcc4b1..e7d091e18a8f186f89f5217442c24fb106c5cdab 100644
--- a/tensorflow/contrib/learn/python/learn/utils/gc_test.py
+++ b/tensorflow/contrib/learn/python/learn/utils/gc_test.py
@@ -34,12 +34,13 @@ def _create_parser(base_dir):
   # create a simple parser that pulls the export_version from the directory.
   def parser(path):
     # Modify the path object for RegEx match for Windows Paths
-    if os.name == 'nt':
-      match = re.match("^" + compat.as_str_any(base_dir).replace('\\','/') + "/(\\d+)$",
-                      compat.as_str_any(path.path).replace('\\','/'))
+    if os.name == "nt":
+      match = re.match(
+          "^" + compat.as_str_any(base_dir).replace("\\", "/") + "/(\\d+)$",
+          compat.as_str_any(path.path).replace("\\", "/"))
     else:
       match = re.match("^" + compat.as_str_any(base_dir) + "/(\\d+)$",
-                      compat.as_str_any(path.path))
+                       compat.as_str_any(path.path))
     if not match:
       return None
     return path._replace(export_version=int(match.group(1)))
@@ -63,7 +64,9 @@ class GcTest(test_util.TensorFlowTestCase):
 
   def testModExportVersion(self):
     paths = [
-        gc.Path("/foo", 4), gc.Path("/foo", 5), gc.Path("/foo", 6),
+        gc.Path("/foo", 4),
+        gc.Path("/foo", 5),
+        gc.Path("/foo", 6),
         gc.Path("/foo", 9)
     ]
     mod = gc.mod_export_version(2)
@@ -73,14 +76,21 @@ class GcTest(test_util.TensorFlowTestCase):
 
   def testOneOfEveryNExportVersions(self):
     paths = [
-        gc.Path("/foo", 0), gc.Path("/foo", 1), gc.Path("/foo", 3),
-        gc.Path("/foo", 5), gc.Path("/foo", 6), gc.Path("/foo", 7),
-        gc.Path("/foo", 8), gc.Path("/foo", 33)
+        gc.Path("/foo", 0),
+        gc.Path("/foo", 1),
+        gc.Path("/foo", 3),
+        gc.Path("/foo", 5),
+        gc.Path("/foo", 6),
+        gc.Path("/foo", 7),
+        gc.Path("/foo", 8),
+        gc.Path("/foo", 33)
     ]
     one_of = gc.one_of_every_n_export_versions(3)
     self.assertEqual(
         one_of(paths), [
-            gc.Path("/foo", 3), gc.Path("/foo", 6), gc.Path("/foo", 8),
+            gc.Path("/foo", 3),
+            gc.Path("/foo", 6),
+            gc.Path("/foo", 8),
             gc.Path("/foo", 33)
         ])
 
@@ -98,13 +108,19 @@ class GcTest(test_util.TensorFlowTestCase):
     f = gc.union(gc.largest_export_versions(3), gc.mod_export_version(3))
     self.assertEqual(
         f(paths), [
-            gc.Path("/foo", 0), gc.Path("/foo", 3), gc.Path("/foo", 6),
-            gc.Path("/foo", 7), gc.Path("/foo", 8), gc.Path("/foo", 9)
+            gc.Path("/foo", 0),
+            gc.Path("/foo", 3),
+            gc.Path("/foo", 6),
+            gc.Path("/foo", 7),
+            gc.Path("/foo", 8),
+            gc.Path("/foo", 9)
         ])
 
   def testNegation(self):
     paths = [
-        gc.Path("/foo", 4), gc.Path("/foo", 5), gc.Path("/foo", 6),
+        gc.Path("/foo", 4),
+        gc.Path("/foo", 5),
+        gc.Path("/foo", 6),
         gc.Path("/foo", 9)
     ]
     mod = gc.negation(gc.mod_export_version(2))
@@ -121,8 +137,7 @@ class GcTest(test_util.TensorFlowTestCase):
     gfile.MakeDirs(os.path.join(base_dir, "ignore"))
 
     self.assertEqual(
-        gc.get_paths(base_dir, _create_parser(base_dir)),
-        [
+        gc.get_paths(base_dir, _create_parser(base_dir)), [
             gc.Path(os.path.join(base_dir, "0"), 0),
             gc.Path(os.path.join(base_dir, "1"), 1),
             gc.Path(os.path.join(base_dir, "2"), 2)
@@ -131,10 +146,10 @@ class GcTest(test_util.TensorFlowTestCase):
   def testMixedStrTypes(self):
     temp_dir = compat.as_bytes(test.get_temp_dir())
 
-    for sub_dir in ['str', b'bytes', u'unicode']:
+    for sub_dir in ["str", b"bytes", u"unicode"]:
       base_dir = os.path.join(
-          (temp_dir if isinstance(sub_dir, bytes) else temp_dir.decode()),
-          sub_dir)
+          (temp_dir
+           if isinstance(sub_dir, bytes) else temp_dir.decode()), sub_dir)
       self.assertFalse(gfile.Exists(base_dir))
       gfile.MakeDirs(os.path.join(compat.as_str_any(base_dir), "42"))
       gc.get_paths(base_dir, _create_parser(base_dir))
diff --git a/tensorflow/contrib/linear_optimizer/BUILD b/tensorflow/contrib/linear_optimizer/BUILD
index fe2f183ac970cef4ebf6ca1a927b5a48eefb7d7b..cea3627ed565f0de86d8d9bb6b45c4b19c5b5558 100644
--- a/tensorflow/contrib/linear_optimizer/BUILD
+++ b/tensorflow/contrib/linear_optimizer/BUILD
@@ -126,6 +126,7 @@ py_library(
 py_test(
     name = "sdca_estimator_test",
     srcs = ["python/sdca_estimator_test.py"],
+    shard_count = 4,
     srcs_version = "PY2AND3",
     deps = [
         ":sdca_estimator_py",
diff --git a/tensorflow/contrib/lite/builtin_op_data.h b/tensorflow/contrib/lite/builtin_op_data.h
index 0b48ef4741ac921e34dd56930783499c5040d581..a1037a525c35b1e3e04e8b53f423696375a6ef84 100644
--- a/tensorflow/contrib/lite/builtin_op_data.h
+++ b/tensorflow/contrib/lite/builtin_op_data.h
@@ -116,25 +116,9 @@ typedef struct {
 } TfLiteAddParams;
 
 typedef struct {
-  // Number of spatial dimensions.
-  // For now only NHWC is supported, and the value should always be 2.
-  int num_spatial_dimensions;
-  // TODO(ahentz): We can't have dynamic data in this struct, at least not yet.
-  // For now we will fix the maximum possible number of dimensions.
-  int block_shape[2];
-  int before_paddings[2];
-  int after_paddings[2];
 } TfLiteSpaceToBatchNDParams;
 
 typedef struct {
-  // Number of spatial dimensions.
-  // For now only NHWC is supported, and the value should always be 2.
-  int num_spatial_dimensions;
-  // TODO(ahentz): We can't have dynamic data in this struct, at least not yet.
-  // For now we will fix the maximum possible number of dimensions.
-  int block_shape[2];
-  int before_crops[2];
-  int after_crops[2];
 } TfLiteBatchToSpaceNDParams;
 
 typedef struct {
@@ -167,8 +151,6 @@ typedef struct {
 } TfLiteLSTMParams;
 
 typedef struct {
-  int new_height;
-  int new_width;
 } TfLiteResizeBilinearParams;
 
 typedef struct {
@@ -206,17 +188,9 @@ typedef struct {
 } TfLiteGatherParams;
 
 typedef struct {
-  // TODO(ahentz): We can't have dynamic data in this struct, at least not yet.
-  // For now we will fix the maximum possible number of dimensions.
-  int perm[8];
-  int num_dimensions;
 } TfLiteTransposeParams;
 
 typedef struct {
-  // TODO(ahentz): We can't have dynamic data in this struct, at least not yet.
-  // For now we will fix the maximum possible number of dimensions.
-  int axis[8];
-  int num_axis_dimensions;
   bool keep_dims;
 } TfLiteMeanParams;
 
diff --git a/tensorflow/contrib/lite/examples/ios/camera/CameraExampleViewController.mm b/tensorflow/contrib/lite/examples/ios/camera/CameraExampleViewController.mm
index 10f31bb6f17242c9f7f70f0648ec643f99c5ac86..d74e275f0439b1ce56b29e0eadff5f211f6a4faa 100644
--- a/tensorflow/contrib/lite/examples/ios/camera/CameraExampleViewController.mm
+++ b/tensorflow/contrib/lite/examples/ios/camera/CameraExampleViewController.mm
@@ -225,14 +225,8 @@ static void GetTopN(const uint8_t* prediction, const int prediction_size, const
   assert(pixelBuffer != NULL);
 
   OSType sourcePixelFormat = CVPixelBufferGetPixelFormatType(pixelBuffer);
-  int doReverseChannels;
-  if (kCVPixelFormatType_32ARGB == sourcePixelFormat) {
-    doReverseChannels = 1;
-  } else if (kCVPixelFormatType_32BGRA == sourcePixelFormat) {
-    doReverseChannels = 0;
-  } else {
-    assert(false);  // Unknown source format
-  }
+  assert(sourcePixelFormat == kCVPixelFormatType_32ARGB ||
+         sourcePixelFormat == kCVPixelFormatType_32BGRA);
 
   const int sourceRowBytes = (int)CVPixelBufferGetBytesPerRow(pixelBuffer);
   const int image_width = (int)CVPixelBufferGetWidth(pixelBuffer);
diff --git a/tensorflow/contrib/lite/examples/label_image/BUILD b/tensorflow/contrib/lite/examples/label_image/BUILD
index 476d85c0314e331d6d3bad382c331a8458fd01a1..d216cdf69ba8fd3de5a665e5d8b29aa5e01bff13 100644
--- a/tensorflow/contrib/lite/examples/label_image/BUILD
+++ b/tensorflow/contrib/lite/examples/label_image/BUILD
@@ -42,7 +42,10 @@ cc_library(
         "bitmap_helpers_impl.h",
         "label_image.h",
     ],
-    deps = ["//tensorflow/contrib/lite:string"],
+    deps = [
+        "//tensorflow/contrib/lite:string",
+        "//tensorflow/contrib/lite/kernels:builtin_ops",
+    ],
 )
 
 # TODO(ahentz): Test disabled as it has a memory leek from read_bmp
diff --git a/tensorflow/contrib/lite/examples/label_image/bitmap_helpers.h b/tensorflow/contrib/lite/examples/label_image/bitmap_helpers.h
index 860e27e5ba9cc9fe23d2a7f9f65dd53bbf76f7a3..471fda2ba465aa5ccad2985a063a6855b7488a05 100644
--- a/tensorflow/contrib/lite/examples/label_image/bitmap_helpers.h
+++ b/tensorflow/contrib/lite/examples/label_image/bitmap_helpers.h
@@ -26,15 +26,15 @@ uint8_t* read_bmp(const std::string& input_bmp_name, int* width, int* height,
                   int* channels, Settings* s);
 
 template <class T>
-void downsize(T* out, uint8_t* in, int image_height, int image_width,
-              int image_channels, int wanted_height, int wanted_width,
-              int wanted_channels, Settings* s);
+void resize(T* out, uint8_t* in, int image_height, int image_width,
+            int image_channels, int wanted_height, int wanted_width,
+            int wanted_channels, Settings* s);
 
 // explicit instantiation
-template void downsize<uint8_t>(uint8_t*, unsigned char*, int, int, int, int,
-                                int, int, Settings*);
-template void downsize<float>(float*, unsigned char*, int, int, int, int, int,
-                              int, Settings*);
+template void resize<uint8_t>(uint8_t*, unsigned char*, int, int, int, int,
+                              int, int, Settings*);
+template void resize<float>(float*, unsigned char*, int, int, int, int, int,
+                            int, Settings*);
 
 }  // namespace label_image
 }  // namespace tflite
diff --git a/tensorflow/contrib/lite/examples/label_image/bitmap_helpers_impl.h b/tensorflow/contrib/lite/examples/label_image/bitmap_helpers_impl.h
index 64a931082b0cbb4632ec3a814ce654d4f9106bc1..33ea695dda8a27ab2f0dd1c75538833debb26b95 100644
--- a/tensorflow/contrib/lite/examples/label_image/bitmap_helpers_impl.h
+++ b/tensorflow/contrib/lite/examples/label_image/bitmap_helpers_impl.h
@@ -16,30 +16,76 @@ limitations under the License.
 #ifndef TENSORFLOW_CONTRIB_LITE_EXAMPLES_LABEL_IMAGE_BITMAP_HELPERS_IMPL_H
 #define TENSORFLOW_CONTRIB_LITE_EXAMPLES_LABEL_IMAGE_BITMAP_HELPERS_IMPL_H
 
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/string_util.h"
+#include "tensorflow/contrib/lite/version.h"
+
 #include "tensorflow/contrib/lite/examples/label_image/label_image.h"
 
 namespace tflite {
 namespace label_image {
 
 template <class T>
-void downsize(T* out, uint8_t* in, int image_height, int image_width,
-              int image_channels, int wanted_height, int wanted_width,
-              int wanted_channels, Settings* s) {
-  for (int y = 0; y < wanted_height; ++y) {
-    const int in_y = (y * image_height) / wanted_height;
-    uint8_t* in_row = in + (in_y * image_width * image_channels);
-    T* out_row = out + (y * wanted_width * wanted_channels);
-    for (int x = 0; x < wanted_width; ++x) {
-      const int in_x = (x * image_width) / wanted_width;
-      uint8_t* in_pixel = in_row + (in_x * image_channels);
-      T* out_pixel = out_row + (x * wanted_channels);
-      for (int c = 0; c < wanted_channels; ++c) {
-        if (s->input_floating)
-          out_pixel[c] = (in_pixel[c] - s->input_mean) / s->input_std;
-        else
-          out_pixel[c] = in_pixel[c];
-      }
-    }
+void resize(T* out, uint8_t* in, int image_height, int image_width,
+            int image_channels, int wanted_height, int wanted_width,
+            int wanted_channels, Settings* s) {
+
+  int number_of_pixels = image_height * image_width * image_channels;
+  std::unique_ptr<Interpreter> interpreter(new Interpreter);
+
+  int base_index = 0;
+
+  // two inputs: input and new_sizes
+  interpreter->AddTensors(2, &base_index);
+  // one output
+  interpreter->AddTensors(1, &base_index);
+  // set input and output tensors
+  interpreter->SetInputs({0, 1});
+  interpreter->SetOutputs({2});
+
+  // set paramters of tensors
+  TfLiteQuantizationParams quant;
+  interpreter->SetTensorParametersReadWrite(
+      0, kTfLiteFloat32, "input",
+      {1, image_height, image_width, image_channels}, quant);
+  interpreter->SetTensorParametersReadWrite(1, kTfLiteInt32, "new_size", {2},
+                                            quant);
+  interpreter->SetTensorParametersReadWrite(
+      2, kTfLiteFloat32, "output",
+      {1, wanted_height, wanted_width, wanted_channels}, quant);
+
+  ops::builtin::BuiltinOpResolver resolver;
+  TfLiteRegistration* resize_op =
+      resolver.FindOp(BuiltinOperator_RESIZE_BILINEAR);
+  interpreter->AddNodeWithParameters({0, 1}, {2}, nullptr, 0, nullptr,
+                                     resize_op, nullptr);
+
+  interpreter->AllocateTensors();
+
+  // fill input image
+  // in[] are integers, cannot do memcpy() directly
+  auto input = interpreter->typed_tensor<float>(0);
+  for (int i = 0; i < number_of_pixels; i++) {
+    input[i] = in[i];
+  }
+
+  // fill new_sizes
+  interpreter->typed_tensor<int>(1)[0] = wanted_height;
+  interpreter->typed_tensor<int>(1)[1] = wanted_width;
+
+  interpreter->Invoke();
+
+  auto output = interpreter->typed_tensor<float>(2);
+  auto output_number_of_pixels =
+      wanted_height * wanted_height * wanted_channels;
+
+  for (int i = 0; i < output_number_of_pixels; i++) {
+    if (s->input_floating)
+      out[i] = (output[i] - s->input_mean) / s->input_std;
+    else
+      out[i] = (uint8_t)output[i];
   }
 }
 
diff --git a/tensorflow/contrib/lite/examples/label_image/label_image.cc b/tensorflow/contrib/lite/examples/label_image/label_image.cc
index 4d2e1ce0bc751667393c4b38acc0517980c9f02a..a78900122efa540322a2f80fa3a98e6a8985ddd5 100644
--- a/tensorflow/contrib/lite/examples/label_image/label_image.cc
+++ b/tensorflow/contrib/lite/examples/label_image/label_image.cc
@@ -148,14 +148,22 @@ void RunInference(Settings* s) {
   int wanted_width = dims->data[2];
   int wanted_channels = dims->data[3];
 
-  if (s->input_floating) {
-    downsize<float>(interpreter->typed_tensor<float>(input), in, image_height,
-                    image_width, image_channels, wanted_height, wanted_width,
-                    wanted_channels, s);
-  } else {
-    downsize<uint8_t>(interpreter->typed_tensor<uint8_t>(input), in,
-                      image_height, image_width, image_channels, wanted_height,
-                      wanted_width, wanted_channels, s);
+  switch (interpreter->tensor(input)->type) {
+    case kTfLiteFloat32:
+      s->input_floating = true;
+      resize<float>(interpreter->typed_tensor<float>(input), in,
+                    image_height, image_width, image_channels,
+                    wanted_height, wanted_width, wanted_channels, s);
+      break;
+    case kTfLiteUInt8:
+      resize<uint8_t>(interpreter->typed_tensor<uint8_t>(input), in,
+                      image_height, image_width, image_channels,
+                      wanted_height, wanted_width, wanted_channels, s);
+      break;
+    default:
+      LOG(FATAL) << "cannot handle input type "
+                 << interpreter->tensor(input)->type << " yet";
+      exit(-1);
   }
 
   struct timeval start_time, stop_time;
@@ -177,13 +185,22 @@ void RunInference(Settings* s) {
 
   std::vector<std::pair<float, int>> top_results;
 
-  if (s->input_floating) {
-    get_top_n<float>(interpreter->typed_output_tensor<float>(0), output_size,
-                     num_results, threshold, &top_results, s->input_floating);
-  } else {
-    get_top_n<uint8_t>(interpreter->typed_output_tensor<uint8_t>(0),
+  int output = interpreter->outputs()[0];
+  switch (interpreter->tensor(output)->type) {
+    case kTfLiteFloat32:
+      get_top_n<float>(interpreter->typed_output_tensor<float>(0),
                        output_size, num_results, threshold, &top_results,
-                       s->input_floating);
+                       true);
+      break;
+    case kTfLiteUInt8:
+      get_top_n<uint8_t>(interpreter->typed_output_tensor<uint8_t>(0),
+                         output_size, num_results, threshold, &top_results,
+                         false);
+      break;
+    default:
+      LOG(FATAL) << "cannot handle output type "
+                 << interpreter->tensor(input)->type << " yet";
+      exit(-1);
   }
 
   std::vector<string> labels;
@@ -203,13 +220,11 @@ void display_usage() {
   LOG(INFO) << "label_image\n"
             << "--accelerated, -a: [0|1], use Android NNAPI or note\n"
             << "--count, -c: loop interpreter->Invoke() for certain times\n"
-            << "--input_floating, -f: [0|1] type of input layer is floating "
-               "point numbers\n"
             << "--input_mean, -b: input mean\n"
             << "--input_std, -s: input standard deviation\n"
             << "--image, -i: image_name.bmp\n"
             << "--labels, -l: labels for the model\n"
-            << "--tflite_mode, -m: model_name.tflite\n"
+            << "--tflite_model, -m: model_name.tflite\n"
             << "--threads, -t: number of threads\n"
             << "--verbose, -v: [0|1] print more information\n"
             << "\n";
@@ -223,7 +238,6 @@ int Main(int argc, char** argv) {
     static struct option long_options[] = {
         {"accelerated", required_argument, 0, 'a'},
         {"count", required_argument, 0, 'c'},
-        {"input_floating", required_argument, 0, 'f'},
         {"verbose", required_argument, 0, 'v'},
         {"image", required_argument, 0, 'i'},
         {"labels", required_argument, 0, 'l'},
@@ -254,11 +268,6 @@ int Main(int argc, char** argv) {
         s.loop_count = strtol(  // NOLINT(runtime/deprecated_fn)
             optarg, (char**)NULL, 10);
         break;
-      case 'f':
-        s.input_floating = strtol(  // NOLINT(runtime/deprecated_fn)
-            optarg, (char**)NULL, 10);
-        s.input_layer_type = "float";
-        break;
       case 'i':
         s.input_bmp_name = optarg;
         break;
diff --git a/tensorflow/contrib/lite/examples/label_image/label_image.h b/tensorflow/contrib/lite/examples/label_image/label_image.h
index ce98e06fc162a9588707eae701e2fcb8d648a4e4..4de32e33fb4ef2ab5d0e111886cdc737398147e9 100644
--- a/tensorflow/contrib/lite/examples/label_image/label_image.h
+++ b/tensorflow/contrib/lite/examples/label_image/label_image.h
@@ -16,9 +16,11 @@ limitations under the License.
 #ifndef TENSORFLOW_CONTRIB_LITE_EXAMPLES_LABEL_IMAGE_LABEL_IMAGE_H
 #define TENSORFLOW_CONTRIB_LITE_EXAMPLES_LABEL_IMAGE_LABEL_IMAGE_H
 
-#include <string>
 #include "tensorflow/contrib/lite/string.h"
 
+namespace tflite {
+namespace label_image {
+
 struct Settings {
   bool verbose = false;
   bool accel = false;
@@ -33,4 +35,7 @@ struct Settings {
   int number_of_threads = 4;
 };
 
+}  // namespace label_image
+}  // namespace tflite
+
 #endif  // TENSORFLOW_CONTRIB_LITE_EXAMPLES_LABEL_IMAGE_LABEL_IMAGE_H
diff --git a/tensorflow/contrib/lite/examples/label_image/label_image.md b/tensorflow/contrib/lite/examples/label_image/label_image.md
index d6019d673f1b15429e69b57e8dc9eeaad2825bc3..9ce32cf101897f2d41cd14a485aeb432344928a0 100644
--- a/tensorflow/contrib/lite/examples/label_image/label_image.md
+++ b/tensorflow/contrib/lite/examples/label_image/label_image.md
@@ -1,8 +1,12 @@
 label_image for TensorFlow Lite inspired by TensorFlow's label_image.
+
+To build label_image for Android, run $TENSORFLOW_ROOT/configure 
+and set Android NDK or configure NDK setting in 
+$TENSORFLOW_ROOT/WORKSPACE first.
  
 To build it for android ARMv8:
 ```
-> bazel build --cxxopt=-std=c++11 \
+> bazel build --config monolithic --cxxopt=-std=c++11 \
   --crosstool_top=//external:android/crosstool \
   --host_crosstool_top=@bazel_tools//tools/cpp:toolchain \
   --cpu=arm64-v8a \
@@ -10,13 +14,13 @@ To build it for android ARMv8:
 ```
 or
 ```
-> bazel build --config android_arm64 --cxxopt=-std=c++11 \
+> bazel build --config android_arm64 --config monolithic --cxxopt=-std=c++11 \
   //tensorflow/contrib/lite/examples/label_image:label_image
 ```
 
 To build it for android arm-v7a:
 ```
-> bazel build --cxxopt=-std=c++11 \
+> bazel build --config monolithic --cxxopt=-std=c++11 \
   --crosstool_top=//external:android/crosstool \
   --host_crosstool_top=@bazel_tools//tools/cpp:toolchain \
   --cpu=armeabi-v7a \
@@ -24,7 +28,7 @@ To build it for android arm-v7a:
 ```
 or
 ```
-> bazel build --config android_arm --cxxopt=-std=c++11 \
+> bazel build --config android_arm --config monolithic --cxxopt=-std=c++11 \
   //tensorflow/contrib/lite/examples/label_image:label_image
 ```
 
diff --git a/tensorflow/contrib/lite/interpreter.cc b/tensorflow/contrib/lite/interpreter.cc
index 69a597dc5a219b55eced6ec8da5b388caf372b8e..a8db149eaaef92e8457afe4976ae356a9f28179f 100644
--- a/tensorflow/contrib/lite/interpreter.cc
+++ b/tensorflow/contrib/lite/interpreter.cc
@@ -36,6 +36,10 @@ constexpr const int kSlotsToReserve = 128;
 namespace tflite {
 
 // A trivial implementation of GraphInfo around the Interpreter.
+// NOTE: this interpreter info represents the subset of the
+// graph that is executed according to execution plan. Thus,
+// the indices are execution plan indices rather than raw node
+// indices.
 class InterpreterInfo : public GraphInfo {
  public:
   explicit InterpreterInfo(Interpreter* interpreter)
@@ -45,9 +49,12 @@ class InterpreterInfo : public GraphInfo {
   TfLiteTensor* tensor(size_t index) override {
     return interpreter_->tensor(index);
   }
-  size_t num_nodes() const override { return interpreter_->nodes_size(); }
+  size_t num_nodes() const override {
+    return interpreter_->execution_plan().size();
+  }
   const TfLiteNode& node(size_t index) const override {
-    return interpreter_->node_and_registration(index)->first;
+    int node_index = interpreter_->execution_plan()[index];
+    return interpreter_->node_and_registration(node_index)->first;
   }
   const std::vector<int>& inputs() const override {
     return interpreter_->inputs();
@@ -73,7 +80,7 @@ Interpreter::Interpreter(ErrorReporter* error_reporter)
   // Reserve some space for the tensors to avoid excessive resizing.
   tensors_.reserve(kSlotsToReserve);
   nodes_and_registration_.reserve(kSlotsToReserve);
-  next_node_to_prepare_ = 0;
+  next_execution_plan_index_to_prepare_ = 0;
   UseNNAPI(false);
 }
 
@@ -160,7 +167,7 @@ TfLiteIntArray* convertVectorToTfLiteIntArray(const std::vector<int>& x) {
 }  // namespace
 
 TfLiteStatus Interpreter::AllocateTensors() {
-  next_node_to_prepare_ = 0;
+  next_execution_plan_index_to_prepare_ = 0;
   if (memory_planner_) {
     TF_LITE_ENSURE_STATUS(memory_planner_->ResetAllocations());
   }
@@ -190,7 +197,8 @@ TfLiteStatus Interpreter::AddNodeWithParameters(
       &context_,
       CheckTensorIndices("node outputs", outputs.data(), outputs.size()));
 
-  if (node_index) *node_index = nodes_and_registration_.size();
+  int new_node_index = nodes_and_registration_.size();
+  if (node_index) *node_index = new_node_index;
   nodes_and_registration_.resize(nodes_and_registration_.size() + 1);
   auto& node_and_reg = nodes_and_registration_.back();
   TfLiteNode& node = node_and_reg.first;
@@ -213,6 +221,7 @@ TfLiteStatus Interpreter::AddNodeWithParameters(
   }
   node.builtin_data = builtin_data_deleter.release();
   node_and_reg.second = *registration;
+  execution_plan_.push_back(new_node_index);
   return kTfLiteOk;
 }
 
@@ -240,16 +249,19 @@ bool HasDynamicTensor(const TfLiteContext& context,
   return false;
 }
 
-TfLiteStatus Interpreter::PrepareOpsStartingAt(int first_node,
-                                               int* last_node_prepared) {
-  for (int i = first_node; i < nodes_and_registration_.size(); i++) {
-    TfLiteNode& node = nodes_and_registration_[i].first;
-    const TfLiteRegistration& registration = nodes_and_registration_[i].second;
+TfLiteStatus Interpreter::PrepareOpsStartingAt(
+    int first_execution_plan_index, int* last_execution_plan_index_prepared) {
+  for (int execution_plan_index = first_execution_plan_index;
+       execution_plan_index < execution_plan_.size(); execution_plan_index++) {
+    int node_index = execution_plan_[execution_plan_index];
+    TfLiteNode& node = nodes_and_registration_[node_index].first;
+    const TfLiteRegistration& registration =
+        nodes_and_registration_[node_index].second;
     if (OpPrepare(registration, &node) == kTfLiteError) {
       return kTfLiteError;
     }
 
-    *last_node_prepared = i;
+    *last_execution_plan_index_prepared = execution_plan_index;
 
     // Discontinue if the node has dynamic outputs. Note that we don't
     // stop for dynamic temporary tensors since they won't affect the
@@ -268,14 +280,14 @@ TfLiteStatus Interpreter::PrepareOpsAndTensors() {
     memory_planner_->PlanAllocations();
   }
 
-  int last_node_prepared = 0;
+  int last_exec_plan_index_prepared = 0;
 
-  TF_LITE_ENSURE_STATUS(
-      PrepareOpsStartingAt(next_node_to_prepare_, &last_node_prepared));
+  TF_LITE_ENSURE_STATUS(PrepareOpsStartingAt(
+      next_execution_plan_index_to_prepare_, &last_exec_plan_index_prepared));
   TF_LITE_ENSURE_STATUS(memory_planner_->ExecuteAllocations(
-      next_node_to_prepare_, last_node_prepared));
+      next_execution_plan_index_to_prepare_, last_exec_plan_index_prepared));
 
-  next_node_to_prepare_ = last_node_prepared + 1;
+  next_execution_plan_index_to_prepare_ = last_exec_plan_index_prepared + 1;
   return kTfLiteOk;
 }
 
@@ -291,7 +303,8 @@ TfLiteStatus Interpreter::Invoke() {
 
   TfLiteStatus status = kTfLiteOk;
   if (nnapi_delegate_) {
-    if (next_node_to_prepare_ == nodes_and_registration_.size()) {
+    TF_LITE_ENSURE_STATUS(PrepareOpsAndTensors());
+    if (next_execution_plan_index_to_prepare_ == execution_plan_.size()) {
       TF_LITE_ENSURE_OK(&context_, nnapi_delegate_->Invoke(this));
       return kTfLiteOk;
     } else {
@@ -311,13 +324,17 @@ TfLiteStatus Interpreter::Invoke() {
   // TODO(b/71913981): we should force recalculation in the presence of dynamic
   // tensors, because they may have new value which in turn may affect shapes
   // and allocations.
-  for (int i = 0; i < nodes_and_registration_.size(); i++) {
-    if (i == next_node_to_prepare_) {
+  for (int execution_plan_index = 0;
+       execution_plan_index < execution_plan_.size(); execution_plan_index++) {
+    if (execution_plan_index == next_execution_plan_index_to_prepare_) {
       TF_LITE_ENSURE_STATUS(PrepareOpsAndTensors());
-      TF_LITE_ENSURE(&context_, next_node_to_prepare_ >= i);
+      TF_LITE_ENSURE(&context_, next_execution_plan_index_to_prepare_ >=
+                                    execution_plan_index);
     }
-    TfLiteNode& node = nodes_and_registration_[i].first;
-    const TfLiteRegistration& registration = nodes_and_registration_[i].second;
+    int node_index = execution_plan_[execution_plan_index];
+    TfLiteNode& node = nodes_and_registration_[node_index].first;
+    const TfLiteRegistration& registration =
+        nodes_and_registration_[node_index].second;
     if (OpInvoke(registration, &node) == kTfLiteError) {
       status = kTfLiteError;
     }
@@ -421,6 +438,14 @@ TfLiteStatus Interpreter::SetTensorParametersReadWrite(
   return kTfLiteOk;
 }
 
+TfLiteStatus Interpreter::SetExecutionPlan(const std::vector<int>& new_plan) {
+  for (int node_index : new_plan) {
+    TF_LITE_ENSURE(&context_, node_index >= 0 && node_index < nodes_size());
+  }
+  execution_plan_ = new_plan;
+  return kTfLiteOk;
+}
+
 TfLiteStatus Interpreter::ResizeTensorImpl(TfLiteTensor* tensor,
                                            TfLiteIntArray* new_size) {
   // Note that in theory we could resize kTfLiteArenaRwPersistent tensors too.
diff --git a/tensorflow/contrib/lite/interpreter.h b/tensorflow/contrib/lite/interpreter.h
index 4f732769f9f921a9debd5213547d2baccfa69426..c822557d02c826a39aca2cfdac5e906353340edf 100644
--- a/tensorflow/contrib/lite/interpreter.h
+++ b/tensorflow/contrib/lite/interpreter.h
@@ -108,7 +108,7 @@ class Interpreter {
 
   // Adds a node with the given parameters and returns the index of the new
   // node in `node_index` (optionally). Interpreter will take ownership of
-  // `builtin_data` and destroy it with `delete`. Ownership of 'init_data'
+  // `builtin_data` and destroy it with `free`. Ownership of 'init_data'
   // remains with the caller.
   TfLiteStatus AddNodeWithParameters(const std::vector<int>& inputs,
                                      const std::vector<int>& outputs,
@@ -166,12 +166,19 @@ class Interpreter {
   // Return the number of ops in the model.
   int nodes_size() const { return nodes_and_registration_.size(); }
 
+  // WARNING: Experimental interface, subject to change
+  const std::vector<int>& execution_plan() const { return execution_plan_; }
+
+  // WARNING: Experimental interface, subject to change
+  // Overrides execution plan. This bounds checks indices sent in.
+  TfLiteStatus SetExecutionPlan(const std::vector<int>& new_plan);
+
   // Get a tensor data structure.
   // TODO(aselle): Create a safe ArrayHandle interface to avoid exposing this
   // read/write access to structure
   TfLiteTensor* tensor(int tensor_index) {
     if (tensor_index >= context_.tensors_size || tensor_index < 0)
-        return nullptr;
+      return nullptr;
     return &context_.tensors[tensor_index];
   }
 
@@ -279,7 +286,8 @@ class Interpreter {
   // dynamic tensors is found or all ops have been prepared. Fill
   // 'last_node_prepared' with the id of the op containing dynamic tensors, or
   // the last in the graph.
-  TfLiteStatus PrepareOpsStartingAt(int first_node, int* last_node_prepared);
+  TfLiteStatus PrepareOpsStartingAt(int first_execution_plan_index,
+                                    int* last_execution_plan_index_prepared);
 
   // Tensors needed by the interpreter. Use `AddTensors` to add more blank
   // tensor entries. Note, `tensors_.data()` needs to be synchronized to the
@@ -354,7 +362,14 @@ class Interpreter {
   // node id, and execute the node to generate the output tensor before continue
   // to allocate successors. This process repeats until all nodes are executed.
   // NOTE: this relies on the order of nodes that is in topological order.
-  int next_node_to_prepare_;
+  int next_execution_plan_index_to_prepare_;
+
+  // WARNING: This is an experimental interface that is subject to change.
+  // This is a list of node indices (to index into nodes_and_registration).
+  // This represents a valid topological sort (dependency ordered) execution
+  // plan. In particular, it is valid for this ordering to contain only a
+  // subset of the node indices.
+  std::vector<int> execution_plan_;
 
   // Whether to delegate to NN API
   std::unique_ptr<NNAPIDelegate> nnapi_delegate_;
diff --git a/tensorflow/contrib/lite/interpreter_test.cc b/tensorflow/contrib/lite/interpreter_test.cc
index edff2109430c6e1ec6c481619ed7772237a3301d..2ab4bb6567393e812954aadb772636c173ca5324 100644
--- a/tensorflow/contrib/lite/interpreter_test.cc
+++ b/tensorflow/contrib/lite/interpreter_test.cc
@@ -514,6 +514,133 @@ TEST(BasicInterpreter, TestCustomErrorReporter) {
   ASSERT_EQ(reporter.calls, 1);
 }
 
+// Test fixture that allows playing with execution plans. It creates a two
+// node graph that can be executed in either [0,1] order or [1,0] order.
+// The CopyOp records when it is invoked in the class member run_order_
+// so we can test whether the execution plan was honored.
+class TestExecutionPlan : public ::testing::Test {
+  // Encapsulates the node ids and provides them to a C primitive data type
+  // Allocatable with placement new, but never destructed, so make sure this
+  // doesn't own any heap allocated data. This is then is used as op local
+  // data to allow access to the test fixture data.
+  class CallReporting {
+   public:
+    CallReporting(int node_id, std::vector<int>* run_order)
+        : node_id_(node_id), run_order_(run_order) {}
+
+    void Record() { run_order_->push_back(node_id_); }
+
+   private:
+    // The node id for this particular node
+    int node_id_;
+    // A pointer to the global run-order
+    std::vector<int>* run_order_;
+  };
+
+  // Build a kernel registration for an op that copies its one input
+  // to an output
+  TfLiteRegistration CopyOpRegistration() {
+    TfLiteRegistration reg = {nullptr, nullptr, nullptr, nullptr};
+
+    reg.prepare = [](TfLiteContext* context, TfLiteNode* node) {
+      // Set output size to input size
+      TfLiteTensor* tensor0 = &context->tensors[node->inputs->data[0]];
+      TfLiteTensor* tensor1 = &context->tensors[node->outputs->data[0]];
+      TfLiteIntArray* newSize = TfLiteIntArrayCopy(tensor0->dims);
+      return context->ResizeTensor(context, tensor1, newSize);
+    };
+
+    reg.invoke = [](TfLiteContext* context, TfLiteNode* node) {
+      CallReporting* call_reporting =
+          reinterpret_cast<CallReporting*>(node->builtin_data);
+      // Copy input data to output data.
+      TfLiteTensor* a0 = &context->tensors[node->inputs->data[0]];
+      TfLiteTensor* a1 = &context->tensors[node->outputs->data[0]];
+      int num = a0->dims->data[0];
+      for (int i = 0; i < num; i++) {
+        a1->data.f[i] = a0->data.f[i];
+      }
+      call_reporting->Record();
+      return kTfLiteOk;
+    };
+    return reg;
+  }
+
+  // Adds a copy node going from tensor `input` to output tensor `output`.
+  // Note, input is used as the node_id. Inject run_order as op accessible
+  // data. Note: this is a little strange of a way to do this, but it is
+  // using op functionality to avoid static global variables.
+  void MakeCopyNode(int input, int output) {
+    // Ownership of call_reporting is taken by interpreter (malloc is used due
+    // to nodes being a C99 interface so free() is used).
+    TfLiteRegistration copy_op = CopyOpRegistration();
+    CallReporting* call_reporting_1 =
+        reinterpret_cast<CallReporting*>(malloc(sizeof(CallReporting)));
+    new (call_reporting_1) CallReporting(input, &run_order_);
+    ASSERT_EQ(interpreter_.AddNodeWithParameters(
+                  {0}, {2}, nullptr, 0,
+                  reinterpret_cast<void*>(call_reporting_1), &copy_op),
+              kTfLiteOk);
+    ASSERT_EQ(interpreter_.ResizeInputTensor(input, {3}), kTfLiteOk);
+  }
+
+  void SetUp() final {
+    // Add two inputs and two outputs that don't depend on each other
+    ASSERT_EQ(interpreter_.AddTensors(4), kTfLiteOk);
+    interpreter_.SetInputs({0, 1});
+    interpreter_.SetOutputs({2, 3});
+    TfLiteQuantizationParams quantized;
+    for (int tensor_index = 0; tensor_index < 4; tensor_index++) {
+      ASSERT_EQ(interpreter_.SetTensorParametersReadWrite(
+                    tensor_index, kTfLiteFloat32, "", {3}, quantized),
+                kTfLiteOk);
+    }
+
+    // Define two copy functions that also use the user_data to report that
+    // they were called.
+    // i.e. tensor[2] = copy(tensor[0]); tensor[3] = copy(tensor[1]);
+    // thus we can reorder the two nodes arbitrary and still satisfy dependency
+    // order.
+    MakeCopyNode(0, 2);
+    MakeCopyNode(1, 3);
+
+    ASSERT_EQ(interpreter_.AllocateTensors(), kTfLiteOk);
+  }
+
+ protected:
+  Interpreter interpreter_;
+
+  // list of node_ids that were run
+  std::vector<int> run_order_;
+};
+
+TEST_F(TestExecutionPlan, DefaultExecutionPlan) {
+  // Check default order
+  ASSERT_EQ(interpreter_.Invoke(), kTfLiteOk);
+  ASSERT_EQ(run_order_, std::vector<int>({0, 1}));
+}
+
+TEST_F(TestExecutionPlan, ReversedExecutionPlan) {
+  // Check reversed order
+  interpreter_.SetExecutionPlan({1, 0});
+  ASSERT_EQ(interpreter_.Invoke(), kTfLiteOk);
+  ASSERT_EQ(run_order_, std::vector<int>({1, 0}));
+}
+
+TEST_F(TestExecutionPlan, SubsetExecutionPlan) {
+  // Check running only node index 1
+  interpreter_.SetExecutionPlan({1});
+  ASSERT_EQ(interpreter_.Invoke(), kTfLiteOk);
+  ASSERT_EQ(run_order_, std::vector<int>({1}));
+}
+
+TEST_F(TestExecutionPlan, NullExecutionPlan) {
+  // Check nothing executed.
+  interpreter_.SetExecutionPlan({});
+  ASSERT_EQ(interpreter_.Invoke(), kTfLiteOk);
+  ASSERT_EQ(run_order_, std::vector<int>());
+}
+
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/contrib/lite/kernels/BUILD b/tensorflow/contrib/lite/kernels/BUILD
index 4195e7553c48028d56e80db0d204ef5656be874d..8c40adfae5338ceac9e355483fdbd906ec9f484a 100644
--- a/tensorflow/contrib/lite/kernels/BUILD
+++ b/tensorflow/contrib/lite/kernels/BUILD
@@ -71,6 +71,32 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "kernel_util",
+    srcs = [
+        "kernel_util.cc",
+    ],
+    hdrs = [
+        "kernel_util.h",
+    ],
+    deps = [
+        "//tensorflow/contrib/lite:builtin_op_data",
+        "//tensorflow/contrib/lite:context",
+        "//tensorflow/contrib/lite/kernels/internal:round",
+    ],
+)
+
+tf_cc_test(
+    name = "kernel_util_test",
+    size = "small",
+    srcs = ["kernel_util_test.cc"],
+    deps = [
+        ":kernel_util",
+        "//tensorflow/contrib/lite/testing:util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
 cc_library(
     name = "builtin_ops",
     srcs = [
@@ -78,6 +104,7 @@ cc_library(
         "add.cc",
         "basic_rnn.cc",
         "batch_to_space_nd.cc",
+        "bidirectional_sequence_rnn.cc",
         "concatenation.cc",
         "conv.cc",
         "depthwise_conv.cc",
@@ -87,7 +114,6 @@ cc_library(
         "fully_connected.cc",
         "gather.cc",
         "hashtable_lookup.cc",
-        "kernel_util.cc",
         "l2norm.cc",
         "local_response_norm.cc",
         "lsh_projection.cc",
@@ -111,7 +137,6 @@ cc_library(
         "unidirectional_sequence_rnn.cc",
     ],
     hdrs = [
-        "kernel_util.h",
         "padding.h",
         "register.h",
     ],
@@ -125,6 +150,7 @@ cc_library(
     }),
     deps = [
         ":activation_functor",
+        ":kernel_util",
         ":op_macros",
         "//tensorflow/contrib/lite:builtin_op_data",
         "//tensorflow/contrib/lite:framework",
@@ -223,6 +249,7 @@ tf_cc_test(
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
         "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_absl//absl/memory",
         "@com_google_googletest//:gtest",
     ],
 )
@@ -263,6 +290,18 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "bidirectional_sequence_rnn_test",
+    size = "small",
+    srcs = ["bidirectional_sequence_rnn_test.cc"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
 tf_cc_test(
     name = "unidirectional_sequence_rnn_test",
     size = "small",
diff --git a/tensorflow/contrib/lite/kernels/activations.cc b/tensorflow/contrib/lite/kernels/activations.cc
index 8ac93bc8c8dcfc66d3822e01b6f9b29a3e49c446..3c5c77815d0f2592ab549152b4d77f45b967a660 100644
--- a/tensorflow/contrib/lite/kernels/activations.cc
+++ b/tensorflow/contrib/lite/kernels/activations.cc
@@ -15,8 +15,8 @@ limitations under the License.
 #include <unistd.h>
 #include <cassert>
 #include <cmath>
-#include <cstdlib>
 #include <cstdio>
+#include <cstdlib>
 #include <iostream>
 #include <limits>
 
@@ -134,8 +134,7 @@ TfLiteStatus ReluEval(TfLiteContext* context, TfLiteNode* node) {
       float* out = output->data.f;
       for (; in < in_end; in++, out++) *out = std::max(0.f, *in);
       return kTfLiteOk;
-    }
-    break;
+    } break;
     default:
       context->ReportError(context, "Only float32 supported currently.");
       return kTfLiteError;
@@ -173,8 +172,7 @@ TfLiteStatus Relu6Eval(TfLiteContext* context, TfLiteNode* node) {
       float* out = output->data.f;
       for (; in < in_end; in++, out++) *out = std::min(std::max(0.f, *in), 6.f);
       return kTfLiteOk;
-    }
-    break;
+    } break;
     default:
       context->ReportError(context, "Only float32 supported currently.");
       return kTfLiteError;
@@ -192,8 +190,7 @@ TfLiteStatus TanhEval(TfLiteContext* context, TfLiteNode* node) {
       float* out = output->data.f;
       for (; in < in_end; in++, out++) *out = std::tanh(*in);
       return kTfLiteOk;
-    }
-    break;
+    } break;
     default:
       context->ReportError(context, "Only float32 supported currently.");
       return kTfLiteError;
diff --git a/tensorflow/contrib/lite/kernels/add.cc b/tensorflow/contrib/lite/kernels/add.cc
index 0e10a249abac3ba19cf107e055aa71d1eee00122..63ea89df56bafa995950afec3a58267681af304f 100644
--- a/tensorflow/contrib/lite/kernels/add.cc
+++ b/tensorflow/contrib/lite/kernels/add.cc
@@ -37,7 +37,23 @@ constexpr int kInputTensor1 = 0;
 constexpr int kInputTensor2 = 1;
 constexpr int kOutputTensor = 0;
 
+struct OpData {
+  bool requires_broadcast;
+};
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  auto* data = new OpData;
+  data->requires_broadcast = false;
+  return data;
+}
+
+void Free(TfLiteContext* context, void* buffer) {
+  delete reinterpret_cast<OpData*>(buffer);
+}
+
 TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  OpData* data = reinterpret_cast<OpData*>(node->user_data);
+
   TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
 
@@ -45,43 +61,56 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
 
-  TF_LITE_ENSURE_EQ(context, NumDimensions(input1), NumDimensions(input2));
-  for (int i = 0; i < NumDimensions(input1); ++i) {
-    TF_LITE_ENSURE_EQ(context, SizeOfDimension(input1, i),
-                      SizeOfDimension(input2, i));
-  }
+  TF_LITE_ENSURE_EQ(context, input1->type, input2->type);
+  output->type = input2->type;
 
-  TF_LITE_ENSURE_EQ(context, input1->type, output->type);
-  TF_LITE_ENSURE_EQ(context, input2->type, output->type);
+  data->requires_broadcast = !HaveSameShapes(input1, input2);
+
+  TfLiteIntArray* output_size = nullptr;
+  if (data->requires_broadcast) {
+    TF_LITE_ENSURE_OK(context, CalculateShapeForBroadcast(
+                                   context, input1, input2, &output_size));
+  } else {
+    output_size = TfLiteIntArrayCopy(input1->dims);
+  }
 
-  TfLiteIntArray* output_size = TfLiteIntArrayCopy(input1->dims);
   return context->ResizeTensor(context, output, output_size);
 }
 
 template <KernelType kernel_type>
 void EvalAddFloat(TfLiteContext* context, TfLiteNode* node,
-                  TfLiteAddParams* params, TfLiteTensor* input1,
-                  TfLiteTensor* input2, TfLiteTensor* output) {
+                  TfLiteAddParams* params, const OpData* data,
+                  TfLiteTensor* input1, TfLiteTensor* input2,
+                  TfLiteTensor* output) {
   float output_activation_min, output_activation_max;
   CalculateActivationRangeFloat(params->activation, &output_activation_min,
                                 &output_activation_max);
-#define TF_LITE_ADD(type)                                        \
-  type::Add(GetTensorData<float>(input1), GetTensorDims(input1), \
-            GetTensorData<float>(input2), GetTensorDims(input2), \
-            output_activation_min, output_activation_max,        \
-            GetTensorData<float>(output), GetTensorDims(output))
-    if (kernel_type == kReference) {
-      TF_LITE_ADD(reference_ops);
+#define TF_LITE_ADD(type, opname)                                   \
+  type::opname(GetTensorData<float>(input1), GetTensorDims(input1), \
+               GetTensorData<float>(input2), GetTensorDims(input2), \
+               output_activation_min, output_activation_max,        \
+               GetTensorData<float>(output), GetTensorDims(output))
+  if (kernel_type == kReference) {
+    if (data->requires_broadcast) {
+      TF_LITE_ADD(reference_ops, BroadcastAdd);
     } else {
-      TF_LITE_ADD(optimized_ops);
+      TF_LITE_ADD(reference_ops, Add);
+    }
+  } else {
+    if (data->requires_broadcast) {
+      TF_LITE_ADD(optimized_ops, BroadcastAdd);
+    } else {
+      TF_LITE_ADD(optimized_ops, Add);
+    }
   }
 #undef TF_LITE_ADD
 }
 
 template <KernelType kernel_type>
 void EvalAddQuantized(TfLiteContext* context, TfLiteNode* node,
-                      TfLiteAddParams* params, TfLiteTensor* input1,
-                      TfLiteTensor* input2, TfLiteTensor* output) {
+                      TfLiteAddParams* params, const OpData* data,
+                      TfLiteTensor* input1, TfLiteTensor* input2,
+                      TfLiteTensor* output) {
   auto input1_offset = -input1->params.zero_point;
   auto input2_offset = -input2->params.zero_point;
   auto output_offset = output->params.zero_point;
@@ -112,19 +141,20 @@ void EvalAddQuantized(TfLiteContext* context, TfLiteNode* node,
   CalculateActivationRangeUint8(params->activation, output,
                                 &output_activation_min, &output_activation_max);
 
-#define TF_LITE_ADD(type)                                                   \
-  type::BroadcastAdd(                                                       \
-      left_shift, GetTensorData<uint8_t>(input1), GetTensorDims(input1),    \
-      input1_offset, input1_multiplier, input1_shift,                       \
-      GetTensorData<uint8_t>(input2), GetTensorDims(input2), input2_offset, \
-      input2_multiplier, input2_shift, output_offset, output_multiplier,    \
-      output_shift, output_activation_min, output_activation_max,           \
-      GetTensorData<uint8_t>(output), GetTensorDims(output));
-
+#define TF_LITE_ADD(type, opname)                                            \
+  type::opname(left_shift, GetTensorData<uint8_t>(input1),                   \
+               GetTensorDims(input1), input1_offset, input1_multiplier,      \
+               input1_shift, GetTensorData<uint8_t>(input2),                 \
+               GetTensorDims(input2), input2_offset, input2_multiplier,      \
+               input2_shift, output_offset, output_multiplier, output_shift, \
+               output_activation_min, output_activation_max,                 \
+               GetTensorData<uint8_t>(output), GetTensorDims(output));
+  // The quantized version of Add doesn't support activations, so we
+  // always use BroadcastAdd.
   if (kernel_type == kReference) {
-    TF_LITE_ADD(reference_ops);
+    TF_LITE_ADD(reference_ops, BroadcastAdd);
   } else {
-    TF_LITE_ADD(optimized_ops);
+    TF_LITE_ADD(optimized_ops, BroadcastAdd);
   }
 #undef TF_LITE_ADD
 }
@@ -132,15 +162,17 @@ void EvalAddQuantized(TfLiteContext* context, TfLiteNode* node,
 template <KernelType kernel_type>
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   auto* params = reinterpret_cast<TfLiteAddParams*>(node->builtin_data);
+  OpData* data = reinterpret_cast<OpData*>(node->user_data);
 
   TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
   TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
 
   if (output->type == kTfLiteFloat32) {
-    EvalAddFloat<kernel_type>(context, node, params, input1, input2, output);
+    EvalAddFloat<kernel_type>(context, node, params, data, input1, input2,
+                              output);
   } else if (output->type == kTfLiteUInt8) {
-    EvalAddQuantized<kernel_type>(context, node, params, input1, input2,
+    EvalAddQuantized<kernel_type>(context, node, params, data, input1, input2,
                                   output);
   } else {
     context->ReportError(context,
@@ -154,19 +186,19 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
 }  // namespace add
 
 TfLiteRegistration* Register_ADD_REF() {
-  static TfLiteRegistration r = {nullptr, nullptr, add::Prepare,
+  static TfLiteRegistration r = {add::Init, add::Free, add::Prepare,
                                  add::Eval<add::kReference>};
   return &r;
 }
 
 TfLiteRegistration* Register_ADD_GENERIC_OPT() {
-  static TfLiteRegistration r = {nullptr, nullptr, add::Prepare,
+  static TfLiteRegistration r = {add::Init, add::Free, add::Prepare,
                                  add::Eval<add::kGenericOptimized>};
   return &r;
 }
 
 TfLiteRegistration* Register_ADD_NEON_OPT() {
-  static TfLiteRegistration r = {nullptr, nullptr, add::Prepare,
+  static TfLiteRegistration r = {add::Init, add::Free, add::Prepare,
                                  add::Eval<add::kNeonOptimized>};
   return &r;
 }
diff --git a/tensorflow/contrib/lite/kernels/add_test.cc b/tensorflow/contrib/lite/kernels/add_test.cc
index 306dfc3e803d3df34061767ba9ced032299bfa26..956d05bed5162f6ce59705d59aad77ff056dda77 100644
--- a/tensorflow/contrib/lite/kernels/add_test.cc
+++ b/tensorflow/contrib/lite/kernels/add_test.cc
@@ -25,10 +25,11 @@ using ::testing::ElementsAreArray;
 
 class BaseAddOpModel : public SingleOpModel {
  public:
-  BaseAddOpModel(const TensorData& input, const TensorData& output,
+  BaseAddOpModel(const TensorData& input1, const TensorData& input2,
+                 const TensorData& output,
                  ActivationFunctionType activation_type) {
-    input1_ = AddInput(input);
-    input2_ = AddInput(input);
+    input1_ = AddInput(input1);
+    input2_ = AddInput(input2);
     output_ = AddOutput(output);
     SetBuiltinOp(BuiltinOperator_ADD, BuiltinOptions_AddOptions,
                  CreateAddOptions(builder_, activation_type).Union());
@@ -70,6 +71,7 @@ float GetTolerance(int min, int max) {
 
 TEST(FloatAddOpModel, NoActivation) {
   FloatAddOpModel m({TensorType_FLOAT32, {1, 2, 2, 1}},
+                    {TensorType_FLOAT32, {1, 2, 2, 1}},
                     {TensorType_FLOAT32, {}}, ActivationFunctionType_NONE);
   m.PopulateTensor<float>(m.input1(), {-2.0, 0.2, 0.7, 0.8});
   m.PopulateTensor<float>(m.input2(), {0.1, 0.2, 0.3, 0.5});
@@ -78,9 +80,9 @@ TEST(FloatAddOpModel, NoActivation) {
 }
 
 TEST(FloatAddOpModel, ActivationRELU_N1_TO_1) {
-  FloatAddOpModel m({TensorType_FLOAT32, {1, 2, 2, 1}},
-                    {TensorType_FLOAT32, {}},
-                    ActivationFunctionType_RELU_N1_TO_1);
+  FloatAddOpModel m(
+      {TensorType_FLOAT32, {1, 2, 2, 1}}, {TensorType_FLOAT32, {1, 2, 2, 1}},
+      {TensorType_FLOAT32, {}}, ActivationFunctionType_RELU_N1_TO_1);
   m.PopulateTensor<float>(m.input1(), {-2.0, 0.2, 0.7, 0.8});
   m.PopulateTensor<float>(m.input2(), {0.1, 0.2, 0.3, 0.5});
   m.Invoke();
@@ -92,6 +94,7 @@ TEST(FloatAddOpModel, VariousInputShapes) {
       {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
   for (int i = 0; i < test_shapes.size(); ++i) {
     FloatAddOpModel m({TensorType_FLOAT32, test_shapes[i]},
+                      {TensorType_FLOAT32, test_shapes[i]},
                       {TensorType_FLOAT32, {}}, ActivationFunctionType_NONE);
     m.PopulateTensor<float>(m.input1(), {-2.0, 0.2, 0.7, 0.8, 1.1, 2.0});
     m.PopulateTensor<float>(m.input2(), {0.1, 0.2, 0.3, 0.5, 1.1, 0.1});
@@ -102,6 +105,23 @@ TEST(FloatAddOpModel, VariousInputShapes) {
   }
 }
 
+TEST(FloatAddOpModel, WithBroadcast) {
+  std::vector<std::initializer_list<int>> test_shapes = {
+      {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
+  for (int i = 0; i < test_shapes.size(); ++i) {
+    FloatAddOpModel m({TensorType_FLOAT32, test_shapes[i]},
+                      {TensorType_FLOAT32, {}},  // always a scalar
+                      {TensorType_FLOAT32, {}}, ActivationFunctionType_NONE);
+    m.PopulateTensor<float>(m.input1(), {-2.0, 0.2, 0.7, 0.8, 1.1, 2.0});
+    m.PopulateTensor<float>(m.input2(), {0.1});
+    m.Invoke();
+    EXPECT_THAT(
+        m.GetOutput(),
+        ElementsAreArray(ArrayFloatNear({-1.9, 0.3, 0.8, 0.9, 1.2, 2.1})))
+        << "With shape number " << i;
+  }
+}
+
 TEST(QuantizedAddOpModel, QuantizedTestsNoActivation) {
   float kQuantizedTolerance = GetTolerance(-1.0, 1.0);
   std::vector<std::initializer_list<float>> inputs1 = {
@@ -112,6 +132,7 @@ TEST(QuantizedAddOpModel, QuantizedTestsNoActivation) {
       {0.7, 0.6, 0.6, 0.5}, {-0.2, 0.6, 0.9, -0.1}, {-0.2, 0.6, -0.1, 0.8}};
   for (int i = 0; i < inputs1.size(); ++i) {
     QuantizedAddOpModel m({TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
+                          {TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
                           {TensorType_UINT8, {}, -1.0, 1.0},
                           ActivationFunctionType_NONE);
     m.QuantizeAndPopulate<uint8_t>(m.input1(), inputs1[i]);
@@ -133,6 +154,7 @@ TEST(QuantizedAddOpModel, QuantizedTestsActivationRELU_N1_TO_1) {
                                                        {-0.2, 0.6, -0.1, 0.8}};
   for (int i = 0; i < inputs1.size(); ++i) {
     QuantizedAddOpModel m({TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
+                          {TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
                           {TensorType_UINT8, {}, -1.0, 1.0},
                           ActivationFunctionType_RELU_N1_TO_1);
     m.QuantizeAndPopulate<uint8_t>(m.input1(), inputs1[i]);
@@ -150,6 +172,7 @@ TEST(QuantizedAddOpModel, QuantizedVariousInputShapes) {
       {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
   for (int i = 0; i < test_shapes.size(); ++i) {
     QuantizedAddOpModel m({TensorType_UINT8, test_shapes[i], -3.0, 3.0},
+                          {TensorType_UINT8, test_shapes[i], -3.0, 3.0},
                           {TensorType_UINT8, {}, -3.0, 3.0},
                           ActivationFunctionType_NONE);
     m.QuantizeAndPopulate<uint8_t>(m.input1(), {-2.0, 0.2, 0.7, 0.8, 1.1, 2.0});
@@ -162,6 +185,25 @@ TEST(QuantizedAddOpModel, QuantizedVariousInputShapes) {
   }
 }
 
+TEST(QuantizedAddOpModel, QuantizedWithBroadcast) {
+  float kQuantizedTolerance = GetTolerance(-3.0, 3.0);
+  std::vector<std::initializer_list<int>> test_shapes = {
+      {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
+  for (int i = 0; i < test_shapes.size(); ++i) {
+    QuantizedAddOpModel m({TensorType_UINT8, test_shapes[i], -3.0, 3.0},
+                          {TensorType_UINT8, {}, -3.0, 3.0},
+                          {TensorType_UINT8, {}, -3.0, 3.0},
+                          ActivationFunctionType_NONE);
+    m.QuantizeAndPopulate<uint8_t>(m.input1(), {-2.0, 0.2, 0.7, 0.8, 1.1, 2.0});
+    m.QuantizeAndPopulate<uint8_t>(m.input2(), {0.1});
+    m.Invoke();
+    EXPECT_THAT(m.GetDequantizedOutput(),
+                ElementsAreArray(ArrayFloatNear({-1.9, 0.3, 0.8, 0.9, 1.2, 2.1},
+                                                kQuantizedTolerance)))
+        << "With shape number " << i;
+  }
+}
+
 }  // namespace
 }  // namespace tflite
 int main(int argc, char** argv) {
diff --git a/tensorflow/contrib/lite/kernels/basic_rnn.cc b/tensorflow/contrib/lite/kernels/basic_rnn.cc
index 3cee43c68b2a0af5a3fd84b33a980b74bb8f0cb4..a0391e030f69e59ed0ecd76ab4ec287a0c415b7a 100644
--- a/tensorflow/contrib/lite/kernels/basic_rnn.cc
+++ b/tensorflow/contrib/lite/kernels/basic_rnn.cc
@@ -15,8 +15,8 @@ limitations under the License.
 #include <unistd.h>
 #include <cassert>
 #include <cmath>
-#include <cstdlib>
 #include <cstdio>
+#include <cstdlib>
 #include <iostream>
 #include <limits>
 
@@ -76,8 +76,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TfLiteIntArray* output_size_array = TfLiteIntArrayCreate(2);
   output_size_array->data[0] = batch_size;
   output_size_array->data[1] = num_units;
-  TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, output,
-                                                   output_size_array));
+  TF_LITE_ENSURE_OK(context,
+                    context->ResizeTensor(context, output, output_size_array));
 
   return kTfLiteOk;
 }
diff --git a/tensorflow/contrib/lite/kernels/basic_rnn_test.cc b/tensorflow/contrib/lite/kernels/basic_rnn_test.cc
index 5ecccb985e91238f1183c8f94a2b5f468758ce55..fa7ef525db47c93f98951604cd04da66196422d7 100644
--- a/tensorflow/contrib/lite/kernels/basic_rnn_test.cc
+++ b/tensorflow/contrib/lite/kernels/basic_rnn_test.cc
@@ -14,8 +14,8 @@ limitations under the License.
 ==============================================================================*/
 // Unit test for TFLite RNN op.
 
-#include <vector>
 #include <iomanip>
+#include <vector>
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
@@ -120,8 +120,7 @@ static float rnn_golden_output[] = {
 
     0.415153,   0.210318,   0,         0,         0,         0,
     0,          2.02616,    0,         0.728256,  0.84183,   0.0907453,
-    0.628881,   3.58099,    1.49974,   0
-};
+    0.628881,   3.58099,    1.49974,   0};
 
 class RNNOpModel : public SingleOpModel {
  public:
diff --git a/tensorflow/contrib/lite/kernels/batch_to_space_nd.cc b/tensorflow/contrib/lite/kernels/batch_to_space_nd.cc
index 0eed680fdcc2afc4bc72be55a5e7722310fa4538..889239f93215a309d5434b209ebfc1f584c47849 100644
--- a/tensorflow/contrib/lite/kernels/batch_to_space_nd.cc
+++ b/tensorflow/contrib/lite/kernels/batch_to_space_nd.cc
@@ -35,12 +35,14 @@ enum KernelType {
 
 struct BatchToSpaceNDContext {
   BatchToSpaceNDContext(TfLiteContext* context, TfLiteNode* node) {
-    params = reinterpret_cast<TfLiteBatchToSpaceNDParams*>(node->builtin_data);
     input = GetInput(context, node, 0);
+    block_shape = GetInput(context, node, 1);
+    crops = GetInput(context, node, 2);
     output = GetOutput(context, node, 0);
   }
-  TfLiteBatchToSpaceNDParams* params;
   TfLiteTensor* input;
+  TfLiteTensor* block_shape;
+  TfLiteTensor* crops;
   TfLiteTensor* output;
 };
 
@@ -48,23 +50,28 @@ struct BatchToSpaceNDContext {
 // The 4D array need to have exactly 2 spatial dimensions.
 // TODO(ycling): Support arbitrary dimension in BatchToSpaceND.
 const int kInputDimensionNum = 4;
-const int kOutputDimensionNum = 4;
+const int kBlockSizeDimensionNum = 1;
 const int kSpatialDimensionNum = 2;
 
-TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
-  // The 2nd tensor (block_shape) and the 3rd tensor (crops) are ignored now.
-  TF_LITE_ENSURE(context, NumInputs(node) >= 1 && NumInputs(node) <= 3);
-  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+TfLiteStatus ResizeOutputTensor(TfLiteContext* context,
+                                BatchToSpaceNDContext* op_context) {
+  TfLiteIntArray* input_size = op_context->input->dims;
+  const int* block_shape = GetTensorData<int32>(op_context->block_shape);
+  const int* crops = GetTensorData<int32>(op_context->crops);
 
-  BatchToSpaceNDContext op_context(context, node);
-  TF_LITE_ENSURE_EQ(context, NumDimensions(op_context.input),
-                    kInputDimensionNum);
-  TF_LITE_ENSURE_EQ(context, op_context.params->num_spatial_dimensions,
+  TF_LITE_ENSURE_EQ(context, NumDimensions(op_context->block_shape),
+                    kBlockSizeDimensionNum);
+  TF_LITE_ENSURE_EQ(context, op_context->block_shape->dims->data[0],
+                    kSpatialDimensionNum);
+  TF_LITE_ENSURE_EQ(context, NumDimensions(op_context->crops),
                     kSpatialDimensionNum);
-  TF_LITE_ENSURE_EQ(context, op_context.input->type, op_context.output->type);
 
-  const TfLiteIntArray* input_size = op_context.input->dims;
-  const int* block_shape = op_context.params->block_shape;
+  // TODO(ycling): Add crops as part of calculation. Remove check for a crops
+  // containing all zeroes.
+  TF_LITE_ENSURE_EQ(context, crops[0], 0);
+  TF_LITE_ENSURE_EQ(context, crops[1], 0);
+  TF_LITE_ENSURE_EQ(context, crops[2], 0);
+  TF_LITE_ENSURE_EQ(context, crops[3], 0);
 
   // Number of batch must be multiple of (block_shape[0] * block_shape[1]).
   TF_LITE_ENSURE_EQ(context,
@@ -76,27 +83,48 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   const int output_width = input_size->data[2] * block_shape[1];
   const int output_channel_size = input_size->data[3];
 
-  TfLiteIntArray* output_size = TfLiteIntArrayCreate(kOutputDimensionNum);
+  TfLiteIntArray* output_size = TfLiteIntArrayCopy(input_size);
   output_size->data[0] = output_batch_size;
   output_size->data[1] = output_height;
   output_size->data[2] = output_width;
   output_size->data[3] = output_channel_size;
 
-  return context->ResizeTensor(context, op_context.output, output_size);
+  return context->ResizeTensor(context, op_context->output, output_size);
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 3);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  BatchToSpaceNDContext op_context(context, node);
+  TF_LITE_ENSURE_EQ(context, NumDimensions(op_context.input),
+                    kInputDimensionNum);
+  TF_LITE_ENSURE_EQ(context, op_context.input->type, op_context.output->type);
+
+  if (!IsConstantTensor(op_context.block_shape) ||
+      !IsConstantTensor(op_context.crops)) {
+    SetTensorToDynamic(op_context.output);
+    return kTfLiteOk;
+  }
+  return ResizeOutputTensor(context, &op_context);
 }
 
 template <KernelType kernel_type>
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   BatchToSpaceNDContext op_context(context, node);
 
-  int block_shape_dims_array[1] = {kSpatialDimensionNum};
-  Dims<4> block_shape_dims = GetTensorDims(block_shape_dims_array, 1);
+  // Resize the output tensor if the output tensor is dynamic.
+  if (IsDynamicTensor(op_context.output)) {
+    TF_LITE_ENSURE_OK(context, ResizeOutputTensor(context, &op_context));
+    TfLiteTensorRealloc(op_context.output->bytes, op_context.output);
+  }
 
-#define TF_LITE_BATCH_TO_SPACE_ND(type, scalar)                          \
-  type::BatchToSpaceND(GetTensorData<scalar>(op_context.input),          \
-                       GetTensorDims(op_context.input),                  \
-                       op_context.params->block_shape, block_shape_dims, \
-                       GetTensorData<scalar>(op_context.output),         \
+#define TF_LITE_BATCH_TO_SPACE_ND(type, scalar)                        \
+  type::BatchToSpaceND(GetTensorData<scalar>(op_context.input),        \
+                       GetTensorDims(op_context.input),                \
+                       GetTensorData<int32_t>(op_context.block_shape), \
+                       GetTensorDims(op_context.block_shape),          \
+                       GetTensorData<scalar>(op_context.output),       \
                        GetTensorDims(op_context.output))
   switch (op_context.input->type) {  // Already know in/out types are same.
     case kTfLiteFloat32:
diff --git a/tensorflow/contrib/lite/kernels/batch_to_space_nd_test.cc b/tensorflow/contrib/lite/kernels/batch_to_space_nd_test.cc
index 3ec4efbebcef9d55d0042d93007018c9f6ee3b58..8485cde1b40066f2070855bca91ea78a9f80e83c 100644
--- a/tensorflow/contrib/lite/kernels/batch_to_space_nd_test.cc
+++ b/tensorflow/contrib/lite/kernels/batch_to_space_nd_test.cc
@@ -26,36 +26,76 @@ using ::testing::ElementsAreArray;
 
 class BatchToSpaceNDOpModel : public SingleOpModel {
  public:
-  BatchToSpaceNDOpModel(std::initializer_list<int> input_shape,
-                        std::initializer_list<int> block_shape,
-                        std::initializer_list<int> before_crops,
-                        std::initializer_list<int> after_crops) {
-    input_ = AddInput(TensorType_FLOAT32);
-    output_ = AddOutput(TensorType_FLOAT32);
-    SetBuiltinOp(BuiltinOperator_BATCH_TO_SPACE_ND,
-                 BuiltinOptions_BatchToSpaceNDOptions,
-                 CreateBatchToSpaceNDOptions(
-                     builder_, builder_.CreateVector<int>(block_shape),
-                     builder_.CreateVector<int>(before_crops),
-                     builder_.CreateVector<int>(after_crops))
-                     .Union());
-    BuildInterpreter({input_shape});
-  }
-
   void SetInput(std::initializer_list<float> data) {
     PopulateTensor<float>(input_, data);
   }
 
+  void SetBlockShape(std::initializer_list<int> data) {
+    PopulateTensor<int>(block_shape_, data);
+  }
+
+  void SetCrops(std::initializer_list<int> data) {
+    PopulateTensor<int>(crops_, data);
+  }
+
   std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
   std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
 
- private:
+ protected:
   int input_;
+  int block_shape_;
+  int crops_;
   int output_;
 };
 
-TEST(BatchToSpaceNDOpTest, SimpleTest) {
-  BatchToSpaceNDOpModel m({4, 2, 2, 1}, {2, 2}, {0, 0}, {0, 0});
+// Tests case where block_shape and crops are const tensors.
+//
+// Example usage is as follows:
+//    BatchToSpaceNDOpConstModel m(input_shape, block_shape, crops);
+//    m.SetInput(input_data);
+//    m.Invoke();
+class BatchToSpaceNDOpConstModel : public BatchToSpaceNDOpModel {
+ public:
+  BatchToSpaceNDOpConstModel(std::initializer_list<int> input_shape,
+                             std::initializer_list<int> block_shape,
+                             std::initializer_list<int> crops) {
+    input_ = AddInput(TensorType_FLOAT32);
+    block_shape_ = AddConstInput(TensorType_INT32, block_shape, {2});
+    crops_ = AddConstInput(TensorType_INT32, crops, {2, 2});
+    output_ = AddOutput(TensorType_FLOAT32);
+
+    SetBuiltinOp(BuiltinOperator_BATCH_TO_SPACE_ND,
+                 BuiltinOptions_BatchToSpaceNDOptions,
+                 CreateBatchToSpaceNDOptions(builder_).Union());
+    BuildInterpreter({input_shape});
+  }
+};
+
+// Tests case where block_shape and crops are non-const tensors.
+//
+// Example usage is as follows:
+//    BatchToSpaceNDOpDynamicModel m(input_shape);
+//    m.SetInput(input_data);
+//    m.SetBlockShape(block_shape);
+//    m.SetPaddings(crops);
+//    m.Invoke();
+class BatchToSpaceNDOpDynamicModel : public BatchToSpaceNDOpModel {
+ public:
+  BatchToSpaceNDOpDynamicModel(std::initializer_list<int> input_shape) {
+    input_ = AddInput(TensorType_FLOAT32);
+    block_shape_ = AddInput(TensorType_INT32);
+    crops_ = AddInput(TensorType_INT32);
+    output_ = AddOutput(TensorType_FLOAT32);
+
+    SetBuiltinOp(BuiltinOperator_BATCH_TO_SPACE_ND,
+                 BuiltinOptions_BatchToSpaceNDOptions,
+                 CreateBatchToSpaceNDOptions(builder_).Union());
+    BuildInterpreter({input_shape, {2}, {2, 2}});
+  }
+};
+
+TEST(BatchToSpaceNDOpTest, SimpleConstTest) {
+  BatchToSpaceNDOpConstModel m({4, 2, 2, 1}, {2, 2}, {0, 0, 0, 0});
   m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 4, 1}));
@@ -63,11 +103,35 @@ TEST(BatchToSpaceNDOpTest, SimpleTest) {
                                                4, 8, 11, 15, 12, 16}));
 }
 
+TEST(BatchToSpaceNDOpTest, SimpleDynamicTest) {
+  BatchToSpaceNDOpDynamicModel m({4, 2, 2, 1});
+  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
+  m.SetBlockShape({2, 2});
+  m.SetCrops({0, 0, 0, 0});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 4, 1}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 5, 2, 6, 9, 13, 10, 14, 3, 7,
+                                               4, 8, 11, 15, 12, 16}));
+}
+
 TEST(BatchToSpaceNDOpTest, InvalidShapeTest) {
-  EXPECT_DEATH(BatchToSpaceNDOpModel({3, 2, 2, 1}, {2, 2}, {0, 0}, {0, 0}),
+  EXPECT_DEATH(BatchToSpaceNDOpConstModel({3, 2, 2, 1}, {2, 2}, {0, 0, 0, 0}),
                "Cannot allocate tensors");
 }
 
+TEST(BatchToSpaceNDOpTest, InvalidCropsConstTest) {
+  EXPECT_DEATH(BatchToSpaceNDOpConstModel({3, 2, 2, 1}, {2, 2}, {0, 0, 0, 1}),
+               "1 != 0");
+}
+
+TEST(BatchToSpaceNDOpTest, InvalidCropsDynamicTest) {
+  BatchToSpaceNDOpDynamicModel m({4, 2, 2, 1});
+  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
+  m.SetBlockShape({2, 2});
+  m.SetCrops({0, 0, 1, 0});
+  EXPECT_DEATH(m.Invoke(), "1 != 0");
+}
+
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/contrib/lite/kernels/bidirectional_sequence_rnn.cc b/tensorflow/contrib/lite/kernels/bidirectional_sequence_rnn.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f54081623578a7b1f37de8d9f111d7950c9e2757
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/bidirectional_sequence_rnn.cc
@@ -0,0 +1,249 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <unistd.h>
+#include <cassert>
+#include <cmath>
+#include <cstdlib>
+#include <cstdio>
+#include <iostream>
+#include <limits>
+
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/activation_functor.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace bidirectional_sequence_rnn {
+
+constexpr int kInputTensor = 0;
+// Forward and backward cell tensors.
+constexpr int kFwWeightsTensor = 1;
+constexpr int kFwRecurrentWeightsTensor = 2;
+constexpr int kFwBiasTensor = 3;
+constexpr int kBwWeightsTensor = 4;
+constexpr int kBwRecurrentWeightsTensor = 5;
+constexpr int kBwBiasTensor = 6;
+// State and output tensors.
+constexpr int kFwHiddenStateTensor = 0;
+constexpr int kFwOutputTensor = 1;
+constexpr int kBwHiddenStateTensor = 2;
+constexpr int kBwOutputTensor = 3;
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  // Check we have all the inputs and outputs we need.
+  TF_LITE_ENSURE_EQ(context, node->inputs->size, 7);
+  TF_LITE_ENSURE_EQ(context, node->outputs->size, 4);
+
+  TfLiteTensor* input = &context->tensors[node->inputs->data[kInputTensor]];
+  TfLiteTensor* fw_input_weights =
+      &context->tensors[node->inputs->data[kFwWeightsTensor]];
+  TfLiteTensor* fw_recurrent_weights =
+      &context->tensors[node->inputs->data[kFwRecurrentWeightsTensor]];
+  TfLiteTensor* fw_bias = &context->tensors[node->inputs->data[kFwBiasTensor]];
+  TfLiteTensor* bw_input_weights =
+      &context->tensors[node->inputs->data[kBwWeightsTensor]];
+  TfLiteTensor* bw_recurrent_weights =
+      &context->tensors[node->inputs->data[kBwRecurrentWeightsTensor]];
+  TfLiteTensor* bw_bias = &context->tensors[node->inputs->data[kBwBiasTensor]];
+
+  // Check all the parameters of tensor match within themselves and match the
+  // input configuration.
+  const int batch_size = input->dims->data[0];
+  const int max_time = input->dims->data[1];
+  const int fw_num_units = fw_input_weights->dims->data[0];
+  const int bw_num_units = bw_input_weights->dims->data[0];
+  TF_LITE_ASSERT_EQ(input->dims->data[2], fw_input_weights->dims->data[1]);
+  TF_LITE_ASSERT_EQ(input->dims->data[2], bw_input_weights->dims->data[1]);
+  TF_LITE_ASSERT_EQ(fw_input_weights->dims->data[0], fw_bias->dims->data[0]);
+  TF_LITE_ASSERT_EQ(bw_input_weights->dims->data[0], bw_bias->dims->data[0]);
+  TF_LITE_ASSERT_EQ(fw_recurrent_weights->dims->data[0],
+                    fw_bias->dims->data[0]);
+  TF_LITE_ASSERT_EQ(bw_recurrent_weights->dims->data[1],
+                    bw_bias->dims->data[0]);
+
+  TfLiteTensor* fw_output =
+      &context->tensors[node->outputs->data[kFwOutputTensor]];
+  TfLiteTensor* bw_output =
+      &context->tensors[node->outputs->data[kBwOutputTensor]];
+
+  // Resize hidden states.
+  TfLiteIntArray* fw_hidden_state_size_array = TfLiteIntArrayCreate(2);
+  fw_hidden_state_size_array->data[0] = batch_size;
+  fw_hidden_state_size_array->data[1] = fw_num_units;
+  TfLiteTensor* fw_hidden_state =
+      &context->tensors[node->outputs->data[kFwHiddenStateTensor]];
+  TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, fw_hidden_state,
+                                                   fw_hidden_state_size_array));
+
+  TfLiteIntArray* bw_hidden_state_size_array = TfLiteIntArrayCreate(2);
+  bw_hidden_state_size_array->data[0] = batch_size;
+  bw_hidden_state_size_array->data[1] = fw_num_units;
+  TfLiteTensor* bw_hidden_state =
+      &context->tensors[node->outputs->data[kBwHiddenStateTensor]];
+  TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, bw_hidden_state,
+                                                   bw_hidden_state_size_array));
+
+  // Mark hidden states as a persistent tensor.
+  fw_hidden_state->allocation_type = kTfLiteArenaRwPersistent;
+  bw_hidden_state->allocation_type = kTfLiteArenaRwPersistent;
+
+  // Resize outputs.
+  TfLiteIntArray* fw_output_size_array = TfLiteIntArrayCreate(3);
+  fw_output_size_array->data[0] = batch_size;
+  fw_output_size_array->data[1] = max_time;
+  fw_output_size_array->data[2] = fw_num_units;
+  TF_LITE_ENSURE_OK(
+      context, context->ResizeTensor(context, fw_output, fw_output_size_array));
+  TfLiteIntArray* bw_output_size_array = TfLiteIntArrayCreate(3);
+  bw_output_size_array->data[0] = batch_size;
+  bw_output_size_array->data[1] = max_time;
+  bw_output_size_array->data[2] = bw_num_units;
+  TF_LITE_ENSURE_OK(
+      context, context->ResizeTensor(context, bw_output, bw_output_size_array));
+
+  return kTfLiteOk;
+}
+
+namespace {
+// Performs one RNN computation step for the input specified by input_ptr_batch.
+// The RNN cell is specified by the pointers to its weights and biases, along
+// with the input size, number of units, strides, activation.
+// The pointers to the hidden state and the output are updated as a result.
+// TODO(mirkov): factor out this function to a shared library.
+void RnnStep(const float* input_ptr_batch, const float* input_weights_ptr,
+             const float* recurrent_weights_ptr, const float* bias_ptr,
+             int input_size, int num_units, int input_weights_stride,
+             int recurrent_weights_stride, TfLiteFusedActivation activation,
+             float* hidden_state_ptr_batch, float* output_ptr_batch) {
+  // Output = bias
+  for (int o = 0; o < num_units; o++) {
+    output_ptr_batch[o] = bias_ptr[o];
+  }
+
+  // Output += input * input_weights
+  for (int o = 0; o < num_units; o++) {
+    for (int i = 0; i < input_size; i++) {
+      output_ptr_batch[o] += input_ptr_batch[i] * input_weights_ptr[i];
+    }
+    input_weights_ptr += input_weights_stride;
+  }
+
+  // Output += recurrent_weights * hidden_state
+  for (int o = 0; o < num_units; o++) {
+    for (int h = 0; h < num_units; h++) {
+      output_ptr_batch[o] +=
+          hidden_state_ptr_batch[h] * recurrent_weights_ptr[h];
+    }
+    recurrent_weights_ptr += recurrent_weights_stride;
+  }
+
+  // Output = activation(Output) and update hidden_state
+  for (int o = 0; o < num_units; o++) {
+    output_ptr_batch[o] = (ActivationFunctor(activation))(output_ptr_batch[o]);
+    hidden_state_ptr_batch[o] = output_ptr_batch[o];
+  }
+}
+}  // namespace
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  auto* params = reinterpret_cast<TfLiteSequenceRNNParams*>(node->builtin_data);
+
+  TfLiteTensor* input = &context->tensors[node->inputs->data[kInputTensor]];
+  TfLiteTensor* fw_input_weights =
+      &context->tensors[node->inputs->data[kFwWeightsTensor]];
+  TfLiteTensor* fw_recurrent_weights =
+      &context->tensors[node->inputs->data[kFwRecurrentWeightsTensor]];
+  TfLiteTensor* fw_bias = &context->tensors[node->inputs->data[kFwBiasTensor]];
+  TfLiteTensor* fw_hidden_state =
+      &context->tensors[node->outputs->data[kFwHiddenStateTensor]];
+  TfLiteTensor* fw_output =
+      &context->tensors[node->outputs->data[kFwOutputTensor]];
+
+  TfLiteTensor* bw_input_weights =
+      &context->tensors[node->inputs->data[kBwWeightsTensor]];
+  TfLiteTensor* bw_recurrent_weights =
+      &context->tensors[node->inputs->data[kBwRecurrentWeightsTensor]];
+  TfLiteTensor* bw_bias = &context->tensors[node->inputs->data[kBwBiasTensor]];
+  TfLiteTensor* bw_hidden_state =
+      &context->tensors[node->outputs->data[kBwHiddenStateTensor]];
+  TfLiteTensor* bw_output =
+      &context->tensors[node->outputs->data[kBwOutputTensor]];
+
+  const int batch_size = input->dims->data[0];
+  const int max_time = input->dims->data[1];
+  const int input_size = input->dims->data[2];
+
+  const int fw_num_units = fw_input_weights->dims->data[0];
+  const int fw_input_weights_stride = fw_input_weights->dims->data[1];
+  const int fw_recurrent_weights_stride = fw_recurrent_weights->dims->data[1];
+  const float* fw_bias_ptr = fw_bias->data.f;
+  const float* fw_input_weights_ptr = fw_input_weights->data.f;
+  const float* fw_recurrent_weights_ptr = fw_recurrent_weights->data.f;
+
+  const int bw_num_units = bw_input_weights->dims->data[0];
+  const int bw_input_weights_stride = bw_input_weights->dims->data[1];
+  const int bw_recurrent_weights_stride = bw_recurrent_weights->dims->data[1];
+  const float* bw_bias_ptr = bw_bias->data.f;
+  const float* bw_input_weights_ptr = bw_input_weights->data.f;
+  const float* bw_recurrent_weights_ptr = bw_recurrent_weights->data.f;
+
+  for (int b = 0; b < batch_size; b++) {
+    // Forward cell.
+    float* fw_hidden_state_ptr_batch =
+        fw_hidden_state->data.f + b * fw_num_units;
+    for (int s = 0; s < max_time; s++) {
+      const float* input_ptr_batch =
+          input->data.f + b * input_size * max_time + s * input_size;
+      float* output_ptr_batch =
+          fw_output->data.f + b * fw_num_units * max_time + s * fw_num_units;
+
+      RnnStep(input_ptr_batch, fw_input_weights_ptr, fw_recurrent_weights_ptr,
+              fw_bias_ptr, input_size, fw_num_units, fw_input_weights_stride,
+              fw_recurrent_weights_stride, params->activation,
+              fw_hidden_state_ptr_batch, output_ptr_batch);
+    }
+    // Backward cell.
+    float* bw_hidden_state_ptr_batch =
+        bw_hidden_state->data.f + b * bw_num_units;
+    for (int s = max_time - 1; s >= 0; s--) {
+      const float* input_ptr_batch =
+          input->data.f + b * input_size * max_time + s * input_size;
+      float* output_ptr_batch =
+          bw_output->data.f + b * bw_num_units * max_time + s * bw_num_units;
+
+      RnnStep(input_ptr_batch, bw_input_weights_ptr, bw_recurrent_weights_ptr,
+              bw_bias_ptr, input_size, bw_num_units, bw_input_weights_stride,
+              bw_recurrent_weights_stride, params->activation,
+              bw_hidden_state_ptr_batch, output_ptr_batch);
+    }
+  }
+  return kTfLiteOk;
+}
+
+}  // namespace bidirectional_sequence_rnn
+
+TfLiteRegistration* Register_BIDIRECTIONAL_SEQUENCE_RNN() {
+  static TfLiteRegistration r = {/*init=*/nullptr, /*free=*/nullptr,
+                                 bidirectional_sequence_rnn::Prepare,
+                                 bidirectional_sequence_rnn::Eval};
+  return &r;
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/bidirectional_sequence_rnn_test.cc b/tensorflow/contrib/lite/kernels/bidirectional_sequence_rnn_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..12f4ff97cfd90e3a6894a24d15fcbc356f96cde2
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/bidirectional_sequence_rnn_test.cc
@@ -0,0 +1,931 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// Unit test for TFLite Bidirectional RNN op.
+
+#include <vector>
+#include <iomanip>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+static float rnn_input[] = {
+    0.23689353,   0.285385,     0.037029743, -0.19858193,  -0.27569133,
+    0.43773448,   0.60379338,   0.35562468,  -0.69424844,  -0.93421471,
+    -0.87287879,  0.37144363,   -0.62476718, 0.23791671,   0.40060222,
+    0.1356622,    -0.99774903,  -0.98858172, -0.38952237,  -0.47685933,
+    0.31073618,   0.71511042,   -0.63767755, -0.31729108,  0.33468103,
+    0.75801885,   0.30660987,   -0.37354088, 0.77002847,   -0.62747043,
+    -0.68572164,  0.0069220066, 0.65791464,  0.35130811,   0.80834007,
+    -0.61777675,  -0.21095741,  0.41213346,  0.73784804,   0.094794154,
+    0.47791874,   0.86496925,   -0.53376222, 0.85315156,   0.10288584,
+    0.86684,      -0.011186242, 0.10513687,  0.87825835,   0.59929144,
+    0.62827742,   0.18899453,   0.31440187,  0.99059987,   0.87170351,
+    -0.35091716,  0.74861872,   0.17831337,  0.2755419,    0.51864719,
+    0.55084288,   0.58982027,   -0.47443086, 0.20875752,   -0.058871567,
+    -0.66609079,  0.59098077,   0.73017097,  0.74604273,   0.32882881,
+    -0.17503482,  0.22396147,   0.19379807,  0.29120302,   0.077113032,
+    -0.70331609,  0.15804303,   -0.93407321, 0.40182066,   0.036301374,
+    0.66521823,   0.0300982,    -0.7747041,  -0.02038002,  0.020698071,
+    -0.90300065,  0.62870288,   -0.23068321, 0.27531278,   -0.095755219,
+    -0.712036,    -0.17384434,  -0.50593495, -0.18646687,  -0.96508682,
+    0.43519354,   0.14744234,   0.62589407,  0.1653645,    -0.10651493,
+    -0.045277178, 0.99032974,   -0.88255352, -0.85147917,  0.28153265,
+    0.19455957,   -0.55479527,  -0.56042433, 0.26048636,   0.84702539,
+    0.47587705,   -0.074295521, -0.12287641, 0.70117295,   0.90532446,
+    0.89782166,   0.79817224,   0.53402734,  -0.33286154,  0.073485017,
+    -0.56172788,  -0.044897556, 0.89964068,  -0.067662835, 0.76863563,
+    0.93455386,   -0.6324693,   -0.083922029};
+
+static float rnn_golden_fw_output[] = {
+    0.496726,   0,          0.965996,  0,         0.0584254, 0,
+    0,          0.12315,    0,         0,         0.612266,  0.456601,
+    0,          0.52286,    1.16099,   0.0291232,
+
+    0,          0,          0.524901,  0,         0,         0,
+    0,          1.02116,    0,         1.35762,   0,         0.356909,
+    0.436415,   0.0355727,  0,         0,
+
+    0,          0,          0,         0.262335,  0,         0,
+    0,          1.33992,    0,         2.9739,    0,         0,
+    1.31914,    2.66147,    0,         0,
+
+    0.942568,   0,          0,         0,         0.025507,  0,
+    0,          0,          0.321429,  0.569141,  1.25274,   1.57719,
+    0.8158,     1.21805,    0.586239,  0.25427,
+
+    1.04436,    0,          0.630725,  0,         0.133801,  0.210693,
+    0.363026,   0,          0.533426,  0,         1.25926,   0.722707,
+    0,          1.22031,    1.30117,   0.495867,
+
+    0.222187,   0,          0.72725,   0,         0.767003,  0,
+    0,          0.147835,   0,         0,         0,         0.608758,
+    0.469394,   0.00720298, 0.927537,  0,
+
+    0.856974,   0.424257,   0,         0,         0.937329,  0,
+    0,          0,          0.476425,  0,         0.566017,  0.418462,
+    0.141911,   0.996214,   1.13063,   0,
+
+    0.967899,   0,          0,         0,         0.0831304, 0,
+    0,          1.00378,    0,         0,         0,         1.44818,
+    1.01768,    0.943891,   0.502745,  0,
+
+    0.940135,   0,          0,         0,         0,         0,
+    0,          2.13243,    0,         0.71208,   0.123918,  1.53907,
+    1.30225,    1.59644,    0.70222,   0,
+
+    0.804329,   0,          0.430576,  0,         0.505872,  0.509603,
+    0.343448,   0,          0.107756,  0.614544,  1.44549,   1.52311,
+    0.0454298,  0.300267,   0.562784,  0.395095,
+
+    0.228154,   0,          0.675323,  0,         1.70536,   0.766217,
+    0,          0,          0,         0.735363,  0.0759267, 1.91017,
+    0.941888,   0,          0,         0,
+
+    0,          0,          1.5909,    0,         0,         0,
+    0,          0.5755,     0,         0.184687,  0,         1.56296,
+    0.625285,   0,          0,         0,
+
+    0,          0,          0.0857888, 0,         0,         0,
+    0,          0.488383,   0.252786,  0,         0,         0,
+    1.02817,    1.85665,    0,         0,
+
+    0.00981836, 0,          1.06371,   0,         0,         0,
+    0,          0,          0,         0.290445,  0.316406,  0,
+    0.304161,   1.25079,    0.0707152, 0,
+
+    0.986264,   0.309201,   0,         0,         0,         0,
+    0,          1.64896,    0.346248,  0,         0.918175,  0.78884,
+    0.524981,   1.92076,    2.07013,   0.333244,
+
+    0.415153,   0.210318,   0,         0,         0,         0,
+    0,          2.02616,    0,         0.728256,  0.84183,   0.0907453,
+    0.628881,   3.58099,    1.49974,   0};
+
+static float rnn_golden_bw_output[] = {
+    0.496726, 0,          1.00883,   0,         0.0584256, 0,         0,
+    0.236412, 0,          0,         0.612267,  0.487726,  0,         0.54883,
+    1.16099,  0.0291233,  0,         0,         0.428302,  0,         0,
+    0,        0,          1.13262,   0,         1.64415,   0,         0.311249,
+    0.570804, 0.259696,   0,         0,         0,         0,         0,
+    0.262334, 0,          0,         0,         1.23781,   0,         2.86532,
+    0,        0,          1.34389,   2.76409,   0,         0,         1.03969,
+    0,        0.00410865, 0,         0.0470295, 0,         0,         0,
+    0.371556, 0.27175,    1.36614,   1.63956,   0.683887,  1.06176,   0.719552,
+    0.301314, 0.971195,   0,         0.697143,  0,         0.215219,  0.210693,
+    0.363027, 0,          0.501283,  0,         1.13399,   0.623774,  0,
+    1.09851,  1.33313,    0.470441,  0.210965,  0,         0.664178,  0,
+    0.839686, 0,          0,         0.147834,  0,         0,         0,
+    0.58786,  0.490128,   0,         0.905806,  0,         0.932134,  0.424257,
+    0,        0,          0.860629,  0,         0,         0,         0.476425,
+    0,        0.566017,   0.513721,  0.207341,  1.09508,   1.08385,   0,
+    0.973787, 0,          0,         0,         0,         0,         0,
+    1.20698,  0,          0,         0,         1.56135,   1.12369,   0.99588,
+    0.459803, 0,          0.915854,  0,         0,         0,         0,
+    0,        0,          2.03206,   0,         0.773264,  0.267228,  1.55012,
+    1.202,    1.51611,    0.701202,  0,         0.725088,  0,         0.509069,
+    0,        0.671349,   0.581129,  0.343447,  0,         0.107755,  0.611838,
+    1.4331,   1.55871,    0.015242,  0.140624,  0.492562,  0.395095,  0.147722,
+    0,        0.784925,   0,         1.65477,   0.715257,  0,         0,
+    0,        0.685024,   0,         1.89505,   1.00037,   0,         0,
+    0,        0,          0,         1.52659,   0,         0,         0,
+    0,        0.618583,   0,         0.11115,   0,         1.37194,   0.630225,
+    0,        0,          0,         0,         0,         0.0322124, 0,
+    0,        0,          0,         0.430834,  0.252786,  0,         0,
+    0,        0.991297,   1.98451,   0,         0,         0.111511,  0,
+    1.05513,  0,          0,         0,         0,         0,         0,
+    0.290445, 0.412559,   0.0429958, 0.256564,  1.27858,   0.289948,  0,
+    1.01693,  0.327141,   0,         0,         0,         0,         0,
+    1.83508,  0.346248,   0,         0.961535,  0.790026,  0.552203,  2.13457,
+    2.19233,  0.333244,   0.316526,  0.179398,  0,         0,         0,
+    0,        0,          1.86126,   0,         0.728256,  0.750013,  0.011861,
+    0.576383, 3.38891,    1.29273,   0};
+
+constexpr std::initializer_list<float> weights = {
+    0.461459,    0.153381,   0.529743,    -0.00371218, 0.676267,   -0.211346,
+    0.317493,    0.969689,   -0.343251,   0.186423,    0.398151,   0.152399,
+    0.448504,    0.317662,   0.523556,    -0.323514,   0.480877,   0.333113,
+    -0.757714,   -0.674487,  -0.643585,   0.217766,    -0.0251462, 0.79512,
+    -0.595574,   -0.422444,  0.371572,    -0.452178,   -0.556069,  -0.482188,
+    -0.685456,   -0.727851,  0.841829,    0.551535,    -0.232336,  0.729158,
+    -0.00294906, -0.69754,   0.766073,    -0.178424,   0.369513,   -0.423241,
+    0.548547,    -0.0152023, -0.757482,   -0.85491,    0.251331,   -0.989183,
+    0.306261,    -0.340716,  0.886103,    -0.0726757,  -0.723523,  -0.784303,
+    0.0354295,   0.566564,   -0.485469,   -0.620498,   0.832546,   0.697884,
+    -0.279115,   0.294415,   -0.584313,   0.548772,    0.0648819,  0.968726,
+    0.723834,    -0.0080452, -0.350386,   -0.272803,   0.115121,   -0.412644,
+    -0.824713,   -0.992843,  -0.592904,   -0.417893,   0.863791,   -0.423461,
+    -0.147601,   -0.770664,  -0.479006,   0.654782,    0.587314,   -0.639158,
+    0.816969,    -0.337228,  0.659878,    0.73107,     0.754768,   -0.337042,
+    0.0960841,   0.368357,   0.244191,    -0.817703,   -0.211223,  0.442012,
+    0.37225,     -0.623598,  -0.405423,   0.455101,    0.673656,   -0.145345,
+    -0.511346,   -0.901675,  -0.81252,    -0.127006,   0.809865,   -0.721884,
+    0.636255,    0.868989,   -0.347973,   -0.10179,    -0.777449,  0.917274,
+    0.819286,    0.206218,   -0.00785118, 0.167141,    0.45872,    0.972934,
+    -0.276798,   0.837861,   0.747958,    -0.0151566,  -0.330057,  -0.469077,
+    0.277308,    0.415818};
+
+static float endtoend_input[] = {
+    0.996808, 0.060710, 0.981855, 0.570017, 0.525164, 0.796859, 0.696547,
+    0.505925, 0.991844, 0.461208, 0.949371, 0.027624, 0.539236, 0.841854,
+    0.915222, 0.538569, 0.069375, 0.237905, 0.903700, 0.441703, 0.536196,
+    0.402724, 0.761635, 0.025063, 0.082592, 0.688245, 0.239310, 0.256931,
+    0.658900, 0.105695, 0.301983, 0.655708, 0.166405, 0.283837, 0.225725,
+    0.691569, 0.080696, 0.922272, 0.197494, 0.072540, 0.383481, 0.146865,
+    0.100163, 0.922717, 0.988720, 0.015386, 0.461286, 0.058095, 0.253290,
+    0.364986, 0.499797, 0.789487, 0.767709, 0.261433, 0.814549, 0.850302,
+    0.949678, 0.053859, 0.107233, 0.608577, 0.159554, 0.409215, 0.264285,
+    0.325960, 0.693053, 0.490011, 0.017529, 0.773749, 0.412283, 0.215023,
+    0.846288, 0.795764, 0.361889, 0.946452, 0.718481, 0.350608, 0.961837,
+    0.179767, 0.408703, 0.215128, 0.544753, 0.908500, 0.004614, 0.312462,
+    0.169933, 0.819163, 0.162764, 0.119611, 0.873022, 0.269997, 0.728188,
+    0.032576, 0.679212, 0.992474, 0.358536, 0.372265, 0.482484, 0.376065,
+    0.146014, 0.894767, 0.591088, 0.992302, 0.690531, 0.952977, 0.938754,
+    0.409012, 0.303585, 0.900591, 0.588780, 0.712287, 0.115719, 0.133533,
+    0.620788, 0.120334, 0.445995, 0.790720, 0.939497, 0.608759, 0.910331,
+    0.812519, 0.878756, 0.638519, 0.845096, 0.557968, 0.630993, 0.203632,
+    0.930233, 0.113477, 0.579697, 0.076247, 0.008244, 0.170785, 0.068549,
+    0.698776, 0.123761, 0.007303, 0.107788, 0.427346, 0.907894, 0.696568,
+    0.139633, 0.023613, 0.830100, 0.760421, 0.143947, 0.276096, 0.551141,
+    0.083444, 0.884855, 0.461472, 0.895963, 0.763611, 0.099992, 0.741059,
+    0.321579, 0.730984, 0.944691, 0.251812, 0.844461, 0.524388, 0.328059,
+    0.852706, 0.695172, 0.396607, 0.551482, 0.818934, 0.403910, 0.659270,
+    0.246280, 0.311804, 0.355838, 0.385913, 0.335418, 0.185938, 0.146334,
+    0.479364, 0.462034, 0.697475, 0.562808, 0.346888, 0.158948, 0.458771,
+    0.110499, 0.258939, 0.199830, 0.432078, 0.989924, 0.144521, 0.683890,
+    0.834385, 0.668908, 0.011949, 0.687091, 0.364081, 0.408556, 0.238572,
+    0.183015, 0.812466, 0.897842, 0.429294, 0.124271, 0.253680, 0.815207,
+    0.459688, 0.439618, 0.961541, 0.939053, 0.901651, 0.659016, 0.501861,
+    0.248539, 0.817964, 0.960632, 0.359038, 0.076903, 0.160462, 0.791117,
+    0.066826, 0.304983, 0.475007, 0.901211, 0.973891, 0.486955, 0.588302,
+    0.337972, 0.895512, 0.826874, 0.520987, 0.707978, 0.724716, 0.950281,
+    0.832249, 0.978396, 0.765488, 0.291937, 0.418014, 0.727029, 0.230990,
+    0.319665, 0.386045, 0.732850, 0.568204, 0.204009, 0.693482, 0.927242,
+    0.280912, 0.853944, 0.718359, 0.347738, 0.158927, 0.193366, 0.248950,
+    0.132818, 0.680321, 0.837252, 0.470790, 0.575833, 0.664126, 0.991777,
+    0.283811, 0.388843, 0.942058, 0.116060, 0.367239, 0.707546, 0.407997,
+    0.785253, 0.434575, 0.638986, 0.104917, 0.820620, 0.371837, 0.673121,
+    0.024629, 0.065319, 0.600363, 0.305541, 0.919263, 0.318722, 0.653279,
+    0.078190, 0.512088, 0.902229, 0.211009, 0.192409, 0.739480, 0.681799,
+    0.768242, 0.403607, 0.673576, 0.052052, 0.792450, 0.615634, 0.168112,
+    0.159689, 0.323180, 0.576109, 0.944941, 0.757755, 0.215095, 0.049858,
+    0.578375, 0.586932, 0.722979, 0.603003, 0.652251, 0.323343, 0.908544,
+    0.571514, 0.642065, 0.561823, 0.649704, 0.154153, 0.464051, 0.860713,
+    0.346562, 0.203532, 0.542512, 0.114804, 0.607139, 0.216088, 0.166856,
+    0.399588, 0.831722, 0.334968, 0.559277, 0.154902, 0.911077, 0.504218,
+    0.912656, 0.126172, 0.554076, 0.491031, 0.713104, 0.277055, 0.094034,
+    0.365355, 0.600398, 0.002578, 0.936869, 0.242463, 0.564401, 0.586574,
+    0.396616, 0.028452, 0.447287, 0.743178, 0.231984, 0.989799, 0.857982,
+    0.839122, 0.205887, 0.024838, 0.238711, 0.037608, 0.359806, 0.797987,
+    0.192510, 0.270883, 0.302205, 0.105166, 0.397055, 0.856281, 0.596197,
+    0.110160, 0.133336, 0.690231, 0.475515, 0.733734, 0.692809, 0.412384,
+    0.976196, 0.257209, 0.998958, 0.372812, 0.285661, 0.446245, 0.115990,
+    0.517645, 0.436044, 0.973972, 0.356767, 0.641930, 0.998810, 0.595478,
+    0.679539, 0.358617, 0.393465, 0.872049, 0.629500, 0.695670, 0.977215,
+    0.026555, 0.551951, 0.573412, 0.136715, 0.685287, 0.263643, 0.612229,
+    0.419020, 0.956451, 0.024613, 0.395216, 0.213661, 0.023572, 0.768029,
+    0.499322, 0.469816, 0.884019, 0.016967, 0.905860, 0.857991, 0.373734,
+    0.547791, 0.856802, 0.969211, 0.227330, 0.215418, 0.362676, 0.099378,
+    0.844918, 0.058346, 0.076594, 0.871473, 0.610297, 0.650006, 0.008188,
+    0.295583, 0.913648, 0.620417, 0.714603, 0.870100, 0.645031, 0.109820,
+    0.083760, 0.668602, 0.877849, 0.583082, 0.138419, 0.761868, 0.600049,
+    0.044279, 0.619859, 0.973783, 0.592069, 0.476661, 0.942994, 0.819399,
+    0.692079, 0.305670, 0.918778, 0.536997, 0.364016, 0.995371, 0.408470,
+    0.974313, 0.645377, 0.416658, 0.269896, 0.559025, 0.037075, 0.984499,
+    0.429125, 0.682105, 0.094319, 0.512885, 0.350707, 0.972168, 0.095967,
+    0.489126, 0.734035, 0.696016, 0.533405, 0.353894, 0.669799, 0.125474,
+    0.830555, 0.612793, 0.944873, 0.522634, 0.918463, 0.863651, 0.059631,
+    0.282479, 0.859022, 0.468101, 0.256791, 0.504398, 0.884758, 0.526687,
+    0.063423, 0.921833, 0.511186, 0.492548, 0.603939, 0.605505, 0.005433,
+    0.954646, 0.577673, 0.101400, 0.443772, 0.311708, 0.797417, 0.977176,
+    0.665602, 0.467216, 0.102650, 0.496157, 0.080009, 0.047524, 0.018791,
+    0.998471, 0.911174, 0.078422, 0.280950, 0.770196, 0.546523, 0.537741,
+    0.274594, 0.431281, 0.064428, 0.338017, 0.353115, 0.575615, 0.830565,
+    0.957053, 0.181120, 0.835998, 0.911699, 0.758793, 0.937398, 0.355471,
+    0.070501, 0.734815, 0.332647, 0.736103, 0.202031, 0.435297, 0.232261,
+    0.282039, 0.482821, 0.251052, 0.280511, 0.393995, 0.329474, 0.561460,
+    0.164191, 0.875997, 0.099202, 0.438785, 0.307278, 0.163630, 0.776802,
+    0.660393, 0.739244, 0.607367, 0.617446, 0.920364, 0.443365, 0.529145,
+    0.679157, 0.380763, 0.884616, 0.749658, 0.115578, 0.217263, 0.485761,
+    0.317609, 0.652560, 0.718021, 0.599648, 0.135381, 0.969073, 0.880159,
+    0.529376, 0.298547, 0.441619, 0.693567, 0.174544, 0.540821, 0.132351,
+    0.481822, 0.704450, 0.909153, 0.142215, 0.443695, 0.516520, 0.759661,
+    0.364059, 0.959885, 0.288806, 0.043216, 0.340648, 0.173422, 0.792874,
+    0.456226, 0.390685, 0.278634, 0.773834, 0.043245, 0.996656, 0.373483,
+    0.178625, 0.965729, 0.253641, 0.708001, 0.264276, 0.695260, 0.401568,
+    0.438820, 0.236081, 0.533919, 0.920642, 0.940531, 0.443072, 0.062857,
+    0.384226, 0.959592, 0.822518, 0.748285, 0.919477, 0.111325, 0.791501,
+    0.260124, 0.284747, 0.584375, 0.716350, 0.675431, 0.863009, 0.490184,
+    0.718676, 0.859665, 0.863666, 0.897301, 0.825393, 0.117308, 0.605302,
+    0.089669, 0.812568, 0.006870, 0.528489, 0.048649, 0.540788, 0.449131,
+    0.989180, 0.983860, 0.511988, 0.373407, 0.943452, 0.334506, 0.121692,
+    0.862929, 0.445831, 0.913193, 0.123053, 0.730578, 0.497568, 0.839402,
+    0.406009, 0.360577, 0.329586, 0.124685, 0.220241, 0.193253, 0.021986,
+    0.045634, 0.310560, 0.627288, 0.135303, 0.123128, 0.634158, 0.663792,
+    0.171777, 0.174946, 0.112923, 0.160958, 0.158806, 0.624911, 0.534364,
+    0.102259, 0.959418, 0.656056, 0.965187, 0.405249, 0.569249, 0.088240,
+    0.135827, 0.066817, 0.927642, 0.541836, 0.427393, 0.257229, 0.666520,
+    0.647634, 0.450481, 0.688506, 0.693269, 0.761042, 0.315794, 0.828572,
+    0.884170, 0.949952, 0.492364, 0.055947, 0.124898, 0.605288, 0.216905,
+    0.283705, 0.230199, 0.751269, 0.385963, 0.189616, 0.407326, 0.351151,
+    0.594865, 0.976575, 0.439391, 0.730692, 0.043392, 0.367033, 0.272527,
+    0.470785, 0.624261, 0.939048, 0.118419, 0.074743, 0.627554, 0.811688,
+    0.835784, 0.943348, 0.640260, 0.719954, 0.893300, 0.132625, 0.775901,
+    0.018199, 0.737913, 0.992806, 0.301903, 0.968111, 0.744076, 0.687867,
+    0.157728, 0.151401, 0.039017, 0.752593, 0.127976, 0.478408, 0.483284,
+    0.171368, 0.845441, 0.755811, 0.642153, 0.469702, 0.694859, 0.760572,
+    0.544445, 0.322413, 0.572260, 0.380229, 0.265761, 0.212521, 0.100183,
+    0.159062, 0.345146, 0.876084, 0.177261, 0.083058, 0.868891, 0.479164,
+    0.051169, 0.612966, 0.167030, 0.208897, 0.764367, 0.206048, 0.961490,
+    0.892343, 0.684456, 0.444774, 0.063711, 0.529896, 0.200585, 0.705863,
+    0.999598, 0.895444, 0.466435, 0.544043, 0.217857, 0.038696, 0.924272,
+    0.483618, 0.251217, 0.024455, 0.642680, 0.596362, 0.900539, 0.819941,
+    0.679420, 0.769430, 0.299105, 0.730590, 0.382396, 0.466135, 0.939487,
+    0.146763, 0.672183, 0.900977, 0.039106, 0.356638, 0.345750, 0.102817,
+    0.886535, 0.546336, 0.808681, 0.886133, 0.441780, 0.275116, 0.430176,
+    0.659637, 0.313812, 0.354448, 0.143255, 0.565028, 0.378903, 0.785935,
+    0.161391, 0.279443, 0.605876, 0.840811, 0.048873, 0.904980, 0.571401,
+    0.431269, 0.371115, 0.510887, 0.578032, 0.043298, 0.411864, 0.617138,
+    0.399936, 0.757614, 0.719955, 0.286471, 0.303950, 0.528636, 0.172604,
+    0.745730, 0.803752, 0.602780, 0.405367, 0.117564, 0.957228, 0.548622,
+    0.682592, 0.336131, 0.334557, 0.843983, 0.615574, 0.940433, 0.684794,
+    0.664447, 0.845413, 0.256194, 0.095715, 0.216529, 0.767082, 0.673747,
+    0.259827, 0.178946, 0.290885, 0.659763, 0.936560, 0.010840, 0.946234,
+    0.240510, 0.539476, 0.118838, 0.986240, 0.343228, 0.721618, 0.391606,
+    0.460792, 0.678846, 0.940228, 0.143384, 0.014977, 0.274785, 0.987367,
+    0.630551, 0.215218, 0.672161, 0.294998, 0.060631, 0.928355, 0.390713,
+    0.277160, 0.695436, 0.064460, 0.536987, 0.874382, 0.355345, 0.196751,
+    0.810942, 0.366185, 0.142985, 0.051452, 0.905661, 0.261823, 0.037691,
+    0.248889, 0.983441, 0.429297, 0.709681, 0.662286, 0.369525, 0.853066,
+    0.677263, 0.644310, 0.840433, 0.307814, 0.859528, 0.512593, 0.602812,
+    0.920160, 0.440948, 0.993525, 0.197320, 0.136384, 0.057984, 0.734307,
+    0.010766, 0.413329, 0.931058, 0.821707, 0.779514, 0.074043, 0.873159,
+    0.685175, 0.335865, 0.910850, 0.934065, 0.319306, 0.340147, 0.643746,
+    0.981592, 0.709673, 0.496812, 0.658856, 0.353983, 0.337245, 0.966670,
+    0.213511, 0.849838, 0.569482, 0.133671, 0.290786, 0.563007, 0.330991,
+    0.427170, 0.620991, 0.065299, 0.437936, 0.034320, 0.996356, 0.259643,
+    0.813834, 0.070399, 0.132802, 0.499009, 0.406265, 0.043652, 0.433074,
+    0.725570, 0.383800, 0.076820, 0.707163, 0.093473, 0.573632, 0.366018,
+    0.447456, 0.910877, 0.332688, 0.660967, 0.760714, 0.902170, 0.794638,
+    0.051500, 0.465177, 0.125630, 0.478670, 0.086168, 0.190928, 0.916605,
+    0.120488, 0.187285, 0.176248, 0.934322, 0.257684, 0.309050, 0.433331,
+    0.663949, 0.352703, 0.866405, 0.389519, 0.736502, 0.943226, 0.096682,
+    0.829975, 0.516858, 0.462700, 0.277430, 0.427734, 0.795388, 0.938398,
+    0.188449, 0.697558, 0.733036, 0.239948, 0.162735, 0.858666, 0.718618,
+    0.248903, 0.049594, 0.635223, 0.369391, 0.236879, 0.811472, 0.303713,
+    0.494563, 0.120522, 0.737044, 0.158511, 0.473225, 0.603450, 0.548030,
+    0.209727, 0.546675, 0.644712, 0.039702, 0.063533, 0.107412, 0.317132,
+    0.491267, 0.902800, 0.255530, 0.679716, 0.600359, 0.988566, 0.919664,
+    0.763094, 0.847232, 0.638283, 0.011997, 0.896825, 0.273506, 0.381388,
+    0.133704, 0.084978, 0.685101, 0.628267, 0.205500, 0.422145, 0.786778,
+    0.678725, 0.025595, 0.334808, 0.888452, 0.572271, 0.979520, 0.928154,
+    0.635804, 0.086932, 0.245286, 0.127071, 0.989732, 0.500816, 0.806787,
+    0.590091, 0.489382, 0.726451, 0.353185, 0.336614, 0.364734, 0.365182,
+    0.233439, 0.638240, 0.746570, 0.367143, 0.723218, 0.431671, 0.995410,
+    0.928718, 0.853816, 0.782188, 0.607442, 0.879411, 0.116995, 0.495894,
+    0.451682, 0.096515, 0.424048, 0.087485, 0.183447, 0.669334, 0.214556,
+    0.173179, 0.170151, 0.021343, 0.763269, 0.659533, 0.747794, 0.116454,
+    0.996147, 0.112528, 0.481635, 0.229586, 0.750768, 0.228205, 0.596730,
+    0.473985, 0.659876, 0.592139, 0.402703, 0.513692, 0.374327, 0.010145,
+    0.393103, 0.491322, 0.506039, 0.844785, 0.587837, 0.930088, 0.932270,
+    0.771284, 0.599422, 0.146826, 0.944463, 0.769573, 0.168169, 0.707732,
+    0.429106, 0.915964, 0.824186, 0.425253, 0.028492, 0.305821, 0.654839,
+    0.779259, 0.534026, 0.251569, 0.253245, 0.193901, 0.843708, 0.655947,
+    0.707593, 0.218035, 0.666093, 0.100696, 0.709357, 0.172132, 0.945481,
+    0.297195, 0.102220, 0.877751, 0.068479, 0.701642, 0.024577, 0.012941,
+    0.471215, 0.192747, 0.720673, 0.900321, 0.108710, 0.544859, 0.325574,
+    0.137202, 0.850679, 0.980413, 0.916462, 0.384705, 0.231982, 0.169706,
+    0.578607, 0.075690, 0.825654, 0.286200, 0.293725, 0.491746, 0.386896,
+    0.003083, 0.663878, 0.332377, 0.300278, 0.766098, 0.210128, 0.368756,
+    0.467740, 0.234705, 0.381697, 0.938955, 0.427451, 0.102370, 0.839275,
+    0.536162, 0.647229, 0.164849, 0.673364, 0.497908, 0.145262, 0.589825,
+    0.882613, 0.377244, 0.759532, 0.461220, 0.452934, 0.585185, 0.747420,
+    0.746660, 0.076932, 0.134316, 0.749743, 0.740810, 0.466692, 0.050020,
+    0.506908, 0.676820, 0.418776, 0.974648, 0.911525, 0.800474, 0.913602,
+    0.338976, 0.902844, 0.752878, 0.875138, 0.550072, 0.917727, 0.548502,
+    0.047981, 0.062989, 0.138327, 0.930594, 0.440233, 0.897859, 0.391814,
+    0.893168, 0.483044, 0.139234, 0.639828, 0.559975, 0.273549, 0.389570,
+    0.300785, 0.740242, 0.439590, 0.807693, 0.417062, 0.858367, 0.782341,
+    0.328586, 0.658840, 0.695943, 0.667562, 0.561684, 0.448821, 0.542700,
+    0.111756, 0.366548, 0.091202, 0.159737, 0.429537, 0.229529, 0.090331,
+    0.869770, 0.127388, 0.482145, 0.762938, 0.610432, 0.621379, 0.402765,
+    0.170407, 0.894928, 0.792336, 0.471192, 0.635170, 0.231926, 0.278886,
+    0.052232, 0.090293, 0.061226, 0.380818, 0.749133, 0.757170, 0.048380,
+    0.310817, 0.205990, 0.591080, 0.422573, 0.572538, 0.682282, 0.582310,
+    0.002075, 0.911812, 0.672641, 0.871845, 0.039199, 0.154786, 0.634783,
+    0.649631, 0.776165, 0.037548, 0.820038, 0.671093, 0.829884, 0.291231,
+    0.306263, 0.061810, 0.570116, 0.358495, 0.152103, 0.631343, 0.739313,
+    0.901236, 0.388512, 0.787693, 0.212053, 0.594503, 0.378773, 0.634626,
+    0.167040, 0.061056, 0.216937, 0.169115, 0.972867, 0.889578, 0.040960,
+    0.012067, 0.044364, 0.675743, 0.661698, 0.820529, 0.713291, 0.481736,
+    0.491623, 0.543175, 0.772966, 0.797886, 0.604985, 0.343083, 0.156380,
+    0.757088, 0.974425, 0.895693, 0.658324, 0.362938, 0.683386, 0.870376,
+    0.957440, 0.062159, 0.505002, 0.124481, 0.123215, 0.721939, 0.293596,
+    0.096082, 0.611517, 0.334556, 0.108149, 0.655881, 0.010299, 0.769846,
+    0.476411, 0.723590, 0.251582, 0.968033, 0.266765, 0.024548, 0.765919,
+    0.871750, 0.367631, 0.922299, 0.628838, 0.342056, 0.817992, 0.287162,
+    0.704994, 0.501378, 0.157538, 0.662434, 0.563537, 0.662541, 0.786915,
+    0.686752, 0.384480, 0.080511, 0.782834, 0.995997, 0.415067, 0.890983,
+    0.651878, 0.425365, 0.660829, 0.128289, 0.148956, 0.912411, 0.096322,
+    0.415721, 0.936959, 0.862241, 0.287471, 0.304590, 0.784540, 0.916309,
+    0.646646, 0.602533, 0.203471, 0.351640, 0.103911, 0.361009, 0.014074,
+    0.667448, 0.023550, 0.800989, 0.354200, 0.408030, 0.881500, 0.137034,
+    0.404026, 0.296566, 0.028017, 0.055904, 0.721932, 0.688846, 0.184193,
+    0.870887, 0.601257, 0.280515, 0.286608, 0.538216, 0.142755, 0.574079,
+    0.842806, 0.927296, 0.490388, 0.489452, 0.529828, 0.693859, 0.841092,
+    0.633739, 0.054869, 0.855167, 0.301187, 0.078419, 0.656156, 0.655388,
+    0.486448, 0.537656, 0.792422, 0.890475, 0.834222, 0.820439, 0.946379,
+    0.556153, 0.509285, 0.130571, 0.427041, 0.110542, 0.411086, 0.713648,
+    0.648758, 0.553842, 0.287727, 0.491563, 0.481137, 0.778116, 0.981015,
+    0.010966, 0.471975, 0.822107, 0.644705, 0.526844, 0.677274, 0.945892,
+    0.605263, 0.333430, 0.601280, 0.091711, 0.871086, 0.393702, 0.982186,
+    0.705307, 0.214141, 0.928564, 0.261461, 0.723426, 0.059136, 0.688501,
+    0.833968, 0.470222, 0.402150, 0.482725, 0.024063, 0.689877, 0.974289,
+    0.505201, 0.467993, 0.955304, 0.516166, 0.939968, 0.777411, 0.160871,
+    0.466812, 0.454685, 0.106763, 0.072075, 0.788115, 0.708043, 0.163786,
+    0.659201, 0.101744, 0.145971, 0.364508, 0.315885, 0.074536, 0.625969,
+    0.039311, 0.133672, 0.314471, 0.873279, 0.603893, 0.716620, 0.356004,
+    0.627957, 0.406498, 0.330292, 0.133157, 0.874490, 0.285596, 0.649324,
+    0.814458, 0.063007, 0.810195, 0.281270, 0.517693, 0.916958, 0.353345,
+    0.305808, 0.625000, 0.517131, 0.965009, 0.726745, 0.663102, 0.329518,
+    0.042630, 0.737638, 0.955487, 0.081940, 0.871310, 0.269957, 0.955219,
+    0.475203, 0.986578, 0.311223, 0.103160, 0.393075, 0.641515, 0.236317,
+    0.267566, 0.927112, 0.885641, 0.082024, 0.990119, 0.695835, 0.363295,
+    0.507812, 0.612793, 0.716640, 0.813620, 0.237793, 0.233770, 0.778629,
+    0.964538, 0.896872, 0.108147, 0.007167, 0.634510, 0.063633, 0.089108,
+    0.505820, 0.333591, 0.044327, 0.981023, 0.320168, 0.355550, 0.084182,
+    0.713244, 0.997065, 0.320499, 0.980810, 0.924177, 0.206140, 0.062834,
+    0.914296, 0.901975, 0.426129, 0.422107, 0.514768, 0.142768, 0.235727,
+    0.752561, 0.376539, 0.014356, 0.717099, 0.273411, 0.122502, 0.724266,
+    0.907921, 0.186136, 0.813374, 0.413741, 0.519726, 0.857701, 0.394764,
+    0.839895, 0.213251, 0.478946, 0.553139, 0.210317, 0.799446, 0.533948,
+    0.134493, 0.005586, 0.596782, 0.048789, 0.907561, 0.022911, 0.470896,
+    0.422329, 0.165679, 0.706623, 0.174890, 0.542218, 0.720979, 0.891989,
+    0.815629, 0.843481, 0.616255, 0.723551, 0.029617, 0.429630, 0.137292,
+    0.549343, 0.287331, 0.532056, 0.389238, 0.500583, 0.011002, 0.942377,
+    0.710899, 0.810448, 0.476326, 0.845392, 0.816033, 0.073108, 0.894181,
+    0.723594, 0.096019, 0.365077, 0.145923, 0.261699, 0.071700, 0.320813,
+    0.803917, 0.792679, 0.212802, 0.619546, 0.636160, 0.829057, 0.343096,
+    0.665777, 0.258687, 0.480388, 0.215121, 0.546018, 0.012444, 0.604359,
+    0.046601, 0.023446, 0.546736, 0.757500, 0.833893, 0.023062, 0.602892,
+    0.649927, 0.096170, 0.497074, 0.373521, 0.192189, 0.862151, 0.519444,
+    0.453887, 0.933851, 0.840257, 0.257804, 0.726531, 0.053058, 0.877350,
+    0.362691, 0.882115, 0.220446, 0.028468, 0.140802, 0.700834, 0.243589,
+    0.686821, 0.713278, 0.847948, 0.733421, 0.736723, 0.394684, 0.490921,
+    0.570617, 0.417746, 0.093813, 0.220543, 0.513916, 0.590887, 0.594064,
+    0.706105, 0.453038, 0.113508, 0.159992, 0.386889, 0.953765, 0.417796,
+    0.113420, 0.006823, 0.295146, 0.476111, 0.888938, 0.515592, 0.504579,
+    0.029741, 0.216426, 0.748168, 0.716561, 0.929703, 0.596117, 0.449982,
+    0.666427, 0.990801, 0.940903, 0.237043, 0.408547, 0.034717, 0.457587,
+    0.922463, 0.625603, 0.051651, 0.628568, 0.078641, 0.165159, 0.788560,
+    0.465530, 0.118923, 0.206356, 0.578950, 0.125746, 0.501502, 0.055060,
+    0.014685, 0.017094, 0.559640, 0.044425, 0.233519, 0.307808, 0.760986,
+    0.163223, 0.903925, 0.210969, 0.829650, 0.894726, 0.151872, 0.066693,
+    0.303273, 0.186589, 0.524279, 0.225736, 0.812192, 0.575930, 0.854304,
+    0.890833, 0.741089, 0.642864, 0.356363, 0.860012, 0.849220, 0.935313,
+    0.985758, 0.350722, 0.990373, 0.000443, 0.367815, 0.550013, 0.044868,
+    0.601335, 0.857820, 0.805855, 0.764557, 0.761745, 0.016823, 0.594207,
+    0.656471, 0.168696, 0.660900, 0.959744, 0.355284, 0.185179, 0.185480,
+    0.167477, 0.761110, 0.039784, 0.058310, 0.502199, 0.682648, 0.414673,
+    0.362211, 0.531868, 0.349985, 0.347969, 0.882589, 0.340358, 0.348412,
+    0.250404, 0.890371, 0.393280, 0.851739, 0.748191, 0.199135, 0.616297,
+    0.509936, 0.215958, 0.210504, 0.166407, 0.384654, 0.871404, 0.126151,
+    0.739938, 0.056583, 0.311631, 0.907415, 0.817693, 0.351415, 0.965724,
+    0.319891, 0.034062, 0.380397, 0.682102, 0.565930, 0.730382, 0.030072,
+    0.448519, 0.070741, 0.378484, 0.698924, 0.961112, 0.771764, 0.550663,
+    0.709303, 0.970899, 0.166959, 0.219239, 0.186857, 0.377463, 0.385647,
+    0.571511, 0.248867, 0.511798, 0.311449, 0.305450, 0.823429, 0.218864,
+    0.123142, 0.174844, 0.184588, 0.443034, 0.208906, 0.564986, 0.125136,
+    0.774836, 0.295368, 0.155207, 0.223355, 0.366109, 0.533691, 0.922279,
+    0.327221, 0.305455, 0.472942, 0.036524, 0.276354, 0.639901, 0.255763,
+    0.463211, 0.017364, 0.641410, 0.034722, 0.266231, 0.153207, 0.346171,
+    0.571680, 0.976636, 0.565036, 0.694822, 0.151480, 0.749624, 0.137856,
+    0.360386, 0.314610, 0.262992, 0.135222, 0.609978, 0.418200, 0.358578,
+    0.976087, 0.951891, 0.280856, 0.303307, 0.257346, 0.753798, 0.339831,
+    0.533700, 0.393699, 0.595594, 0.996911, 0.411063, 0.237003, 0.031634,
+    0.677294, 0.390211, 0.377805, 0.248974, 0.366847, 0.942841, 0.943796,
+    0.518327, 0.692465, 0.081653, 0.878713, 0.007074, 0.344645, 0.013936,
+    0.617052, 0.762845, 0.372513, 0.593138, 0.714736, 0.653370, 0.896446,
+    0.972082, 0.407168, 0.236276, 0.505782, 0.800867, 0.831870, 0.502693,
+    0.211930, 0.068873, 0.534327, 0.889224, 0.459084, 0.912132, 0.138197,
+    0.825931, 0.854972, 0.081994, 0.344259, 0.547437, 0.163646, 0.222972,
+    0.554511, 0.508291, 0.236908, 0.171563, 0.271135, 0.609421, 0.764701,
+    0.985871, 0.262790, 0.661147, 0.957953, 0.669958, 0.897423, 0.463734,
+    0.470825, 0.729293, 0.966427, 0.682755, 0.798166, 0.500754, 0.571978,
+    0.257251, 0.412886, 0.710176, 0.083182, 0.267858, 0.792169, 0.427441,
+    0.815295, 0.955815, 0.650413, 0.369805, 0.464106, 0.887320, 0.541368,
+    0.735242, 0.496741, 0.306069, 0.721113, 0.759531, 0.967216, 0.679065,
+    0.429489, 0.864639, 0.142799, 0.900314, 0.593932, 0.109227, 0.583069,
+    0.392098, 0.609981, 0.155047, 0.649349, 0.022867, 0.865222, 0.732531,
+    0.290725, 0.657392, 0.159972, 0.106019, 0.613207, 0.810384, 0.475824,
+    0.077313, 0.697704, 0.017192, 0.812555};
+
+static float golden_endtoend_output[] = {
+    -1.881211, -0.028385, -3.585066, 1.939770,  -3.461155, 1.280415,  -4.408978,
+    0.608663,  -2.704937, 1.859742,  -5.777429, 2.691839,  -1.049012, 1.640870,
+    -4.856245, 1.604236,  0.992707,  0.422858,  -4.307465, 1.887332,  -0.884831,
+    -0.154277, -2.634801, 0.586827,  -1.849960, 1.399608,  -4.531559, 1.943591,
+    0.271676,  -2.893054, -2.066826, 0.235467,  -1.248263, -1.164534, -2.640174,
+    -0.112878, -4.386484, 1.253024,  -4.135623, 1.068984,  -0.043579, -0.832957,
+    -3.257258, -0.514396, -1.651174, 0.638630,  -4.364372, 1.548441,  -0.289455,
+    0.539845,  -4.097627, 0.635001,  -0.465071, -0.927701, -2.481498, 0.356616,
+    -2.355012, 0.728806,  -3.340283, 1.609038,  -4.786268, -0.532272, -1.886150,
+    0.254797,  0.746620,  -1.657134, -3.264265, 0.525551,  -1.756837, 0.845446,
+    -5.572190, 1.715797,  -2.856942, 3.394245,  -5.803662, 2.281806,  -3.014739,
+    2.616136,  -4.728482, 1.659984,  -2.106307, 2.711709,  -6.173832, 1.352869,
+    -0.038035, 0.107619,  -4.279774, 2.341930,  -0.980413, -0.119538, -4.049717,
+    1.172128,  -3.477744, 2.602274,  -6.231380, 2.537300,  -0.862214, 0.568722,
+    -3.858362, 0.197867,  -1.725885, 3.687312,  -7.067363, 2.403544,  -0.944963,
+    0.235639,  -3.250094, 0.659117,  -1.459576, 0.426128,  -3.637207, 1.030386,
+    -4.224351, 3.516220,  -6.053367, 0.993473,  -2.182416, -0.762625, -1.884405,
+    -0.113736, -2.572602, 0.329290,  -1.913233, 0.517418,  -0.019757, 0.203176,
+    -3.715881, 0.482136,  -1.912823, 1.357907,  -5.473043, 1.714658,  -3.177160,
+    0.089285,  -3.127669, 1.268076,  0.772498,  -1.622712, -3.850314, 0.436124,
+    -1.495983, 3.439982,  -7.623405, 1.726721,  -0.423979, 0.180201,  -2.902406,
+    0.986457,  -1.845638, 0.460903,  -5.359343, -1.133931, -1.074456, 0.717304,
+    -3.519856, 1.012126,  -0.562301, 1.881967,  -6.716627, 2.525036,  0.945480,
+    0.337081,  -5.210562, 2.572035,  -0.943370, 0.442026,  -2.666313, 0.411296,
+    0.002787,  -0.000735, -2.498933, 0.771719,  -3.568153, 3.833721,  -6.617026,
+    2.813922,  -0.573970, 1.025208,  -3.909923, 1.722648,  -1.406849, 0.719783,
+    -5.207438, 1.819442,  -0.530895, -0.010887, -2.939614, 0.971225,  -1.660297,
+    1.345243,  -4.454571, 2.244876,  -2.021213, 1.756090,  -4.880947, 0.364597,
+    -2.380270, 2.763117,  -5.613013, 2.137534,  0.289101,  -2.279400, -3.365582,
+    0.170028,  -1.142254, -0.709604, -3.656223, 1.804870,  -0.854690, 0.592102,
+    -5.010415, 2.462687,  -1.474710, 0.566002,  -3.621819, -0.391946, -0.423524,
+    -0.631428, -3.513310, 0.962825,  -1.480262, 0.319791,  -3.610137, 1.842339,
+    -0.250073, 1.182022,  -6.249267, 1.604172,  1.153759,  -0.734054, -4.620415,
+    -0.030858, 0.050911,  1.524406,  -4.724010, 1.451846,  -3.277104, 2.414182,
+    -4.605285, 1.846092,  -1.503047, -0.618200, -2.746546, -0.459332, -0.980326,
+    -1.199977, -2.043865, -0.165793, -2.214698, 3.108281,  -7.127830, -0.123065,
+    1.244948,  -3.039923, -4.660061, -0.225957, -0.307210, -1.513205, -2.456005,
+    0.840048,  -0.741445, 2.328635,  -6.015267, 2.723240,  -1.381171, -0.728878,
+    -5.114925, -0.362034, -0.574923, 0.518080,  -3.892457, 1.798948,  0.435119,
+    -0.371696, -2.807571, 1.302864,  -2.063052, 1.036388,  -4.232038, 1.397059,
+    -1.615668, -1.511019, -3.095508, 1.290955,  -3.428723, 2.000287,  -4.196487,
+    1.566983,  0.196957,  0.224343,  -4.926359, -0.691975, -0.214941, 1.546821,
+    -5.384868, 2.290820,  -1.878865, 0.493692,  -4.129823, 2.112036,  0.516558,
+    -2.553077, -2.717338, 0.017146,  -2.016057, 1.628995,  -4.240602, 1.189533,
+    -5.460220, 1.254738,  -4.214903, 0.755659,  -2.893235, 2.937762,  -6.169453,
+    2.035456,  -5.613212, -0.122254, -1.973646, -0.060619, -2.119598, 1.413512,
+    -4.938738, 1.890244,  0.544169,  -2.062413, -3.329637, -0.062515, -1.855805,
+    -0.791297, -2.570353, 0.607615,  0.305812,  0.338930,  -4.150270, 2.274937,
+    0.042653,  0.133825,  -3.538155, 1.523639,  -3.173690, -1.496599, -2.414655,
+    0.464687,  -1.448998, -0.368907, -3.520129, 0.203382,  -2.443626, 1.266233,
+    -3.393848, 0.605911,  -0.015353, 1.402006,  -4.441003, 1.419281,  0.603587,
+    0.434146,  -4.966566, 2.171872,  -0.688264, -0.009981, -4.461103, 1.538354,
+    -5.029816, -0.264424, -1.713510, -0.315258, -1.891606, 0.252074,  -2.419428,
+    0.043970,  -1.291143, 2.048704,  -4.590105, 0.524734,  -1.889576, 0.134836,
+    -3.462745, 1.390663,  -0.112773, 0.402735,  -4.203784, 1.381043,  -1.201634,
+    -1.968277, -1.425637, -0.181725, -1.250742, -2.102041, -3.925464, -1.256797,
+    -3.701354, -1.754610, -1.917231, -1.455910, -1.838006, 2.041781,  -5.666212,
+    2.752957,  -2.659553, 2.553637,  -4.872212, 1.443437,  -2.081846, 3.311263,
+    -5.912457, 1.871049,  0.196148,  -0.307044, -4.024967, 2.149149,  0.361809,
+    0.620415,  -5.939984, 0.180672,  -1.209180, -0.269122, -3.240285, 1.460315,
+    -1.040803, 1.125700,  -6.060366, 0.887767,  -3.214111, 1.314368,  -3.026808,
+    1.023640,  -3.815175, 1.795642,  -4.355603, 1.064454,  -0.046472, 0.618463,
+    -5.941646, 2.861891,  -2.852155, -0.990457, -2.624445, 1.794494,  -1.176747,
+    -0.358159, -3.206776, 1.138721,  -2.819523, -1.825522, -1.450902, -0.187312,
+    -0.808727, 0.636872,  -4.120567, 1.192623,  0.810731,  -1.768519, -3.699450,
+    1.527116,  -2.772720, 3.012835,  -5.912736, 1.599365,  -4.696381, 2.234591,
+    -4.139552, 1.061768,  -1.880089, 3.596274,  -7.006379, 2.382152,  -3.158115,
+    3.844430,  -7.044156, 2.307596,  -2.473970, 1.312644,  -5.467269, 0.197154,
+    -1.530040, 1.762275,  -5.550757, 0.630276,  -3.048947, 1.043777,  -3.096658,
+    1.345893,  -1.329494, 2.065748,  -4.711032, 2.227600,  -0.413321, -0.032428,
+    -4.599650, 1.668734,  -4.351490, -0.200022, -2.359903, 0.021997,  0.116028,
+    1.159718,  -5.093972, -0.142951, -2.409895, 0.906133,  -2.728812, 0.809932,
+    -2.597363, 0.494130,  -2.357861, 0.369825,  -2.165235, 1.148522,  -3.130562,
+    0.759034,  0.646335,  -1.463660, -3.508299, 1.059679,  -1.485465, 1.007319,
+    -4.340716, 1.789864,  -1.590654, 1.612324,  -4.452007, 2.389805,  -5.200148,
+    -1.068398, -1.306923, -0.472408, -0.392165, -0.524996, -2.933478, 1.518430,
+    -1.287781, 0.113422,  -3.020525, 1.338359,  -0.105982, 0.936014,  -4.132197,
+    1.836807,  -0.616589, -1.029716, -3.271347, 0.284889,  -2.653359, 2.135829,
+    -4.643613, 1.627981,  0.287733,  -2.017263, -2.776574, 1.184792,  1.004161,
+    -1.483019, -4.339290, -0.787322, 0.582420,  1.137839,  -5.673941, -0.001862,
+    -1.219142, 0.532561,  -4.457245, 1.826807,  -3.343291, 3.034610,  -6.179855,
+    2.235917,  -4.369989, 4.018128,  -6.632714, 0.926585,  -0.485469, 0.536073,
+    -4.179557, 1.489637,  -0.521762, 1.636089,  -6.137912, 1.500867,  -4.086009,
+    1.961372,  -3.688977, 1.358220,  -1.544034, 1.763837,  -4.357567, 1.852201,
+    -2.018725, 1.046264,  -6.211127, 1.609419,  -0.118441, 1.602284,  -6.242423,
+    1.518578,  -0.604078, 1.106613,  -5.393445, 2.595629,  0.142712,  -1.903953,
+    -2.821177, 0.032758,  -0.009152, 0.184628,  -4.227636, 2.046843,  -2.240138,
+    1.256176,  -5.108516, -0.308447, -2.998571, 4.657396,  -7.582112, 2.510951,
+    -3.535784, 1.704560,  -5.068484, 1.318466,  -3.058265, 3.073172,  -6.998089,
+    3.178849,  -2.420286, 2.277806,  -4.999528, 1.423890,  -1.672914, 0.447460,
+    -4.088940, 1.351087,  -1.051546, -0.417955, -4.042147, 1.604102,  -1.700931,
+    2.796663,  -6.497579, 2.857974,  -0.240828, 0.858001,  -5.778933, 2.778508,
+    -0.406211, 1.300766,  -5.073671, 2.089362,  -0.201673, 1.588396,  -6.000150,
+    2.185055,  -2.332125, 0.768216,  -2.609184, 0.327277,  -3.358943, -1.020736,
+    -2.389984, 0.315512,  -0.561905, 1.948740,  -6.408485, 2.231985,  -0.603652,
+    0.661829,  -5.070386, -1.063058, -0.624796, 1.375772,  -4.379606, 1.929358,
+    -1.047263, 0.739100,  -5.217857, 2.127625,  -5.025338, 0.650344,  -2.068460,
+    0.076936,  -0.457505, -1.050984, -1.917765, 1.150908,  0.782625,  0.855595,
+    -5.321719, 0.787209,  -0.460232, 1.106736,  -5.552326, 2.801043,  -0.360217,
+    -0.434432, -4.273378, 0.967556,  -0.972652, 0.874811,  -5.429918, -0.331039,
+    0.115477,  0.111883,  -5.418786, 1.240546,  -1.842794, 0.505880,  -3.676064,
+    -0.682369, 1.858984,  -0.742566, -5.784060, 0.673239,  -1.280398, 0.280842,
+    -4.848077, 2.214860,  -0.785100, -0.588488, -2.438206, 0.786651,  -1.568752,
+    1.935400,  -6.320256, 2.125338,  -1.476457, -1.651941, -2.695734, 0.007338,
+    -3.280860, 2.310385,  -5.319578, 1.890123,  -0.775723, 0.630606,  -4.321582,
+    1.085521,  -1.847371, 1.188521,  -4.596577, 2.056443,  -2.340172, -0.108501,
+    -3.156392, 0.933279,  -0.495331, 0.122405,  -5.171133, 1.763245,  -0.796913,
+    2.310487,  -7.247197, 2.401678,  -1.908860, 0.043798,  -2.393796, 0.573806,
+    -0.608531, 0.154710,  -4.669001, 0.750680,  0.468380,  0.392591,  -4.755001,
+    2.615217,  -1.957774, 1.153513,  -4.530099, 1.124362,  -3.569415, 1.697154,
+    -3.536335, 0.910758,  -2.976264, 1.833129,  -4.287203, -0.547050, -2.409768,
+    0.061585,  -1.324116, 0.268497,  -2.962222, -1.524245, -2.063413, 0.442058,
+    -4.292337, 3.538863,  -6.699603, 1.718664,  -2.290363, 1.994596,  -6.245037,
+    -0.433084, -0.367059, 1.020297,  -4.940721, 2.902264,  -0.577056, -0.709887,
+    -5.001413, -0.268316, -1.112048, -1.083307, -1.753492, 0.209973,  0.139540,
+    0.917602,  -5.232745, 2.538467,  -2.139234, -0.187388, -1.837249, -0.478582,
+    -0.731653, -0.481550, -2.531261, 1.044770,  0.707750,  0.279971,  -3.221119,
+    1.552074,  -2.373144, 0.859518,  -3.665156, 1.620278,  -1.440871, -0.525581,
+    -2.758271, 1.491873,  -2.302013, 1.119935,  -5.257080, 2.627170,  -3.174739,
+    1.363282,  -4.831639, 1.101076,  -4.337008, 2.689639,  -5.165915, 1.069201,
+    -1.882078, -0.120370, -2.287967, 1.147619,  -1.403616, 1.077150,  -5.084296,
+    1.658236,  -0.919642, 0.487423,  -3.001075, 0.741268,  0.107300,  0.943556,
+    -3.544311, 1.000239,  -1.627171, 2.871253,  -5.179172, 1.429893,  -0.826040,
+    0.188670,  -4.499894, 1.013447,  -2.101299, 0.317516,  -3.452141, -0.833776,
+    -1.362144, 1.272437,  -4.449355, 1.613591,  -2.039873, 2.613175,  -6.229640,
+    1.659790,  -1.595520, -0.237462, -2.744997, 0.337841,  0.148981,  -1.703771,
+    -2.388023, 1.276469,  1.058508,  -0.401642, -4.680769, 0.861881,  -1.336381,
+    1.153080,  -2.834378, 0.721075,  0.900115,  1.360511,  -5.573611, 0.949182,
+    -2.970844, 2.017563,  -5.186108, -0.201038, -1.192824, 0.610142,  -4.450919,
+    -0.897114, -1.812093, 0.422310,  -5.245487, 0.256549,  0.320275,  -2.324150,
+    -2.967040, -0.260536, -0.721467, 0.454148,  -5.058031, 0.526370,  -0.895656,
+    0.732240,  -3.327363, 1.353953,  -1.277912, -0.483171, -1.926713, 0.065044,
+    -2.167506, -0.196606, -1.923437, 0.604962,  -2.088319, 1.406834,  -5.227296,
+    2.247351,  -4.421744, 1.729791,  -5.007922, 1.264769,  -0.897019, 0.922902,
+    -3.887108, 2.087432,  -1.310226, -0.101938, -3.359082, -0.079662, -0.514988,
+    -0.963179, -4.038209, 2.223278,  -0.590083, -2.310458, -1.748338, 0.363406,
+    -0.540731, -0.885913, -4.179595, 2.216781,  -3.044339, -0.447100, -2.446098,
+    0.931101,  -1.676190, 2.096175,  -4.980755, 2.262151,  -1.095047, 1.897516,
+    -5.996138, 2.191038,  0.297128,  -0.780974, -2.884299, 1.195408,  -0.521065,
+    -1.955837, -3.091064, -0.404183, -1.961519, 4.076096,  -7.521851, 2.242064,
+    -1.988043, 0.303300,  -2.422585, 0.322230,  -3.377634, 3.499955,  -7.084434,
+    2.375587,  -0.718851, 2.150076,  -5.412241, 2.374280,  -2.006088, 2.229828,
+    -5.848188, 2.543077,  -2.171042, 2.096026,  -5.300007, 0.141405,  -1.187745,
+    0.105340,  -4.003816, 1.034281,  -3.980804, 1.856709,  -5.103042, 0.623737,
+    -2.080307, 0.896140,  -3.104050, 0.983158,  -0.424898, -1.154270, -3.805728,
+    1.978917,  -1.314387, 1.235096,  -3.148906, 1.113173,  0.111713,  2.055213,
+    -7.565283, 2.100342};
+constexpr std::initializer_list<float> biases = {
+    0.065691948, -0.69055247, 0.1107955,  -0.97084129, -0.23957068, -0.23566568,
+    -0.389184,   0.47481549,  -0.4791103, 0.29931796,  0.10463274,  0.83918178,
+    0.37197268,  0.61957061,  0.3956964,  -0.37609905};
+
+constexpr std::initializer_list<float> recurrent_weights = {
+    0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0.1};
+
+class BidirectionalRNNOpModel : public SingleOpModel {
+ public:
+  BidirectionalRNNOpModel(int batches, int sequence_len, int fw_units,
+                          int bw_units, int input_size)
+      : batches_(batches),
+        sequence_len_(sequence_len),
+        fw_units_(fw_units),
+        bw_units_(bw_units),
+        input_size_(input_size) {
+    input_ = AddInput(TensorType_FLOAT32);
+    fw_weights_ = AddInput(TensorType_FLOAT32);
+    fw_recurrent_weights_ = AddInput(TensorType_FLOAT32);
+    fw_bias_ = AddInput(TensorType_FLOAT32);
+    fw_hidden_state_ = AddOutput(TensorType_FLOAT32);
+    fw_output_ = AddOutput(TensorType_FLOAT32);
+    bw_weights_ = AddInput(TensorType_FLOAT32);
+    bw_recurrent_weights_ = AddInput(TensorType_FLOAT32);
+    bw_bias_ = AddInput(TensorType_FLOAT32);
+    bw_hidden_state_ = AddOutput(TensorType_FLOAT32);
+    bw_output_ = AddOutput(TensorType_FLOAT32);
+    SetBuiltinOp(BuiltinOperator_BIDIRECTIONAL_SEQUENCE_RNN,
+                 BuiltinOptions_SequenceRNNOptions,
+                 CreateSequenceRNNOptions(builder_, /*time_major=*/false,
+                                          ActivationFunctionType_RELU)
+                     .Union());
+    BuildInterpreter({
+        {batches_, sequence_len_, input_size_},  // input
+        {fw_units_, input_size_},                // fw_weights
+        {fw_units_, fw_units_},                  // fw_recurrent_weights
+        {fw_units_},                             // fw_bias
+        {bw_units_, input_size_},                // bw_weights
+        {bw_units_, bw_units_},                  // bw_recurrent_weights
+        {bw_units_}                              // bw_bias
+    });
+  }
+
+  void SetFwBias(std::initializer_list<float> f) {
+    PopulateTensor(fw_bias_, f);
+  }
+
+  void SetBwBias(std::initializer_list<float> f) {
+    PopulateTensor(bw_bias_, f);
+  }
+
+  void SetFwWeights(std::initializer_list<float> f) {
+    PopulateTensor(fw_weights_, f);
+  }
+
+  void SetBwWeights(std::initializer_list<float> f) {
+    PopulateTensor(bw_weights_, f);
+  }
+
+  void SetFwRecurrentWeights(std::initializer_list<float> f) {
+    PopulateTensor(fw_recurrent_weights_, f);
+  }
+
+  void SetBwRecurrentWeights(std::initializer_list<float> f) {
+    PopulateTensor(bw_recurrent_weights_, f);
+  }
+
+  void SetInput(std::initializer_list<float> data) {
+    PopulateTensor(input_, data);
+  }
+
+  void SetInput(int offset, float* begin, float* end) {
+    PopulateTensor(input_, offset, begin, end);
+  }
+
+  void ResetHiddenStates() {
+    const int fw_zero_buffer_size = fw_units_ * batches_;
+    std::unique_ptr<float[]> fw_zero_buffer(new float[fw_zero_buffer_size]);
+    memset(fw_zero_buffer.get(), 0, fw_zero_buffer_size * sizeof(float));
+    PopulateTensor(fw_hidden_state_, 0, fw_zero_buffer.get(),
+                   fw_zero_buffer.get() + fw_zero_buffer_size);
+    const int bw_zero_buffer_size = bw_units_ * batches_;
+    std::unique_ptr<float[]> bw_zero_buffer(new float[bw_zero_buffer_size]);
+    memset(bw_zero_buffer.get(), 0, bw_zero_buffer_size * sizeof(float));
+    PopulateTensor(bw_hidden_state_, 0, bw_zero_buffer.get(),
+                   bw_zero_buffer.get() + bw_zero_buffer_size);
+  }
+
+  std::vector<float> GetFwOutput() { return ExtractVector<float>(fw_output_); }
+  std::vector<float> GetBwOutput() { return ExtractVector<float>(bw_output_); }
+
+  int input_size() { return input_size_; }
+  int num_fw_units() { return fw_units_; }
+  int num_bw_units() { return bw_units_; }
+  int num_batches() { return batches_; }
+  int sequence_len() { return sequence_len_; }
+
+ private:
+  int input_;
+  int fw_weights_;
+  int fw_recurrent_weights_;
+  int fw_bias_;
+  int fw_hidden_state_;
+  int fw_output_;
+  int bw_weights_;
+  int bw_recurrent_weights_;
+  int bw_bias_;
+  int bw_hidden_state_;
+  int bw_output_;
+
+  int batches_;
+  int sequence_len_;
+  int fw_units_;
+  int bw_units_;
+  int input_size_;
+};
+
+// TODO(mirkov): add another test which directly compares to TF once TOCO
+// supports the conversion from dynamic_rnn with BasicRNNCell.
+TEST(BidirectionalRNNOpTest, BlackBoxTest) {
+  BidirectionalRNNOpModel rnn(/*batches=*/2, /*sequence_len=*/16,
+                              /*fw_units=*/16, /*bw_units=*/16,
+                              /*input_size=*/8);
+  rnn.SetFwWeights(weights);
+  rnn.SetBwWeights(weights);
+  rnn.SetFwBias(biases);
+  rnn.SetBwBias(biases);
+  rnn.SetFwRecurrentWeights(recurrent_weights);
+  rnn.SetBwRecurrentWeights(recurrent_weights);
+
+  rnn.ResetHiddenStates();
+  const int input_sequence_size = rnn.input_size() * rnn.sequence_len();
+  float* batch_start = rnn_input;
+  float* batch_end = batch_start + input_sequence_size;
+  rnn.SetInput(0, batch_start, batch_end);
+  rnn.SetInput(input_sequence_size, batch_start, batch_end);
+
+  rnn.Invoke();
+
+  float* golden_fw_start = rnn_golden_fw_output;
+  float* golden_fw_end =
+      golden_fw_start + rnn.num_fw_units() * rnn.sequence_len();
+  std::vector<float> fw_expected;
+  fw_expected.insert(fw_expected.end(), golden_fw_start, golden_fw_end);
+  fw_expected.insert(fw_expected.end(), golden_fw_start, golden_fw_end);
+  EXPECT_THAT(rnn.GetFwOutput(), ElementsAreArray(ArrayFloatNear(fw_expected)));
+
+  float* golden_bw_start = rnn_golden_bw_output;
+  float* golden_bw_end =
+      golden_bw_start + rnn.num_bw_units() * rnn.sequence_len();
+  std::vector<float> bw_expected;
+  bw_expected.insert(bw_expected.end(), golden_bw_start, golden_bw_end);
+  bw_expected.insert(bw_expected.end(), golden_bw_start, golden_bw_end);
+  EXPECT_THAT(rnn.GetBwOutput(), ElementsAreArray(ArrayFloatNear(bw_expected)));
+}
+
+// Check that if the input sequence is reversed the outputs are the same just
+// forward and backward are swapped (and reversed).
+TEST(BidirectionalRNNOpTest, BlackBoxTestReverseInputs) {
+  BidirectionalRNNOpModel rnn(/*batches=*/2, /*sequence_len=*/16,
+                              /*fw_units=*/16, /*bw_units=*/16,
+                              /*input_size=*/8);
+  rnn.SetFwWeights(weights);
+  rnn.SetBwWeights(weights);
+  rnn.SetFwBias(biases);
+  rnn.SetBwBias(biases);
+  rnn.SetFwRecurrentWeights(recurrent_weights);
+  rnn.SetBwRecurrentWeights(recurrent_weights);
+
+  rnn.ResetHiddenStates();
+
+  // Reverse inputs in each batch: in_1, in_2,..., in_k is inserted in the
+  // following order: [in_k,..., in_2, in_1, in_k,...,in_2, in_1].
+  for (int i = 0; i < rnn.sequence_len(); i++) {
+    float* batch_start = rnn_input + i * rnn.input_size();
+    float* batch_end = batch_start + rnn.input_size();
+    const int reverse_idx = rnn.sequence_len() - i - 1;
+    rnn.SetInput(reverse_idx * rnn.input_size(), batch_start, batch_end);
+    rnn.SetInput((rnn.sequence_len() + reverse_idx) * rnn.input_size(),
+                 batch_start, batch_end);
+  }
+
+  rnn.Invoke();
+
+  // The forward and backward outputs are swapped.
+  std::vector<float> fw_expected;  // consider using std::deque instead.
+  for (int i = 0; i < rnn.sequence_len(); i++) {
+    float* golden_fw_start = rnn_golden_bw_output + i * rnn.num_fw_units();
+    float* golden_fw_end = golden_fw_start + rnn.num_fw_units();
+    fw_expected.insert(fw_expected.begin(), golden_fw_start, golden_fw_end);
+  }
+  fw_expected.insert(fw_expected.end(), fw_expected.begin(), fw_expected.end());
+  EXPECT_THAT(rnn.GetFwOutput(), ElementsAreArray(ArrayFloatNear(fw_expected)));
+
+  std::vector<float> bw_expected;
+  for (int i = 0; i < rnn.sequence_len(); i++) {
+    float* golden_bw_start = rnn_golden_fw_output + i * rnn.num_bw_units();
+    float* golden_bw_end = golden_bw_start + rnn.num_bw_units();
+    bw_expected.insert(bw_expected.begin(), golden_bw_start, golden_bw_end);
+  }
+  bw_expected.insert(bw_expected.end(), bw_expected.begin(), bw_expected.end());
+  EXPECT_THAT(rnn.GetBwOutput(), ElementsAreArray(ArrayFloatNear(bw_expected)));
+}
+
+// Tests an end-to-end neural network with a Bidirectional RNN followed by a
+// DNN that aggregates the outputs from the two sequences.
+TEST(BidirectionalRNNOpTest, EndToEndTest) {
+  BidirectionalRNNOpModel rnn(/*batches=*/1, /*sequence_len=*/4,
+                              /*fw_units=*/16, /*bw_units=*/16,
+                              /*input_size=*/8);
+  const int output_size = 4;
+  float dnn_weights[] = {
+      -0.5782342,  -0.052212059, 0.73036242,  -0.81216097, -0.80088139,
+      -0.23420811, -0.39647382,  0.31423986,  0.61819065,  -0.73659575,
+      -0.89698344, -0.8931554,   -0.0845688,  0.5617367,   0.38415289,
+      -0.11487955, -0.7617774,   0.17927337,  0.15726972,  0.059798479,
+      0.19009054,  -0.27616632,  -0.39142907, 0.77744663,  -0.046830714,
+      -0.6603595,  0.21945822,   0.051494241, 0.23785079,  0.19239247,
+      -0.53268754, 0.65961659,   -0.85981959, -0.80232513, 0.84745562,
+      -0.66070104, -0.036533296, -0.54901814, 0.65353882,  -0.41834265,
+      -0.28561389, 0.75655544,   -0.31149811, 0.62981737,  0.31829214,
+      -0.92734522, -0.48506218,  0.55651462,  0.25192821,  0.67220747,
+      -0.3836869,  -0.55798125,  -0.60395885, 0.22488403,  -0.78053463,
+      0.3492105,   0.56452453,   0.4389236,   -0.59929526, -0.19762468,
+      -0.36868393, -0.13198286,  -0.53800809, -0.22850353};
+
+  std::initializer_list<float> dnn_biases = {
+    0.29177809, -0.98799044, 0.065919638, 0.68781924};
+
+  rnn.SetFwWeights(weights);
+  rnn.SetBwWeights(weights);
+  rnn.SetFwBias(biases);
+  rnn.SetBwBias(biases);
+  rnn.SetFwRecurrentWeights(recurrent_weights);
+  rnn.SetBwRecurrentWeights(recurrent_weights);
+
+  rnn.ResetHiddenStates();
+
+  const int input_sequence_size = rnn.input_size() * rnn.sequence_len();
+  const int output_sequence_size = output_size * rnn.sequence_len();
+  const int num_examples = 64;
+  for (int k = 0; k < num_examples; k++) {
+    float* batch_start = endtoend_input + k * input_sequence_size;
+    float* batch_end = batch_start + input_sequence_size;
+    rnn.SetInput(0, batch_start, batch_end);
+
+    rnn.Invoke();
+
+    std::vector<float> fw_output = rnn.GetFwOutput();
+    std::vector<float> bw_output = rnn.GetBwOutput();
+    EXPECT_EQ(fw_output.size(), bw_output.size());
+
+    std::transform(fw_output.begin(), fw_output.end(), bw_output.begin(),
+                   fw_output.begin(), std::plus<float>());
+
+    std::vector<float> sequence_result;
+    for (int s = 0; s < rnn.sequence_len(); s++) {
+      const float* rnn_output = fw_output.data() + s * rnn.num_fw_units();
+      std::vector<float> results(dnn_biases);
+      for (int i = 0; i < output_size; i++) {
+        for (int j = 0; j < rnn.num_fw_units(); j++) {
+          results[i] += *(rnn_output + j) * dnn_weights[output_size * j + i];
+        }
+      }
+      sequence_result.insert(sequence_result.end(), results.begin(),
+                             results.end());
+    }
+
+    float* golden_start = golden_endtoend_output + k * output_sequence_size;
+    float* golden_end = golden_start + output_sequence_size;
+
+    std::vector<float> expected;
+    expected.insert(expected.end(), golden_start, golden_end);
+    EXPECT_THAT(sequence_result, ElementsAreArray(ArrayFloatNear(expected)));
+  }
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  // On Linux, add: tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/conv.cc b/tensorflow/contrib/lite/kernels/conv.cc
index 37f499a4d09a38765aa4b8db8aa91b708edd7823..a5095e1e646dbe0e359388a207a1479c8bdcb153 100644
--- a/tensorflow/contrib/lite/kernels/conv.cc
+++ b/tensorflow/contrib/lite/kernels/conv.cc
@@ -42,7 +42,7 @@ namespace conv {
 enum KernelType {
   kReference,
   kGenericOptimized,  // Neon-free
-  kNeonOptimized,
+  kMultithreadOptimized,
 };
 
 struct OpData {
@@ -290,26 +290,33 @@ void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
   auto filter_offset = -filter->params.zero_point;
   auto output_offset = output->params.zero_point;
 
-  if (kernel_type == kReference) {
-    reference_ops::Conv(
-        GetTensorData<uint8_t>(input), GetTensorDims(input), input_offset,
-        GetTensorData<uint8_t>(filter), GetTensorDims(filter), filter_offset,
-        GetTensorData<int32_t>(bias), GetTensorDims(bias), params->stride_width,
-        params->stride_height, data->padding.width, data->padding.height,
-        output_offset, data->output_multiplier, data->output_shift,
-        data->output_activation_min, data->output_activation_max,
-        GetTensorData<uint8_t>(output), GetTensorDims(output),
-        GetTensorData<uint8_t>(im2col), GetTensorDims(im2col), gemm_context);
-  } else {
-    optimized_ops::Conv(
-        GetTensorData<uint8_t>(input), GetTensorDims(input), input_offset,
-        GetTensorData<uint8_t>(filter), GetTensorDims(filter), filter_offset,
-        GetTensorData<int32_t>(bias), GetTensorDims(bias), params->stride_width,
-        params->stride_height, data->padding.width, data->padding.height,
-        output_offset, data->output_multiplier, data->output_shift,
-        data->output_activation_min, data->output_activation_max,
-        GetTensorData<uint8_t>(output), GetTensorDims(output),
-        GetTensorData<uint8_t>(im2col), GetTensorDims(im2col), gemm_context);
+  switch (kernel_type) {
+    case kReference:
+      reference_ops::Conv(
+          GetTensorData<uint8_t>(input), GetTensorDims(input), input_offset,
+          GetTensorData<uint8_t>(filter), GetTensorDims(filter), filter_offset,
+          GetTensorData<int32_t>(bias), GetTensorDims(bias),
+          params->stride_width, params->stride_height, data->padding.width,
+          data->padding.height, output_offset, data->output_multiplier,
+          data->output_shift, data->output_activation_min,
+          data->output_activation_max, GetTensorData<uint8_t>(output),
+          GetTensorDims(output), GetTensorData<uint8_t>(im2col),
+          GetTensorDims(im2col), gemm_context);
+      break;
+    case kGenericOptimized:
+    case kMultithreadOptimized:
+      // There is only one optimized implementation for Quantized Conv.
+      optimized_ops::Conv(
+          GetTensorData<uint8_t>(input), GetTensorDims(input), input_offset,
+          GetTensorData<uint8_t>(filter), GetTensorDims(filter), filter_offset,
+          GetTensorData<int32_t>(bias), GetTensorDims(bias),
+          params->stride_width, params->stride_height, data->padding.width,
+          data->padding.height, output_offset, data->output_multiplier,
+          data->output_shift, data->output_activation_min,
+          data->output_activation_max, GetTensorData<uint8_t>(output),
+          GetTensorDims(output), GetTensorData<uint8_t>(im2col),
+          GetTensorDims(im2col), gemm_context);
+      break;
   }
 }
 
@@ -322,31 +329,46 @@ void EvalFloat(TfLiteContext* context, TfLiteNode* node,
   CalculateActivationRangeFloat(params->activation, &output_activation_min,
                                 &output_activation_max);
 
-  if (kernel_type == kReference) {
-    reference_ops::Conv(GetTensorData<float>(input), GetTensorDims(input),
-                        GetTensorData<float>(filter), GetTensorDims(filter),
-                        GetTensorData<float>(bias), GetTensorDims(bias),
-                        params->stride_width, params->stride_height,
-                        data->padding.width, data->padding.height,
-                        output_activation_min, output_activation_max,
-                        GetTensorData<float>(output), GetTensorDims(output),
-                        GetTensorData<float>(im2col), GetTensorDims(im2col));
-  } else {
-    const float* filter_data;
-    if (data->need_hwcn_weights) {
-      filter_data = GetTensorData<float>(hwcn_weights);
-    } else {
-      filter_data = GetTensorData<float>(filter);
+  switch (kernel_type) {
+    case kReference: {
+      reference_ops::Conv(GetTensorData<float>(input), GetTensorDims(input),
+                          GetTensorData<float>(filter), GetTensorDims(filter),
+                          GetTensorData<float>(bias), GetTensorDims(bias),
+                          params->stride_width, params->stride_height,
+                          data->padding.width, data->padding.height,
+                          output_activation_min, output_activation_max,
+                          GetTensorData<float>(output), GetTensorDims(output),
+                          GetTensorData<float>(im2col), GetTensorDims(im2col));
+      break;
+    }
+    case kGenericOptimized: {
+      optimized_ops::Conv(GetTensorData<float>(input), GetTensorDims(input),
+                          GetTensorData<float>(filter), GetTensorDims(filter),
+                          GetTensorData<float>(bias), GetTensorDims(bias),
+                          params->stride_width, params->stride_height,
+                          data->padding.width, data->padding.height,
+                          output_activation_min, output_activation_max,
+                          GetTensorData<float>(output), GetTensorDims(output),
+                          GetTensorData<float>(im2col), GetTensorDims(im2col));
+      break;
+    }
+    case kMultithreadOptimized: {
+      const float* filter_data;
+      if (data->need_hwcn_weights) {
+        filter_data = GetTensorData<float>(hwcn_weights);
+      } else {
+        filter_data = GetTensorData<float>(filter);
+      }
+      multithreaded_ops::Conv(
+          GetTensorData<float>(input), GetTensorDims(input), filter_data,
+          GetTensorDims(filter), GetTensorData<float>(bias),
+          GetTensorDims(bias), params->stride_width, params->stride_height,
+          data->padding.width, data->padding.height, params->padding,
+          output_activation_min, output_activation_max,
+          GetTensorData<float>(output), GetTensorDims(output),
+          GetTensorData<float>(im2col), GetTensorDims(im2col));
+      break;
     }
-
-    multithreaded_ops::Conv(
-        GetTensorData<float>(input), GetTensorDims(input), filter_data,
-        GetTensorDims(filter), GetTensorData<float>(bias), GetTensorDims(bias),
-        params->stride_width, params->stride_height, data->padding.width,
-        data->padding.height, params->padding, output_activation_min,
-        output_activation_max, GetTensorData<float>(output),
-        GetTensorDims(output), GetTensorData<float>(im2col),
-        GetTensorDims(im2col));
   }
 }
 
@@ -407,18 +429,14 @@ TfLiteRegistration* Register_CONVOLUTION_GENERIC_OPT() {
   return &r;
 }
 
-TfLiteRegistration* Register_CONVOLUTION_NEON_OPT() {
+TfLiteRegistration* Register_CONVOLUTION_MULTITHREADED_OPT() {
   static TfLiteRegistration r = {conv::Init, conv::Free, conv::Prepare,
-                                 conv::Eval<conv::kNeonOptimized>};
+                                 conv::Eval<conv::kMultithreadOptimized>};
   return &r;
 }
 
 TfLiteRegistration* Register_CONV_2D() {
-#ifdef USE_NEON
-  return Register_CONVOLUTION_NEON_OPT();
-#else
-  return Register_CONVOLUTION_GENERIC_OPT();
-#endif
+  return Register_CONVOLUTION_MULTITHREADED_OPT();
 }
 
 }  // namespace builtin
diff --git a/tensorflow/contrib/lite/kernels/conv_test.cc b/tensorflow/contrib/lite/kernels/conv_test.cc
index 1d0a81c3135625c07a3566f5f9a8e5401f0d4db7..7550f7cc0db1ed762e8ccfe219a488c463cc1f99 100644
--- a/tensorflow/contrib/lite/kernels/conv_test.cc
+++ b/tensorflow/contrib/lite/kernels/conv_test.cc
@@ -15,12 +15,24 @@ limitations under the License.
 #include <cstdarg>
 
 #include <gtest/gtest.h>
+#include "absl/memory/memory.h"
 #include "tensorflow/contrib/lite/interpreter.h"
 #include "tensorflow/contrib/lite/kernels/register.h"
 #include "tensorflow/contrib/lite/kernels/test_util.h"
 #include "tensorflow/contrib/lite/model.h"
 
 namespace tflite {
+
+namespace ops {
+namespace builtin {
+
+TfLiteRegistration* Register_CONVOLUTION_REF();
+TfLiteRegistration* Register_CONVOLUTION_GENERIC_OPT();
+TfLiteRegistration* Register_CONVOLUTION_MULTITHREADED_OPT();
+
+}  // namespace builtin
+}  // namespace ops
+
 namespace {
 
 using ::testing::ElementsAreArray;
@@ -30,9 +42,9 @@ class BaseConvolutionOpModel : public SingleOpModel {
   // TODO(ahentz): Also test different activation types, bias, padding types,
   // stride values.
   BaseConvolutionOpModel(
-      const TensorData& input, const TensorData& filter,
-      const TensorData& output, int stride_width = 2, int stride_height = 2,
-      enum Padding padding = Padding_VALID,
+      TfLiteRegistration* registration, const TensorData& input,
+      const TensorData& filter, const TensorData& output, int stride_width = 2,
+      int stride_height = 2, enum Padding padding = Padding_VALID,
       enum ActivationFunctionType activation = ActivationFunctionType_NONE) {
     input_ = AddInput(input);
     filter_ = AddInput(filter);
@@ -62,6 +74,8 @@ class BaseConvolutionOpModel : public SingleOpModel {
                                      stride_height, activation)
                      .Union());
 
+    resolver_ = absl::make_unique<SingleOpResolver>(BuiltinOperator_CONV_2D,
+                                                    registration);
     BuildInterpreter({GetShape(input_), GetShape(filter_), GetShape(bias_)});
   }
 
@@ -83,12 +97,25 @@ class ConvolutionOpModel : public BaseConvolutionOpModel {
   void SetInput(std::initializer_list<float> data) {
     PopulateTensor(input_, data);
   }
-
   std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
 };
 
-TEST(ConvolutionOpTest, SimpleTestFloat32) {
-  ConvolutionOpModel m({TensorType_FLOAT32, {2, 2, 4, 1}},
+const auto kKernelMap = new std::map<string, TfLiteRegistration*>({
+    {"Reference", ops::builtin::Register_CONVOLUTION_REF()},
+    {"GenericOptimized", ops::builtin::Register_CONVOLUTION_GENERIC_OPT()},
+    {"MultithreadedOptimized",
+     ops::builtin::Register_CONVOLUTION_MULTITHREADED_OPT()},
+});
+
+class ConvolutionOpTest : public SingleOpTest {
+ protected:
+  const std::map<string, TfLiteRegistration*>& GetKernelMap() override {
+    return *kKernelMap;
+  }
+};
+
+TEST_P(ConvolutionOpTest, SimpleTestFloat32) {
+  ConvolutionOpModel m(GetRegistration(), {TensorType_FLOAT32, {2, 2, 4, 1}},
                        {TensorType_FLOAT32, {3, 2, 2, 1}},
                        {TensorType_FLOAT32, {}});
 
@@ -117,8 +144,8 @@ TEST(ConvolutionOpTest, SimpleTestFloat32) {
                              }));
 }
 
-TEST(ConvolutionOpTest, SimpleTestFloat32WithAnisotropicStrides) {
-  ConvolutionOpModel m({TensorType_FLOAT32, {1, 3, 6, 1}},
+TEST_P(ConvolutionOpTest, SimpleTestFloat32WithAnisotropicStrides) {
+  ConvolutionOpModel m(GetRegistration(), {TensorType_FLOAT32, {1, 3, 6, 1}},
                        {TensorType_FLOAT32, {1, 2, 2, 1}},
                        {TensorType_FLOAT32, {}},
                        /*stride_width=*/3, /*stride_height=*/1);
@@ -139,7 +166,7 @@ TEST(ConvolutionOpTest, SimpleTestFloat32WithAnisotropicStrides) {
                              }));
 }
 
-TEST(ConvolutionOpTest, HandCalculatedFloat32) {
+TEST_P(ConvolutionOpTest, HandCalculatedFloat32) {
   const int depth = 1;
   const int image_width = 4;
   const int image_height = 3;
@@ -150,6 +177,7 @@ TEST(ConvolutionOpTest, HandCalculatedFloat32) {
   const int stride_height = 1;
   const Padding padding = Padding_SAME;
   ConvolutionOpModel m(
+      GetRegistration(),
       {TensorType_FLOAT32,
        {image_batch_count, image_height, image_width, depth}},
       {TensorType_FLOAT32, {depth, filter_size, filter_size, filter_count}},
@@ -192,7 +220,7 @@ TEST(ConvolutionOpTest, HandCalculatedFloat32) {
                                                178, 187, 234, 261, 121}));
 }
 
-TEST(ConvolutionOpTest, HandCalculatedWithBiasFloat32) {
+TEST_P(ConvolutionOpTest, HandCalculatedWithBiasFloat32) {
   const int depth = 1;
   const int image_width = 4;
   const int image_height = 3;
@@ -203,6 +231,7 @@ TEST(ConvolutionOpTest, HandCalculatedWithBiasFloat32) {
   const int stride_height = 1;
   const Padding padding = Padding_SAME;
   ConvolutionOpModel m(
+      GetRegistration(),
       {TensorType_FLOAT32,
        {image_batch_count, image_height, image_width, depth}},
       {TensorType_FLOAT32, {depth, filter_size, filter_size, filter_count}},
@@ -245,7 +274,7 @@ TEST(ConvolutionOpTest, HandCalculatedWithBiasFloat32) {
                                                367, 188, 197, 244, 271, 131}));
 }
 
-TEST(ConvolutionOpTest, HandCalculatedWithReluFloat32) {
+TEST_P(ConvolutionOpTest, HandCalculatedWithReluFloat32) {
   const int depth = 1;
   const int image_width = 4;
   const int image_height = 3;
@@ -256,6 +285,7 @@ TEST(ConvolutionOpTest, HandCalculatedWithReluFloat32) {
   const int stride_height = 1;
   const Padding padding = Padding_SAME;
   ConvolutionOpModel m(
+      GetRegistration(),
       {TensorType_FLOAT32,
        {image_batch_count, image_height, image_width, depth}},
       {TensorType_FLOAT32, {depth, filter_size, filter_size, filter_count}},
@@ -300,7 +330,7 @@ TEST(ConvolutionOpTest, HandCalculatedWithReluFloat32) {
               ElementsAreArray({0, 0, 0, 0, 35, 112, 157, 0, 0, 34, 61, 0}));
 }
 
-TEST(ConvolutionOpTest, HandCalculatedValidFloat32) {
+TEST_P(ConvolutionOpTest, HandCalculatedValidFloat32) {
   const int depth = 1;
   const int image_width = 4;
   const int image_height = 3;
@@ -311,6 +341,7 @@ TEST(ConvolutionOpTest, HandCalculatedValidFloat32) {
   const int stride_height = 1;
   const Padding padding = Padding_VALID;
   ConvolutionOpModel m(
+      GetRegistration(),
       {TensorType_FLOAT32,
        {image_batch_count, image_height, image_width, depth}},
       {TensorType_FLOAT32, {depth, filter_size, filter_size, filter_count}},
@@ -366,8 +397,9 @@ class QuantizedConvolutionOpModel : public BaseConvolutionOpModel {
 
 // In this tests we set the input and output scales so that the results
 // match exactly the 'non-quantized' version.
-TEST(ConvolutionOpTest, SimpleTestQuantized) {
-  QuantizedConvolutionOpModel m({TensorType_UINT8, {2, 2, 4, 1}, -63.5, 64},
+TEST_P(ConvolutionOpTest, SimpleTestQuantized) {
+  QuantizedConvolutionOpModel m(GetRegistration(),
+                                {TensorType_UINT8, {2, 2, 4, 1}, -63.5, 64},
                                 {TensorType_UINT8, {3, 2, 2, 1}, -63.5, 64},
                                 {TensorType_UINT8, {}, -127, 128});
   m.SetInput({
@@ -405,8 +437,9 @@ TEST(ConvolutionOpTest, SimpleTestQuantized) {
                              }));
 }
 
-TEST(ConvolutionOpTest, SimpleTestQuantizedWithAnisotropicStrides) {
-  QuantizedConvolutionOpModel m({TensorType_UINT8, {1, 3, 6, 1}, -63.5, 64},
+TEST_P(ConvolutionOpTest, SimpleTestQuantizedWithAnisotropicStrides) {
+  QuantizedConvolutionOpModel m(GetRegistration(),
+                                {TensorType_UINT8, {1, 3, 6, 1}, -63.5, 64},
                                 {TensorType_UINT8, {1, 2, 2, 1}, -63.5, 64},
                                 {TensorType_UINT8, {}, -127, 128},
                                 /*stride_width=*/3, /*stride_height=*/1);
@@ -430,6 +463,11 @@ TEST(ConvolutionOpTest, SimpleTestQuantizedWithAnisotropicStrides) {
                                  167, 93,   //
                              }));
 }
+
+INSTANTIATE_TEST_CASE_P(
+    ConvolutionOpTest, ConvolutionOpTest,
+    ::testing::ValuesIn(SingleOpTest::GetKernelTags(*kKernelMap)));
+
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/contrib/lite/kernels/embedding_lookup_sparse_test.cc b/tensorflow/contrib/lite/kernels/embedding_lookup_sparse_test.cc
index dcdc5fffad9ceac1a9d23a4e91637a9ff92a8dda..ef2b5422253ea880a9ded4d3c0efc5cec07178a9 100644
--- a/tensorflow/contrib/lite/kernels/embedding_lookup_sparse_test.cc
+++ b/tensorflow/contrib/lite/kernels/embedding_lookup_sparse_test.cc
@@ -123,18 +123,16 @@ TEST(EmbeddingLookupOpTest, SimpleTestSqrtn) {
       [](int i, int j, int k) { return i + j / 10.0f + k / 100.0f; });
   m.Invoke();
 
-  EXPECT_THAT(
-      m.GetOutput(),
-      ElementsAreArray(ArrayFloatNear({
-          1.00, 1.01, 1.10, 1.11, 1.20, 1.21,  // Row 1
-          0.00, 0.00, 0.00, 0.00, 0.00, 0.00,  // -
-          6.00f / std::sqrt(20.0f), 6.06f / std::sqrt(20.0f),
-          6.60f / std::sqrt(20.0f), 6.66f / std::sqrt(20.0f),
-          7.20f / std::sqrt(20.0f),
-          7.26f /
-              std::sqrt(
-                  20.0f),  // 2 * Row 3 + 4 * Row 0,  // 2 * Row 3 + 4 * Row 0
-      })));
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray(ArrayFloatNear({
+                  1.00, 1.01, 1.10, 1.11, 1.20, 1.21,  // Row 1
+                  0.00, 0.00, 0.00, 0.00, 0.00, 0.00,  // -
+                  6.00f / std::sqrt(20.0f), 6.06f / std::sqrt(20.0f),
+                  6.60f / std::sqrt(20.0f), 6.66f / std::sqrt(20.0f),
+                  7.20f / std::sqrt(20.0f),
+                  7.26f / std::sqrt(20.0f),  // 2 * Row 3 + 4 * Row 0,  // 2 *
+                                             // Row 3 + 4 * Row 0
+              })));
 }
 
 TEST(EmbeddingLookupOpTest, Indices3DTest) {
diff --git a/tensorflow/contrib/lite/kernels/gather_test.cc b/tensorflow/contrib/lite/kernels/gather_test.cc
index 658d977b8dc7fffcdde69d74ba2564dfa1b5709e..cdadbeda1884ba0186846826dd16be6ff69878d9 100644
--- a/tensorflow/contrib/lite/kernels/gather_test.cc
+++ b/tensorflow/contrib/lite/kernels/gather_test.cc
@@ -81,10 +81,8 @@ TEST(GatherOpTest, Test0DIndex) {
   m.SetInputFloat({-2.0, 0.2, 0.7, 0.8});
   m.SetPositions({1});
   m.Invoke();
-  EXPECT_THAT(m.GetOutputFloat(),
-              ElementsAreArray(ArrayFloatNear({0.7, 0.8})));
-  EXPECT_THAT(m.GetOutputShape(),
-              ElementsAreArray({2}));
+  EXPECT_THAT(m.GetOutputFloat(), ElementsAreArray(ArrayFloatNear({0.7, 0.8})));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2}));
 }
 
 TEST(GatherOpTest, Test0DIndexWith0DResult) {
@@ -94,8 +92,7 @@ TEST(GatherOpTest, Test0DIndexWith0DResult) {
   m.SetInputFloat({1.0, 2.0, 3.0});
   m.SetPositions({1});
   m.Invoke();
-  EXPECT_THAT(m.GetOutputFloat(),
-              ElementsAreArray(ArrayFloatNear({2.0})));
+  EXPECT_THAT(m.GetOutputFloat(), ElementsAreArray(ArrayFloatNear({2.0})));
   EXPECT_TRUE(m.GetOutputShape().empty());
 }
 
diff --git a/tensorflow/contrib/lite/kernels/hashtable_lookup_test.cc b/tensorflow/contrib/lite/kernels/hashtable_lookup_test.cc
index cb6038f9009a3865661e7b4f075c3033166d0f91..ba0ed5ce06392613238b757308dddc2b22e7eb30 100644
--- a/tensorflow/contrib/lite/kernels/hashtable_lookup_test.cc
+++ b/tensorflow/contrib/lite/kernels/hashtable_lookup_test.cc
@@ -116,7 +116,10 @@ TEST(HashtableLookupOpTest, Test2DInput) {
                                  1.0, 1.1,  // 1-st item
                              })));
   EXPECT_THAT(m.GetHit(), ElementsAreArray({
-                              1, 0, 1, 1,
+                              1,
+                              0,
+                              1,
+                              1,
                           }));
 }
 
diff --git a/tensorflow/contrib/lite/kernels/internal/BUILD b/tensorflow/contrib/lite/kernels/internal/BUILD
index 21118fc96d804654a33d5c693d496b05e2dc59d2..38b032c6de7987ff5e3da3ba5fcf4e9fc8574c44 100644
--- a/tensorflow/contrib/lite/kernels/internal/BUILD
+++ b/tensorflow/contrib/lite/kernels/internal/BUILD
@@ -267,6 +267,8 @@ cc_library(
         "optimized/neon_tensor_utils.cc",
     ],
     hdrs = [
+        "common.h",
+        "optimized/cpu_check.h",
         "optimized/neon_tensor_utils.h",
         "optimized/tensor_utils_impl.h",
     ],
@@ -274,8 +276,11 @@ cc_library(
     deps = [
         ":cpu_check",
         ":portable_tensor_utils",
+        ":types",
         "//tensorflow/contrib/lite:builtin_op_data",
         "//tensorflow/contrib/lite/kernels:activation_functor",
+        "@arm_neon_2_x86_sse",
+        "@gemmlowp//:gemmlowp",
     ],
 )
 
@@ -285,14 +290,21 @@ cc_library(
         "tensor_utils.cc",
     ],
     hdrs = [
+        "common.h",
+        "compatibility.h",
+        "optimized/cpu_check.h",
+        "optimized/neon_tensor_utils.h",
         "optimized/tensor_utils_impl.h",
         "reference/portable_tensor_utils.h",
         "tensor_utils.h",
+        "types.h",
     ],
     copts = NEON_FLAGS_IF_APPLICABLE,
     deps = [
         "//tensorflow/contrib/lite/kernels:activation_functor",
         "//tensorflow/contrib/lite:builtin_op_data",
+        "@arm_neon_2_x86_sse",
+        "@gemmlowp//:gemmlowp",
     ] + select({
         ":arm": [
             ":neon_tensor_utils",
@@ -312,6 +324,15 @@ cc_library(
         ":ios_arm64": [
             ":neon_tensor_utils",
         ],
+        ":x86_64": [
+            ":neon_tensor_utils",
+        ],
+        ":x86": [
+            ":neon_tensor_utils",
+        ],
+        ":darwin": [
+            ":neon_tensor_utils",
+        ],
         "//conditions:default": [
             ":portable_tensor_utils",
         ],
diff --git a/tensorflow/contrib/lite/kernels/internal/compatibility.h b/tensorflow/contrib/lite/kernels/internal/compatibility.h
index 1d963afb7e1ce414f251f090208923ca0c68cee1..51426bb1c584b82af7b1a2ffaf5a675a1dd9a6fd 100644
--- a/tensorflow/contrib/lite/kernels/internal/compatibility.h
+++ b/tensorflow/contrib/lite/kernels/internal/compatibility.h
@@ -27,6 +27,10 @@ limitations under the License.
 #define TFLITE_DCHECK_EQ(x, y) ((x) == (y)) ? (void)0 : assert(false)
 #endif
 
+#ifndef TFLITE_DCHECK_NE
+#define TFLITE_DCHECK_NE(x, y) ((x) != (y)) ? (void)0 : assert(false)
+#endif
+
 #ifndef TFLITE_DCHECK_GE
 #define TFLITE_DCHECK_GE(x, y) ((x) >= (y)) ? (void)0 : assert(false)
 #endif
@@ -52,6 +56,10 @@ limitations under the License.
 #define TFLITE_CHECK_EQ(x, y) ((x) == (y)) ? (void)0 : abort()
 #endif
 
+#ifndef TFLITE_CHECK_NE
+#define TFLITE_CHECK_NE(x, y) ((x) != (y)) ? (void)0 : abort()
+#endif
+
 #ifndef TFLITE_CHECK_GE
 #define TFLITE_CHECK_GE(x, y) ((x) >= (y)) ? (void)0 : abort()
 #endif
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/cpu_check.h b/tensorflow/contrib/lite/kernels/internal/optimized/cpu_check.h
index dea46cc12065ed34cf681916a46a55bd7a86f463..e0eca2e736be00ff09737325f06b0035e77e3103 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/cpu_check.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/cpu_check.h
@@ -34,17 +34,13 @@ inline bool TestCPUFeatureNeon() {
 #endif  // __aarch64__
 }
 
-#elif __ARM_NEON
+#elif defined USE_NEON || defined __ARM_NEON 
 
-inline bool TestCPUFeatureNeon() {
-  return true;
-}
+inline bool TestCPUFeatureNeon() { return true; }
 
 #else
 
-inline bool TestCPUFeatureNeon() {
-  return false;
-}
+inline bool TestCPUFeatureNeon() { return false; }
 
 #endif
 
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_float.h b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_float.h
index 81796e295d9c7ae1f04163467c8b2af851b632c2..e2c87df80bd927d823b150ed3799641796dfb4c7 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_float.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_float.h
@@ -992,11 +992,11 @@ inline void DepthwiseConv(const float* input_data, const Dims<4>& input_dims,
           for (int k = 0; k < 4; k++) {
             acc[k] = vld1q_f32(acc_buffer + i + 4 * k);
           }
-            for (int k = 0; k < 4; k++) {
-              acc[k] = vmaxq_f32(
-                  vdupq_n_f32(output_activation_min),
-                  vminq_f32(vdupq_n_f32(output_activation_max), acc[k]));
-            }
+          for (int k = 0; k < 4; k++) {
+            acc[k] = vmaxq_f32(
+                vdupq_n_f32(output_activation_min),
+                vminq_f32(vdupq_n_f32(output_activation_max), acc[k]));
+          }
           for (int k = 0; k < 4; k++) {
             vst1q_f32(output_ptr + 4 * k, acc[k]);
           }
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h
index f993fd6a00f054c670b247e886a1d9d2a34643e7..fc5897896477711c46b06f10003acb10783d12af 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h
@@ -1504,7 +1504,7 @@ inline void QuantizedDepthwiseConvAccumRowGeneric(
       << "*\n"
       << "* If you would like to carry on with the slow code, compile\n"
       << "* with this preprocessor token defined:\n"
-      << "* TFLITE_ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK.\n"
+      << "* ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK.\n"
       << "*\n"
       << "* The right thing to do, if you care about performance, is to add\n"
       << "* a new DepthwiseConv kernel to tfmini to cover your case.\n"
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/eigen_spatial_convolutions.h b/tensorflow/contrib/lite/kernels/internal/optimized/eigen_spatial_convolutions.h
index f21fbf532ac01ced594715d0a0da9bd6e6f8d0e2..ce3cde76999c77e1f9bf1eaccdba7e84ed508dda 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/eigen_spatial_convolutions.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/eigen_spatial_convolutions.h
@@ -39,7 +39,6 @@ limitations under the License.
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #endif
 
-
 namespace Eigen {
 
 /** SpatialConvolution
@@ -215,13 +214,12 @@ EIGEN_DEVICE_FUNC
   }
   // TODO(yangke): choose() is defined in TensorContraction.h -- consider
   // moving it to somewhere more "common".
-  return
-      input
-          .extract_image_patches(kernelRows, kernelCols, row_stride, col_stride,
-                                 row_in_stride, col_in_stride, padding_type)
-          .reshape(pre_contract_dims)
-          .contract(kernel.reshape(kernel_dims), contract_dims)
-          .reshape(post_contract_dims);
+  return input
+      .extract_image_patches(kernelRows, kernelCols, row_stride, col_stride,
+                             row_in_stride, col_in_stride, padding_type)
+      .reshape(pre_contract_dims)
+      .contract(kernel.reshape(kernel_dims), contract_dims)
+      .reshape(post_contract_dims);
 }
 
 }  // end namespace Eigen
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.cc b/tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.cc
index bf0bdfb1fb875c4b54c55e25d4a17541507ecd4c..ea8502ae33bc2ee5965e3be30a7d9ee36604abe3 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.cc
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.cc
@@ -15,12 +15,12 @@ limitations under the License.
 #include <string.h>
 
 #include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/kernels/internal/common.h"
 #include "tensorflow/contrib/lite/kernels/activation_functor.h"
 #include "tensorflow/contrib/lite/kernels/internal/optimized/tensor_utils_impl.h"
 
 #ifdef USE_NEON
 
-#include <arm_neon.h>
 #define kFloatWeightsPerNeonLane 4
 
 namespace tflite {
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
index c35b9da938e20ca4b32eb12c9b6863dc338c889c..8163c76cfd2eb9b320fe65e54c6b88f3d694a598 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
@@ -1538,9 +1538,10 @@ void Add(const int32* input1_data, const Dims<4>& input1_dims,
 // reference_ops.h. Once an optimized version is implemented and NdArrayDesc<T>
 // is no longer referenced in this file, move NdArrayDesc<T> from types.h to
 // reference_ops.h.
-template <FusedActivationFunctionType Ac, typename T>
+template <typename T>
 void BroadcastAdd(const T* input1_data, const Dims<4>& input1_dims,
                   const T* input2_data, const Dims<4>& input2_dims,
+                  T output_activation_min, T output_activation_max,
                   T* output_data, const Dims<4>& output_dims) {
   gemmlowp::ScopedProfilingLabel label("BroadcastAdd");
 
@@ -1563,15 +1564,30 @@ void BroadcastAdd(const T* input1_data, const Dims<4>& input1_dims,
     for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
       for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
         for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
-          output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>(
-              input1_data[SubscriptToIndex(desc1, c, x, y, b)] +
-              input2_data[SubscriptToIndex(desc2, c, x, y, b)]);
+          output_data[Offset(output_dims, c, x, y, b)] =
+              ActivationFunctionWithMinMax(
+                  input1_data[SubscriptToIndex(desc1, c, x, y, b)] +
+                      input2_data[SubscriptToIndex(desc2, c, x, y, b)],
+                  output_activation_min, output_activation_max);
         }
       }
     }
   }
 }
 
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac, typename T>
+void BroadcastAdd(const T* input1_data, const Dims<4>& input1_dims,
+                  const T* input2_data, const Dims<4>& input2_dims,
+                  T* output_data, const Dims<4>& output_dims) {
+  T output_activation_min, output_activation_max;
+  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
+
+  BroadcastAdd(input1_data, input1_dims, input2_data, input2_dims,
+               output_activation_min, output_activation_max, output_data,
+               output_dims);
+}
+
 inline void BroadcastAdd(int left_shift, const uint8* input1_data,
                          const Dims<4>& input1_dims, int32 input1_offset,
                          int32 input1_multiplier, int input1_shift,
@@ -1772,9 +1788,10 @@ void Mul(const int32* input1_data, const Dims<4>& input1_dims,
 // reference_ops.h. Once an optimized version is implemented and NdArrayDesc<T>
 // is no longer referenced in this file, move NdArrayDesc<T> from types.h to
 // reference_ops.h.
-template <FusedActivationFunctionType Ac, typename T>
+template <typename T>
 void BroadcastMul(const T* input1_data, const Dims<4>& input1_dims,
                   const T* input2_data, const Dims<4>& input2_dims,
+                  T output_activation_min, T output_activation_max,
                   T* output_data, const Dims<4>& output_dims) {
   gemmlowp::ScopedProfilingLabel label("BroadcastMul");
 
@@ -1797,15 +1814,30 @@ void BroadcastMul(const T* input1_data, const Dims<4>& input1_dims,
     for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
       for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
         for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
-          output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>(
-              input1_data[SubscriptToIndex(desc1, c, x, y, b)] *
-              input2_data[SubscriptToIndex(desc2, c, x, y, b)]);
+          output_data[Offset(output_dims, c, x, y, b)] =
+              ActivationFunctionWithMinMax(
+                  input1_data[SubscriptToIndex(desc1, c, x, y, b)] *
+                      input2_data[SubscriptToIndex(desc2, c, x, y, b)],
+                  output_activation_min, output_activation_max);
         }
       }
     }
   }
 }
 
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac, typename T>
+void BroadcastMul(const T* input1_data, const Dims<4>& input1_dims,
+                  const T* input2_data, const Dims<4>& input2_dims,
+                  T* output_data, const Dims<4>& output_dims) {
+  T output_activation_min, output_activation_max;
+  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
+
+  BroadcastMul(input1_data, input1_dims, input2_data, input2_dims,
+               output_activation_min, output_activation_max, output_data,
+               output_dims);
+}
+
 inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims,
                          int32 input1_offset, const uint8* input2_data,
                          const Dims<4>& input2_dims, int32 input2_offset,
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
index 64651d83486105647b3fc3b9f1a3d33f55830601..4bcf4993e92136388b3febe7e04030943f4b54f3 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
@@ -889,10 +889,11 @@ inline void Add(int left_shift, const uint8* input1_data,
 // dimensionality if the runtime code does a single loop over one dimension
 // that handles broadcasting as the base case. The code generator would then
 // generate max(D1, D2) nested for loops.
-template <FusedActivationFunctionType Ac>
-void BroadcastAdd(const float* input1_data, const Dims<4>& input1_dims,
-                  const float* input2_data, const Dims<4>& input2_dims,
-                  float* output_data, const Dims<4>& output_dims) {
+template <typename T>
+void BroadcastAdd(const T* input1_data, const Dims<4>& input1_dims,
+                  const T* input2_data, const Dims<4>& input2_dims,
+                  T output_activation_min, T output_activation_max,
+                  T* output_data, const Dims<4>& output_dims) {
   gemmlowp::ScopedProfilingLabel label("BroadcastAdd");
 
   NdArrayDesc<4> desc1;
@@ -914,15 +915,30 @@ void BroadcastAdd(const float* input1_data, const Dims<4>& input1_dims,
     for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
       for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
         for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
-          output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>(
-              input1_data[SubscriptToIndex(desc1, c, x, y, b)] +
-              input2_data[SubscriptToIndex(desc2, c, x, y, b)]);
+          output_data[Offset(output_dims, c, x, y, b)] =
+              ActivationFunctionWithMinMax(
+                  input1_data[SubscriptToIndex(desc1, c, x, y, b)] +
+                      input2_data[SubscriptToIndex(desc2, c, x, y, b)],
+                  output_activation_min, output_activation_max);
         }
       }
     }
   }
 }
 
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac, typename T>
+void BroadcastAdd(const T* input1_data, const Dims<4>& input1_dims,
+                  const T* input2_data, const Dims<4>& input2_dims,
+                  T* output_data, const Dims<4>& output_dims) {
+  T output_activation_min, output_activation_max;
+  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
+
+  BroadcastAdd(input1_data, input1_dims, input2_data, input2_dims,
+               output_activation_min, output_activation_max, output_data,
+               output_dims);
+}
+
 inline void BroadcastAdd(int left_shift, const uint8* input1_data,
                          const Dims<4>& input1_dims, int32 input1_offset,
                          int32 input1_multiplier, int input1_shift,
@@ -1053,10 +1069,11 @@ void Mul(const float* input1_data, const Dims<4>& input1_dims,
 // dimensionality if the runtime code does a single loop over one dimension
 // that handles broadcasting as the base case. The code generator would then
 // generate max(D1, D2) nested for loops.
-template <FusedActivationFunctionType Ac>
-void BroadcastMul(const float* input1_data, const Dims<4>& input1_dims,
-                  const float* input2_data, const Dims<4>& input2_dims,
-                  float* output_data, const Dims<4>& output_dims) {
+template <typename T>
+void BroadcastMul(const T* input1_data, const Dims<4>& input1_dims,
+                  const T* input2_data, const Dims<4>& input2_dims,
+                  T output_activation_min, T output_activation_max,
+                  T* output_data, const Dims<4>& output_dims) {
   gemmlowp::ScopedProfilingLabel label("BroadcastMul");
 
   NdArrayDesc<4> desc1;
@@ -1078,15 +1095,30 @@ void BroadcastMul(const float* input1_data, const Dims<4>& input1_dims,
     for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
       for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
         for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
-          output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>(
-              input1_data[SubscriptToIndex(desc1, c, x, y, b)] *
-              input2_data[SubscriptToIndex(desc2, c, x, y, b)]);
+          output_data[Offset(output_dims, c, x, y, b)] =
+              ActivationFunctionWithMinMax(
+                  input1_data[SubscriptToIndex(desc1, c, x, y, b)] *
+                      input2_data[SubscriptToIndex(desc2, c, x, y, b)],
+                  output_activation_min, output_activation_max);
         }
       }
     }
   }
 }
 
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac, typename T>
+void BroadcastMul(const T* input1_data, const Dims<4>& input1_dims,
+                  const T* input2_data, const Dims<4>& input2_dims,
+                  T* output_data, const Dims<4>& output_dims) {
+  T output_activation_min, output_activation_max;
+  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
+
+  BroadcastMul(input1_data, input1_dims, input2_data, input2_dims,
+               output_activation_min, output_activation_max, output_data,
+               output_dims);
+}
+
 inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims,
                          int32 input1_offset, const uint8* input2_data,
                          const Dims<4>& input2_dims, int32 input2_offset,
@@ -2338,13 +2370,15 @@ inline int StartIndex(int start, int stride, int dim, bool masked) {
   return masked ? (stride > 0 ? 0 : dim - 1) : start;
 }
 
-inline int StopIndex(int stop, int stride, int dim, bool masked) {
-  return masked ? (stride > 0 ? dim : -1) : stop;
+inline int StopIndex(int start, int stop, int stride, int dim, bool masked,
+                     bool shrink_axis_masked) {
+  return shrink_axis_masked ? stride > 0 ? start + 1 : start - 1
+                            : masked ? (stride > 0 ? dim : -1) : stop;
 }
 
 template <typename T>
 inline void StridedSlice(const T* input_data, const Dims<4>& input_dims,
-                         int begin_mask, int end_mask,
+                         int begin_mask, int end_mask, int shrink_axis_mask,
                          const std::vector<int>& starts,
                          const std::vector<int>& stops,
                          const std::vector<int>& strides, T* output_data,
@@ -2355,19 +2389,23 @@ inline void StridedSlice(const T* input_data, const Dims<4>& input_dims,
   const int start_b =
       StartIndex(starts[3], strides[3], input_dims.sizes[3], begin_mask & 8);
   const int stop_b =
-      StopIndex(stops[3], strides[3], input_dims.sizes[3], end_mask & 8);
+      StopIndex(start_b, stops[3], strides[3], input_dims.sizes[3],
+                end_mask & 8, shrink_axis_mask & 8);
   const int start_h =
       StartIndex(starts[2], strides[2], input_dims.sizes[2], begin_mask & 4);
   const int stop_h =
-      StopIndex(stops[2], strides[2], input_dims.sizes[2], end_mask & 4);
+      StopIndex(start_h, stops[2], strides[2], input_dims.sizes[2],
+                end_mask & 4, shrink_axis_mask & 4);
   const int start_w =
       StartIndex(starts[1], strides[1], input_dims.sizes[1], begin_mask & 2);
   const int stop_w =
-      StopIndex(stops[1], strides[1], input_dims.sizes[1], end_mask & 2);
+      StopIndex(start_w, stops[1], strides[1], input_dims.sizes[1],
+                end_mask & 2, shrink_axis_mask & 2);
   const int start_d =
       StartIndex(starts[0], strides[0], input_dims.sizes[0], begin_mask & 1);
   const int stop_d =
-      StopIndex(stops[0], strides[0], input_dims.sizes[0], end_mask & 1);
+      StopIndex(start_d, stops[0], strides[0], input_dims.sizes[0],
+                end_mask & 1, shrink_axis_mask & 1);
 
   T* out_ptr = output_data;
   for (int in_b = start_b; LoopCondition(in_b, stop_b, strides[3]);
@@ -2385,6 +2423,18 @@ inline void StridedSlice(const T* input_data, const Dims<4>& input_dims,
   }
 }
 
+template <typename T>
+inline void StridedSlice(const T* input_data, const Dims<4>& input_dims,
+                         int begin_mask, int end_mask,
+                         const std::vector<int>& starts,
+                         const std::vector<int>& stops,
+                         const std::vector<int>& strides, T* output_data,
+                         const Dims<4>& output_dims) {
+  StridedSlice(input_data, input_dims, begin_mask, end_mask,
+               /*shrink_axis_mask=*/0, starts, stops, strides, output_data,
+               output_dims);
+}
+
 template <typename T>
 inline void Slice(const T* input_data, const Dims<4>& input_dims,
                   const std::vector<int>& begin, const std::vector<int>& size,
diff --git a/tensorflow/contrib/lite/kernels/internal/tensor_utils.cc b/tensorflow/contrib/lite/kernels/internal/tensor_utils.cc
index 904a97803a6a9ba369c1e64c711b12d19ffc10c4..f4181b18a8f46fd9bef4b81a210a6b8134a4e9d0 100644
--- a/tensorflow/contrib/lite/kernels/internal/tensor_utils.cc
+++ b/tensorflow/contrib/lite/kernels/internal/tensor_utils.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include "tensorflow/contrib/lite/kernels/internal/tensor_utils.h"
+#include "tensorflow/contrib/lite/kernels/internal/common.h"
 
 #ifndef USE_NEON
 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
diff --git a/tensorflow/contrib/lite/kernels/kernel_util.cc b/tensorflow/contrib/lite/kernels/kernel_util.cc
index b0546c00cf977af5f722a802866448b0cb293b8d..955e8c5764c6adad37a0009f4ddf8accb437b174 100644
--- a/tensorflow/contrib/lite/kernels/kernel_util.cc
+++ b/tensorflow/contrib/lite/kernels/kernel_util.cc
@@ -13,8 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include "tensorflow/contrib/lite/kernels/kernel_util.h"
+
 #include <algorithm>
 #include <cmath>
+#include <memory>
+
 #include "tensorflow/contrib/lite/kernels/internal/round.h"
 
 namespace tflite {
@@ -84,4 +87,27 @@ void CalculateActivationRangeFloat(TfLiteFusedActivation activation,
   }
 }
 
+bool HaveSameShapes(TfLiteTensor* input1, TfLiteTensor* input2) {
+  return TfLiteIntArrayEqual(input1->dims, input2->dims);
+}
+
+TfLiteStatus CalculateShapeForBroadcast(TfLiteContext* context,
+                                        TfLiteTensor* input1,
+                                        TfLiteTensor* input2,
+                                        TfLiteIntArray** output_shape) {
+  int64_t dims1 = NumDimensions(input1);
+  int64_t dims2 = NumDimensions(input2);
+  int64_t out_dims = std::max(dims1, dims2);
+  std::unique_ptr<TfLiteIntArray, void (*)(TfLiteIntArray*)> shape(
+      TfLiteIntArrayCreate(out_dims), TfLiteIntArrayFree);
+  for (int i = 0; i < out_dims; ++i) {
+    int64_t d1 = i >= dims1 ? 1 : SizeOfDimension(input1, dims1 - i - 1);
+    int64_t d2 = i >= dims2 ? 1 : SizeOfDimension(input2, dims2 - i - 1);
+    TF_LITE_ENSURE(context, d1 == d2 || d1 == 1 || d2 == 1);
+    shape->data[out_dims - i - 1] = std::max(d1, d2);
+  }
+  *output_shape = shape.release();
+  return kTfLiteOk;
+}
+
 }  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/kernel_util.h b/tensorflow/contrib/lite/kernels/kernel_util.h
index 1cf30ecff9760d218d279cc6c7132589e11cc15c..3cfa72615a95d6f215ef9d35f2572026ec90fad8 100644
--- a/tensorflow/contrib/lite/kernels/kernel_util.h
+++ b/tensorflow/contrib/lite/kernels/kernel_util.h
@@ -35,6 +35,14 @@ inline TfLiteTensor* GetOutput(TfLiteContext* context, TfLiteNode* node,
 inline int NumInputs(const TfLiteNode* node) { return node->inputs->size; }
 inline int NumOutputs(const TfLiteNode* node) { return node->outputs->size; }
 
+inline int64_t NumElements(const TfLiteTensor* t) {
+  int64_t count = 1;
+  for (int i = 0; i < NumDimensions(t); ++i) {
+    count *= SizeOfDimension(t, i);
+  }
+  return count;
+}
+
 inline TfLiteTensor* GetOptionalInputTensor(TfLiteContext* context,
                                             const TfLiteNode* node, int index) {
   const bool use_tensor = node->inputs->data[index] != kOptionalTensor;
@@ -44,6 +52,22 @@ inline TfLiteTensor* GetOptionalInputTensor(TfLiteContext* context,
   return nullptr;
 }
 
+// Determines whether tensor is constant.
+inline bool IsConstantTensor(TfLiteTensor* tensor) {
+  return tensor->allocation_type == kTfLiteMmapRo;
+}
+
+// Determines whether tensor is dynamic. Note that a tensor can be non-const and
+// not dynamic. This function specificially checks for a dynamic tensor.
+inline bool IsDynamicTensor(TfLiteTensor* tensor) {
+  return tensor->allocation_type == kTfLiteDynamic;
+}
+
+// Sets tensor to dynamic.
+inline void SetTensorToDynamic(TfLiteTensor* tensor) {
+  tensor->allocation_type = kTfLiteDynamic;
+}
+
 // Calculates the multiplication factor for a quantized convolution (or
 // quantized depthwise convolution) involving the given tensors. Returns an
 // error if the scales of the tensors are not compatible.
@@ -60,6 +84,15 @@ void CalculateActivationRangeFloat(TfLiteFusedActivation activation,
                                    float* activation_min,
                                    float* activation_max);
 
+// Return true if the given tensors have the same shape.
+bool HaveSameShapes(TfLiteTensor* input1, TfLiteTensor* input2);
+
+// Calculate the output_shape that is necessary for element-wise operations
+// with broadcasting involving the two input tensors.
+TfLiteStatus CalculateShapeForBroadcast(TfLiteContext* context,
+                                        TfLiteTensor* input1,
+                                        TfLiteTensor* input2,
+                                        TfLiteIntArray** output_shape);
 }  // namespace tflite
 
 #endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_KERNEL_UTIL_H_
diff --git a/tensorflow/contrib/lite/kernels/kernel_util_test.cc b/tensorflow/contrib/lite/kernels/kernel_util_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c65b68970f6853e17af3a70aad7a2bc982a1ee60
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/kernel_util_test.cc
@@ -0,0 +1,152 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/testing/util.h"
+
+namespace tflite {
+namespace {
+
+void ReportError(TfLiteContext* context, const char* format, ...) {}
+
+class KernelUtilTest : public ::testing::Test {
+ public:
+  KernelUtilTest() {
+    context_.ReportError = ReportError;
+
+    tensor1_.dims = nullptr;
+    tensor2_.dims = nullptr;
+    tensor1_.allocation_type = kTfLiteMmapRo;
+    tensor2_.allocation_type = kTfLiteMmapRo;
+  }
+  ~KernelUtilTest() {
+    TfLiteTensorFree(&tensor1_);
+    TfLiteTensorFree(&tensor2_);
+  }
+
+  void SetShape(TfLiteTensor* tensor, std::initializer_list<int> dims) {
+    TfLiteTensorFree(tensor);
+    tensor->dims = TfLiteIntArrayCreate(dims.size());
+    int i = 0;
+    for (int d : dims) {
+      tensor->dims->data[i] = d;
+      ++i;
+    }
+  }
+
+  std::vector<int> GetShape(TfLiteIntArray* dims) {
+    std::vector<int> result;
+    for (int i = 0; i < dims->size; ++i) {
+      result.push_back(dims->data[i]);
+    }
+    return result;
+  }
+
+ protected:
+  TfLiteContext context_;
+  TfLiteTensor tensor1_;
+  TfLiteTensor tensor2_;
+};
+
+TEST_F(KernelUtilTest, SameShapeEmpty) {
+  EXPECT_TRUE(HaveSameShapes(&tensor1_, &tensor2_));
+
+  SetShape(&tensor1_, {1, 2, 3});
+  EXPECT_FALSE(HaveSameShapes(&tensor1_, &tensor2_));
+
+  SetShape(&tensor2_, {1, 2});
+  EXPECT_FALSE(HaveSameShapes(&tensor1_, &tensor2_));
+
+  SetShape(&tensor2_, {1, 2, 3, 4});
+  EXPECT_FALSE(HaveSameShapes(&tensor1_, &tensor2_));
+
+  SetShape(&tensor2_, {1, 2, 3});
+  EXPECT_TRUE(HaveSameShapes(&tensor1_, &tensor2_));
+
+  SetShape(&tensor2_, {});
+  EXPECT_FALSE(HaveSameShapes(&tensor1_, &tensor2_));
+
+  SetShape(&tensor1_, {});
+  EXPECT_TRUE(HaveSameShapes(&tensor1_, &tensor2_));
+}
+
+TEST_F(KernelUtilTest, BroadcastShapeIncompatibleDim) {
+  TfLiteIntArray* output = nullptr;
+  SetShape(&tensor1_, {1, 2});
+  SetShape(&tensor2_, {1, 3});
+  EXPECT_NE(kTfLiteOk, CalculateShapeForBroadcast(&context_, &tensor1_,
+                                                  &tensor2_, &output));
+  EXPECT_EQ(output, nullptr);
+}
+
+TEST_F(KernelUtilTest, BroadcastShapeOnes) {
+  TfLiteIntArray* output = nullptr;
+  SetShape(&tensor1_, {1, 1});
+  SetShape(&tensor2_, {1, 3});
+  EXPECT_EQ(kTfLiteOk, CalculateShapeForBroadcast(&context_, &tensor1_,
+                                                  &tensor2_, &output));
+  TfLiteIntArrayFree(output);
+
+  SetShape(&tensor1_, {1, 2});
+  SetShape(&tensor2_, {1, 1});
+  EXPECT_EQ(kTfLiteOk, CalculateShapeForBroadcast(&context_, &tensor1_,
+                                                  &tensor2_, &output));
+  TfLiteIntArrayFree(output);
+}
+
+TEST_F(KernelUtilTest, BroadcastShapeScalars) {
+  TfLiteIntArray* output = nullptr;
+  SetShape(&tensor1_, {1, 2});
+  SetShape(&tensor2_, {});
+  EXPECT_EQ(kTfLiteOk, CalculateShapeForBroadcast(&context_, &tensor1_,
+                                                  &tensor2_, &output));
+  EXPECT_THAT(GetShape(output), ::testing::ElementsAre(1, 2));
+  TfLiteIntArrayFree(output);
+
+  SetShape(&tensor1_, {});
+  SetShape(&tensor2_, {2});
+  EXPECT_EQ(kTfLiteOk, CalculateShapeForBroadcast(&context_, &tensor1_,
+                                                  &tensor2_, &output));
+  EXPECT_THAT(GetShape(output), ::testing::ElementsAre(2));
+  TfLiteIntArrayFree(output);
+}
+
+TEST_F(KernelUtilTest, BroadcastShapeDifferentSizes) {
+  TfLiteIntArray* output = nullptr;
+  SetShape(&tensor1_, {1, 2});
+  SetShape(&tensor2_, {3, 1, 1});
+  EXPECT_EQ(kTfLiteOk, CalculateShapeForBroadcast(&context_, &tensor1_,
+                                                  &tensor2_, &output));
+  EXPECT_THAT(GetShape(output), ::testing::ElementsAre(3, 1, 2));
+  TfLiteIntArrayFree(output);
+
+  SetShape(&tensor1_, {1, 2, 3, 4});
+  SetShape(&tensor2_, {1, 3, 1});
+  EXPECT_EQ(kTfLiteOk, CalculateShapeForBroadcast(&context_, &tensor1_,
+                                                  &tensor2_, &output));
+  EXPECT_THAT(GetShape(output), ::testing::ElementsAre(1, 2, 3, 4));
+  TfLiteIntArrayFree(output);
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/mean.cc b/tensorflow/contrib/lite/kernels/mean.cc
index 540e5a364dd60a42c316199d0ebe878ae07e6756..ec1c40202761e3789462a4740e5547eba654b0f9 100644
--- a/tensorflow/contrib/lite/kernels/mean.cc
+++ b/tensorflow/contrib/lite/kernels/mean.cc
@@ -35,10 +35,12 @@ struct MeanContext {
   MeanContext(TfLiteContext* context, TfLiteNode* node) {
     params = reinterpret_cast<TfLiteMeanParams*>(node->builtin_data);
     input = GetInput(context, node, 0);
+    axis = GetInput(context, node, 1);
     output = GetOutput(context, node, 0);
   }
   TfLiteMeanParams* params;
   TfLiteTensor* input;
+  TfLiteTensor* axis;
   TfLiteTensor* output;
 };
 
@@ -54,45 +56,26 @@ void Free(TfLiteContext* context, void* buffer) {
   delete reinterpret_cast<int*>(buffer);
 }
 
-TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
-  TF_LITE_ENSURE(context, NumInputs(node) == 1 || NumInputs(node) == 2);
-  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
-
-  MeanContext op_context(context, node);
-  int input_num_dims = NumDimensions(op_context.input);
-  int axis_num_dims = op_context.params->num_axis_dimensions;
-
-  // Creates a temp index to iterate through input data.
-  int* scratch_tensor_index = reinterpret_cast<int*>(node->user_data);
-  TfLiteIntArrayFree(node->temporaries);
-  node->temporaries = TfLiteIntArrayCreate(2);
-  node->temporaries->data[0] = *scratch_tensor_index;
-  TfLiteTensor* scratch_tensor = &context->tensors[node->temporaries->data[0]];
-  scratch_tensor->type = kTfLiteInt32;
-  scratch_tensor->allocation_type = kTfLiteArenaRw;
-  TfLiteIntArray* index_size = TfLiteIntArrayCreate(1);
-  index_size->data[0] = input_num_dims;
-  TF_LITE_ENSURE_OK(context,
-                    context->ResizeTensor(context, scratch_tensor, index_size));
-
-  // Creates a temp tensor to store resolved axis given input data.
-  node->temporaries->data[1] = *scratch_tensor_index + 1;
-  TfLiteTensor* axis_tensor = &context->tensors[node->temporaries->data[1]];
-  axis_tensor->type = kTfLiteInt32;
-  axis_tensor->allocation_type = kTfLiteArenaRw;
+// Resizes the temp tensor that stores resolved axis.
+TfLiteStatus ResizeTempAxis(TfLiteContext* context, MeanContext* op_context,
+                            TfLiteTensor* resolved_axis) {
   TfLiteIntArray* axis_size = TfLiteIntArrayCreate(1);
-  axis_size->data[0] = op_context.params->num_axis_dimensions;
-  TF_LITE_ENSURE_OK(context,
-                    context->ResizeTensor(context, axis_tensor, axis_size));
+  axis_size->data[0] = static_cast<int>(NumElements(op_context->axis));
+  return context->ResizeTensor(context, resolved_axis, axis_size);
+}
 
-  // Determines size of output tensor.
-  const TfLiteIntArray* input_dims = op_context.input->dims;
-  const int* axis = op_context.params->axis;
-  if (op_context.params->keep_dims) {
+// Resizes output array based on the input size and resolved axis.
+TfLiteStatus ResizeOutputTensor(TfLiteContext* context,
+                                MeanContext* op_context) {
+  size_t num_axis = NumElements(op_context->axis);
+  const TfLiteIntArray* input_dims = op_context->input->dims;
+  int input_num_dims = NumDimensions(op_context->input);
+  const int* axis = GetTensorData<int>(op_context->axis);
+  if (op_context->params->keep_dims) {
     TfLiteIntArray* output_dims = TfLiteIntArrayCreate(input_num_dims);
     for (int idx = 0; idx < input_num_dims; ++idx) {
       bool is_axis = false;
-      for (int axis_idx = 0; axis_idx < axis_num_dims; ++axis_idx) {
+      for (int axis_idx = 0; axis_idx < num_axis; ++axis_idx) {
         if (axis[axis_idx] == idx || axis[axis_idx] + input_num_dims == idx) {
           is_axis = true;
           break;
@@ -104,11 +87,11 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
         output_dims->data[idx] = input_dims->data[idx];
       }
     }
-    return context->ResizeTensor(context, op_context.output, output_dims);
+    return context->ResizeTensor(context, op_context->output, output_dims);
   } else {
     // Calculates size of reducing axis.
-    int num_reduce_axis = axis_num_dims;
-    for (int i = 0; i < axis_num_dims; ++i) {
+    int num_reduce_axis = num_axis;
+    for (int i = 0; i < num_axis; ++i) {
       int current = axis[i];
       if (current < 0) {
         current += input_num_dims;
@@ -131,7 +114,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
     int num_skip_axis = 0;
     for (int idx = 0; idx < input_num_dims; ++idx) {
       bool is_axis = false;
-      for (int axis_idx = 0; axis_idx < axis_num_dims; ++axis_idx) {
+      for (int axis_idx = 0; axis_idx < num_axis; ++axis_idx) {
         if (axis[axis_idx] == idx || axis[axis_idx] + input_num_dims == idx) {
           ++num_skip_axis;
           is_axis = true;
@@ -142,24 +125,76 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
         output_dims->data[idx - num_skip_axis] = input_dims->data[idx];
       }
     }
-    return context->ResizeTensor(context, op_context.output, output_dims);
+    return context->ResizeTensor(context, op_context->output, output_dims);
+  }
+}
+
+// Initializes temp tensors to store index and resolved axis.
+TfLiteStatus InitializeTemporaries(TfLiteContext* context, TfLiteNode* node,
+                                   MeanContext* op_context) {
+  // Creates a temp index to iterate through input data.
+  int* scratch_tensor_index = reinterpret_cast<int*>(node->user_data);
+  TfLiteIntArrayFree(node->temporaries);
+  node->temporaries = TfLiteIntArrayCreate(2);
+  node->temporaries->data[0] = *scratch_tensor_index;
+  TfLiteTensor* scratch_tensor = &context->tensors[node->temporaries->data[0]];
+  scratch_tensor->type = kTfLiteInt32;
+  scratch_tensor->allocation_type = kTfLiteArenaRw;
+  TfLiteIntArray* index_size = TfLiteIntArrayCreate(1);
+  index_size->data[0] = NumDimensions(op_context->input);
+  TF_LITE_ENSURE_OK(context,
+                    context->ResizeTensor(context, scratch_tensor, index_size));
+
+  // Creates a temp tensor to store resolved axis given input data.
+  node->temporaries->data[1] = *scratch_tensor_index + 1;
+  TfLiteTensor* resolved_axis = &context->tensors[node->temporaries->data[1]];
+  resolved_axis->type = kTfLiteInt32;
+  return kTfLiteOk;
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  MeanContext op_context(context, node);
+  TF_LITE_ENSURE_OK(context, InitializeTemporaries(context, node, &op_context));
+
+  TfLiteTensor* resolved_axis = &context->tensors[node->temporaries->data[1]];
+  // Leaves work to Eval if axis is not constant; else resizes output.
+  if (!IsConstantTensor(op_context.axis)) {
+    SetTensorToDynamic(op_context.output);
+    SetTensorToDynamic(resolved_axis);
+    return kTfLiteOk;
   }
+  resolved_axis->allocation_type = kTfLiteArenaRw;
+  TF_LITE_ENSURE_OK(context,
+                    ResizeTempAxis(context, &op_context, resolved_axis));
+  return ResizeOutputTensor(context, &op_context);
 }
 
 template <KernelType kernel_type>
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   MeanContext op_context(context, node);
+  int num_axis = static_cast<int>(NumElements(op_context.axis));
   TfLiteTensor* temp_index = &context->tensors[node->temporaries->data[0]];
   TfLiteTensor* resolved_axis = &context->tensors[node->temporaries->data[1]];
+  // Resize the output tensor if the output tensor is dynamic.
+  if (IsDynamicTensor(op_context.output)) {
+    TF_LITE_ENSURE_OK(context,
+                      ResizeTempAxis(context, &op_context, resolved_axis));
+    TF_LITE_ENSURE_OK(context, ResizeOutputTensor(context, &op_context));
+    TfLiteTensorRealloc(resolved_axis->bytes, resolved_axis);
+    TfLiteTensorRealloc(op_context.output->bytes, op_context.output);
+  }
 
-#define TF_LITE_MEAN(kernel_type, data_type)                           \
-  kernel_type::Mean<>(                                                 \
-      GetTensorData<data_type>(op_context.input),                      \
-      op_context.input->dims->data, op_context.input->dims->size,      \
-      GetTensorData<data_type>(op_context.output),                     \
-      op_context.output->dims->data, op_context.output->dims->size,    \
-      op_context.params->axis, op_context.params->num_axis_dimensions, \
-      op_context.params->keep_dims, GetTensorData<int>(temp_index),    \
+#define TF_LITE_MEAN(kernel_type, data_type)                        \
+  kernel_type::Mean<>(                                              \
+      GetTensorData<data_type>(op_context.input),                   \
+      op_context.input->dims->data, op_context.input->dims->size,   \
+      GetTensorData<data_type>(op_context.output),                  \
+      op_context.output->dims->data, op_context.output->dims->size, \
+      GetTensorData<int>(op_context.axis), num_axis,                \
+      op_context.params->keep_dims, GetTensorData<int>(temp_index), \
       GetTensorData<int>(resolved_axis))
 
   if (kernel_type == kReference) {
diff --git a/tensorflow/contrib/lite/kernels/mean_test.cc b/tensorflow/contrib/lite/kernels/mean_test.cc
index 4305c0632f5a52b858a056109187ad4a0cc2e46e..c4c53c2ded351849e7c458fc754c36395a25ebd0 100644
--- a/tensorflow/contrib/lite/kernels/mean_test.cc
+++ b/tensorflow/contrib/lite/kernels/mean_test.cc
@@ -25,58 +25,108 @@ using ::testing::ElementsAreArray;
 
 class BaseMeanOpModel : public SingleOpModel {
  public:
-  BaseMeanOpModel(const TensorData& input, const TensorData& output,
-                  std::initializer_list<int> axis, bool keep_dims) {
-    input_ = AddInput(input);
-    output_ = AddOutput(output);
-    SetBuiltinOp(
-        BuiltinOperator_MEAN, BuiltinOptions_MeanOptions,
-        CreateMeanOptions(builder_, builder_.CreateVector<int>(axis), keep_dims)
-            .Union());
-    BuildInterpreter({GetShape(input_)});
+  void SetAxis(std::initializer_list<int> data) { PopulateTensor(axis_, data); }
+
+  template <class T>
+  void SetInput(std::initializer_list<T> data) {
+    PopulateTensor(input_, data);
   }
 
-  int input() { return input_; }
+  template <class T>
+  std::vector<T> GetOutput() {
+    return ExtractVector<T>(output_);
+  }
+
+  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
 
  protected:
   int input_;
+  int axis_;
   int output_;
 };
 
-class FloatMeanOpModel : public BaseMeanOpModel {
+// Model for the tests case where axis is a const tensor.
+class MeanOpConstModel : public BaseMeanOpModel {
  public:
-  using BaseMeanOpModel::BaseMeanOpModel;
-
-  void SetInput(std::initializer_list<float> data) {
-    PopulateTensor(input_, data);
+  MeanOpConstModel(const TensorData& input, const TensorData& output,
+                   std::initializer_list<int> axis_shape,
+                   std::initializer_list<int> axis, bool keep_dims) {
+    input_ = AddInput(input);
+    axis_ = AddConstInput(TensorType_INT32, axis, axis_shape);
+    output_ = AddOutput(output);
+    SetBuiltinOp(BuiltinOperator_MEAN, BuiltinOptions_MeanOptions,
+                 CreateMeanOptions(builder_, keep_dims).Union());
+    BuildInterpreter({GetShape(input_)});
   }
+};
 
-  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
-  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+// Model for the tests case where axis is a dynamic tensor.
+class MeanOpDynamicModel : public BaseMeanOpModel {
+ public:
+  MeanOpDynamicModel(const TensorData& input, const TensorData& output,
+                     const TensorData& axis, bool keep_dims) {
+    input_ = AddInput(input);
+    axis_ = AddInput(axis);
+    output_ = AddOutput(output);
+    SetBuiltinOp(BuiltinOperator_MEAN, BuiltinOptions_MeanOptions,
+                 CreateMeanOptions(builder_, keep_dims).Union());
+    BuildInterpreter({GetShape(input_)});
+  }
 };
 
-TEST(FloatMeanOpTest, NotKeepDims) {
+TEST(ConstMeanOpTest, NotKeepDims) {
+  std::initializer_list<float> data = {
+      1.0,  2.0,  3.0,  4.0,  5.0,  6.0,  7.0,  8.0,  9.0,  10.0, 11.0, 12.0,
+      13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0};
+  MeanOpConstModel m({TensorType_FLOAT32, {4, 3, 2}}, {TensorType_FLOAT32, {2}},
+                     {4}, {1, 0, -3, -3}, false);
+  m.SetInput(data);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2}));
+  EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray(ArrayFloatNear({12, 13})));
+}
+
+TEST(ConstMeanOpTest, KeepDims) {
+  std::initializer_list<float> data = {
+      1.0,  2.0,  3.0,  4.0,  5.0,  6.0,  7.0,  8.0,  9.0,  10.0, 11.0, 12.0,
+      13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0};
+  MeanOpConstModel m({TensorType_FLOAT32, {4, 3, 2}}, {TensorType_FLOAT32, {3}},
+                     {2}, {0, 2}, true);
+  m.SetInput(data);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 3, 1}));
+  EXPECT_THAT(m.GetOutput<float>(),
+              ElementsAreArray(ArrayFloatNear({10.5, 12.5, 14.5})));
+}
+
+TEST(DynamicMeanOpTest, NotKeepDims) {
   std::initializer_list<float> data = {
       1.0,  2.0,  3.0,  4.0,  5.0,  6.0,  7.0,  8.0,  9.0,  10.0, 11.0, 12.0,
       13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0};
-  FloatMeanOpModel m({TensorType_FLOAT32, {4, 3, 2}}, {TensorType_FLOAT32, {2}},
-                     {1, 0, -3, -3}, false);
+  MeanOpDynamicModel m({TensorType_FLOAT32, {4, 3, 2}},
+                       {TensorType_FLOAT32, {2}}, {TensorType_INT32, {4}},
+                       false);
+  std::initializer_list<int> axis = {1, 0, -3, -3};
+  m.SetAxis(axis);
   m.SetInput(data);
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2}));
-  EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear({12, 13})));
+  EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray(ArrayFloatNear({12, 13})));
 }
 
-TEST(FloatMeanOpTest, KeepDims) {
+TEST(DynamicMeanOpTest, KeepDims) {
   std::initializer_list<float> data = {
       1.0,  2.0,  3.0,  4.0,  5.0,  6.0,  7.0,  8.0,  9.0,  10.0, 11.0, 12.0,
       13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0};
-  FloatMeanOpModel m({TensorType_FLOAT32, {4, 3, 2}}, {TensorType_FLOAT32, {3}},
-                     {0, 2}, true);
+  MeanOpDynamicModel m({TensorType_FLOAT32, {4, 3, 2}},
+                       {TensorType_FLOAT32, {3}}, {TensorType_INT32, {2}},
+                       true);
+  std::initializer_list<int> axis = {0, 2};
+  m.SetAxis(axis);
   m.SetInput(data);
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 3, 1}));
-  EXPECT_THAT(m.GetOutput(),
+  EXPECT_THAT(m.GetOutput<float>(),
               ElementsAreArray(ArrayFloatNear({10.5, 12.5, 14.5})));
 }
 
diff --git a/tensorflow/contrib/lite/kernels/mul.cc b/tensorflow/contrib/lite/kernels/mul.cc
index 81c73f2523186c2d4072d56bdc8980fcdbb588a3..54575019de4c678ce25561cf2ac8dc80c9973363 100644
--- a/tensorflow/contrib/lite/kernels/mul.cc
+++ b/tensorflow/contrib/lite/kernels/mul.cc
@@ -37,7 +37,23 @@ constexpr int kInputTensor1 = 0;
 constexpr int kInputTensor2 = 1;
 constexpr int kOutputTensor = 0;
 
+struct OpData {
+  bool requires_broadcast;
+};
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  auto* data = new OpData;
+  data->requires_broadcast = false;
+  return data;
+}
+
+void Free(TfLiteContext* context, void* buffer) {
+  delete reinterpret_cast<OpData*>(buffer);
+}
+
 TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  OpData* data = reinterpret_cast<OpData*>(node->user_data);
+
   TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
 
@@ -45,43 +61,56 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
 
-  TF_LITE_ENSURE_EQ(context, NumDimensions(input1), NumDimensions(input2));
-  for (int i = 0; i < NumDimensions(input1); ++i) {
-    TF_LITE_ENSURE_EQ(context, SizeOfDimension(input1, i),
-                      SizeOfDimension(input2, i));
-  }
+  TF_LITE_ENSURE_EQ(context, input1->type, input2->type);
+  output->type = input2->type;
+
+  data->requires_broadcast = !HaveSameShapes(input1, input2);
 
-  TF_LITE_ENSURE_EQ(context, input1->type, output->type);
-  TF_LITE_ENSURE_EQ(context, input2->type, output->type);
+  TfLiteIntArray* output_size = nullptr;
+  if (data->requires_broadcast) {
+    TF_LITE_ENSURE_OK(context, CalculateShapeForBroadcast(
+                                   context, input1, input2, &output_size));
+  } else {
+    output_size = TfLiteIntArrayCopy(input1->dims);
+  }
 
-  TfLiteIntArray* output_size = TfLiteIntArrayCopy(input1->dims);
   return context->ResizeTensor(context, output, output_size);
 }
 
 template <KernelType kernel_type>
 void EvalFloat(TfLiteContext* context, TfLiteNode* node,
-               TfLiteMulParams* params, TfLiteTensor* input1,
-               TfLiteTensor* input2, TfLiteTensor* output) {
+               TfLiteMulParams* params, const OpData* data,
+               TfLiteTensor* input1, TfLiteTensor* input2,
+               TfLiteTensor* output) {
   float output_activation_min, output_activation_max;
   CalculateActivationRangeFloat(params->activation, &output_activation_min,
                                 &output_activation_max);
-#define TF_LITE_MUL(type)                                        \
-  type::Mul(GetTensorData<float>(input1), GetTensorDims(input1), \
-            GetTensorData<float>(input2), GetTensorDims(input2), \
-            output_activation_min, output_activation_max,        \
-            GetTensorData<float>(output), GetTensorDims(output))
+#define TF_LITE_MUL(type, opname)                                   \
+  type::opname(GetTensorData<float>(input1), GetTensorDims(input1), \
+               GetTensorData<float>(input2), GetTensorDims(input2), \
+               output_activation_min, output_activation_max,        \
+               GetTensorData<float>(output), GetTensorDims(output))
   if (kernel_type == kReference) {
-    TF_LITE_MUL(reference_ops);
+    if (data->requires_broadcast) {
+      TF_LITE_MUL(reference_ops, BroadcastMul);
+    } else {
+      TF_LITE_MUL(reference_ops, Mul);
+    }
   } else {
-    TF_LITE_MUL(optimized_ops);
+    if (data->requires_broadcast) {
+      TF_LITE_MUL(optimized_ops, BroadcastMul);
+    } else {
+      TF_LITE_MUL(optimized_ops, Mul);
+    }
   }
 #undef TF_LITE_MUL
 }
 
 template <KernelType kernel_type>
 void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
-                   TfLiteMulParams* params, TfLiteTensor* input1,
-                   TfLiteTensor* input2, TfLiteTensor* output) {
+                   TfLiteMulParams* params, const OpData* data,
+                   TfLiteTensor* input1, TfLiteTensor* input2,
+                   TfLiteTensor* output) {
   auto input1_offset = -input1->params.zero_point;
   auto input2_offset = -input2->params.zero_point;
   auto output_offset = output->params.zero_point;
@@ -98,17 +127,19 @@ void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
   CalculateActivationRangeUint8(params->activation, output,
                                 &output_activation_min, &output_activation_max);
 
-#define TF_LITE_MUL(type)                                                    \
-  type::BroadcastMul(GetTensorData<uint8_t>(input1), GetTensorDims(input1),  \
-                     input1_offset, GetTensorData<uint8_t>(input2),          \
-                     GetTensorDims(input2), input2_offset, output_offset,    \
-                     output_multiplier, output_shift, output_activation_min, \
-                     output_activation_max, GetTensorData<uint8_t>(output),  \
-                     GetTensorDims(output));
+#define TF_LITE_MUL(type, opname)                                      \
+  type::opname(GetTensorData<uint8_t>(input1), GetTensorDims(input1),  \
+               input1_offset, GetTensorData<uint8_t>(input2),          \
+               GetTensorDims(input2), input2_offset, output_offset,    \
+               output_multiplier, output_shift, output_activation_min, \
+               output_activation_max, GetTensorData<uint8_t>(output),  \
+               GetTensorDims(output));
+  // The quantized version of Mul doesn't support activations, so we
+  // always use BroadcastMul.
   if (kernel_type == kReference) {
-    TF_LITE_MUL(reference_ops);
+    TF_LITE_MUL(reference_ops, BroadcastMul);
   } else {
-    TF_LITE_MUL(optimized_ops);
+    TF_LITE_MUL(optimized_ops, BroadcastMul);
   }
 #undef TF_LITE_MUL
 }
@@ -116,15 +147,17 @@ void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
 template <KernelType kernel_type>
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   auto* params = reinterpret_cast<TfLiteMulParams*>(node->builtin_data);
+  OpData* data = reinterpret_cast<OpData*>(node->user_data);
 
   TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
   TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
 
   if (output->type == kTfLiteFloat32) {
-    EvalFloat<kernel_type>(context, node, params, input1, input2, output);
+    EvalFloat<kernel_type>(context, node, params, data, input1, input2, output);
   } else if (output->type == kTfLiteUInt8) {
-    EvalQuantized<kernel_type>(context, node, params, input1, input2, output);
+    EvalQuantized<kernel_type>(context, node, params, data, input1, input2,
+                               output);
   } else {
     context->ReportError(context,
                          "Mul only supports FLOAT32 and quantized UINT8 now.");
@@ -137,19 +170,19 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
 }  // namespace mul
 
 TfLiteRegistration* Register_MUL_REF() {
-  static TfLiteRegistration r = {nullptr, nullptr, mul::Prepare,
+  static TfLiteRegistration r = {mul::Init, mul::Free, mul::Prepare,
                                  mul::Eval<mul::kReference>};
   return &r;
 }
 
 TfLiteRegistration* Register_MUL_GENERIC_OPT() {
-  static TfLiteRegistration r = {nullptr, nullptr, mul::Prepare,
+  static TfLiteRegistration r = {mul::Init, mul::Free, mul::Prepare,
                                  mul::Eval<mul::kGenericOptimized>};
   return &r;
 }
 
 TfLiteRegistration* Register_MUL_NEON_OPT() {
-  static TfLiteRegistration r = {nullptr, nullptr, mul::Prepare,
+  static TfLiteRegistration r = {mul::Init, mul::Free, mul::Prepare,
                                  mul::Eval<mul::kNeonOptimized>};
   return &r;
 }
diff --git a/tensorflow/contrib/lite/kernels/mul_test.cc b/tensorflow/contrib/lite/kernels/mul_test.cc
index 8838b300c0af167bf2ffcf944fc7c31d6173f462..f1a30f82634631ba8320421d5b36ffe446f443fa 100644
--- a/tensorflow/contrib/lite/kernels/mul_test.cc
+++ b/tensorflow/contrib/lite/kernels/mul_test.cc
@@ -25,10 +25,11 @@ using ::testing::ElementsAreArray;
 
 class BaseMulOpModel : public SingleOpModel {
  public:
-  BaseMulOpModel(TensorData input, TensorData output,
+  BaseMulOpModel(const TensorData& input1, const TensorData& input2,
+                 const TensorData& output,
                  ActivationFunctionType activation_type) {
-    input1_ = AddInput(input);
-    input2_ = AddInput(input);
+    input1_ = AddInput(input1);
+    input2_ = AddInput(input2);
     output_ = AddOutput(output);
     SetBuiltinOp(BuiltinOperator_MUL, BuiltinOptions_MulOptions,
                  CreateMulOptions(builder_, activation_type).Union());
@@ -70,6 +71,7 @@ class QuantizedMulOpModel : public BaseMulOpModel {
 
 TEST(FloatMulOpTest, NoActivation) {
   FloatMulOpModel m({TensorType_FLOAT32, {1, 2, 2, 1}},
+                    {TensorType_FLOAT32, {1, 2, 2, 1}},
                     {TensorType_FLOAT32, {}}, ActivationFunctionType_NONE);
   m.PopulateTensor<float>(m.input1(), {-2.0, 0.2, 0.7, 0.8});
   m.PopulateTensor<float>(m.input2(), {0.1, 0.2, 0.3, 0.5});
@@ -79,9 +81,9 @@ TEST(FloatMulOpTest, NoActivation) {
 }
 
 TEST(FloatMulOpTest, ActivationRELU_N1_TO_1) {
-  FloatMulOpModel m({TensorType_FLOAT32, {1, 2, 2, 1}},
-                    {TensorType_FLOAT32, {}},
-                    ActivationFunctionType_RELU_N1_TO_1);
+  FloatMulOpModel m(
+      {TensorType_FLOAT32, {1, 2, 2, 1}}, {TensorType_FLOAT32, {1, 2, 2, 1}},
+      {TensorType_FLOAT32, {}}, ActivationFunctionType_RELU_N1_TO_1);
   m.PopulateTensor<float>(m.input1(), {-2.0, 0.2, 0.7, 0.8});
   m.PopulateTensor<float>(m.input2(), {0.1, 0.2, 0.3, 5});
   m.Invoke();
@@ -94,6 +96,7 @@ TEST(FloatMulOpTest, VariousInputShapes) {
       {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
   for (int i = 0; i < test_shapes.size(); ++i) {
     FloatMulOpModel m({TensorType_FLOAT32, test_shapes[i]},
+                      {TensorType_FLOAT32, test_shapes[i]},
                       {TensorType_FLOAT32, {}}, ActivationFunctionType_NONE);
     m.PopulateTensor<float>(m.input1(), {-2.0, 0.2, 0.7, 0.8, 1.1, 2.0});
     m.PopulateTensor<float>(m.input2(), {0.1, 0.2, 0.3, 0.5, 1.1, 0.1});
@@ -105,8 +108,26 @@ TEST(FloatMulOpTest, VariousInputShapes) {
   }
 }
 
+TEST(FloatMulOpTest, WithBroadcast) {
+  std::vector<std::initializer_list<int>> test_shapes = {
+      {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
+  for (int i = 0; i < test_shapes.size(); ++i) {
+    FloatMulOpModel m({TensorType_FLOAT32, test_shapes[i]},
+                      {TensorType_FLOAT32, {}},  // always a scalar
+                      {TensorType_FLOAT32, {}}, ActivationFunctionType_NONE);
+    m.PopulateTensor<float>(m.input1(), {-2.0, 0.2, 0.7, 0.8, 1.1, 2.0});
+    m.PopulateTensor<float>(m.input2(), {0.1});
+    m.Invoke();
+    EXPECT_THAT(
+        m.GetOutput(),
+        ElementsAreArray(ArrayFloatNear({-0.2, 0.02, 0.07, 0.08, 0.11, 0.2})))
+        << "With shape number " << i;
+  }
+}
+
 TEST(QuantizedMulOpTest, NoActivation) {
   QuantizedMulOpModel m({TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
+                        {TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
                         {TensorType_UINT8, {}, -1.0, 1.0},
                         ActivationFunctionType_NONE);
   m.QuantizeAndPopulate<uint8_t>(m.input1(), {-0.8, 0.2, 0.9, 0.7});
@@ -117,6 +138,32 @@ TEST(QuantizedMulOpTest, NoActivation) {
                                               kQuantizedTolerance)));
 }
 
+// for quantized Mul, the error shouldn't exceed 2*step
+float GetTolerance(int min, int max) {
+  float kQuantizedStep = (max - min) / 255.0;
+  float kQuantizedTolerance = 2.0 * kQuantizedStep;
+  return kQuantizedTolerance;
+}
+
+TEST(QuantizedMulOpTest, WithBroadcast) {
+  float kQuantizedTolerance = GetTolerance(-3.0, 3.0);
+  std::vector<std::initializer_list<int>> test_shapes = {
+      {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
+  for (int i = 0; i < test_shapes.size(); ++i) {
+    QuantizedMulOpModel m({TensorType_UINT8, test_shapes[i], -3.0, 3.0},
+                          {TensorType_UINT8, {}, -3.0, 3.0},  // always a scalar
+                          {TensorType_UINT8, {}, -3.0, 3.0},
+                          ActivationFunctionType_NONE);
+    m.QuantizeAndPopulate<uint8_t>(m.input1(), {-2.0, 0.2, 0.7, 0.8, 1.1, 2.0});
+    m.QuantizeAndPopulate<uint8_t>(m.input2(), {0.1});
+    m.Invoke();
+    EXPECT_THAT(m.GetDequantizedOutput(),
+                ElementsAreArray(ArrayFloatNear(
+                    {-0.2, 0.02, 0.07, 0.08, 0.11, 0.2}, kQuantizedTolerance)))
+        << "With shape number " << i;
+  }
+}
+
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/contrib/lite/kernels/optional_tensor_test.cc b/tensorflow/contrib/lite/kernels/optional_tensor_test.cc
index 17166715ca30ff3d8ba3d384110e403f8910e39d..cee3ec6197c698a11004d42dccdfe2bcca088015 100644
--- a/tensorflow/contrib/lite/kernels/optional_tensor_test.cc
+++ b/tensorflow/contrib/lite/kernels/optional_tensor_test.cc
@@ -243,7 +243,6 @@ class LSTMOpModel : public SingleOpModel {
   int n_output_;
 };
 
-
 TEST(LSTMOpTest, BlackBoxTestWithCifgWithPeepholeNoProjectionNoClipping) {
   const int n_batch = 1;
   const int n_input = 2;
@@ -282,7 +281,6 @@ TEST(LSTMOpTest, BlackBoxTestWithCifgWithPeepholeNoProjectionNoClipping) {
                        {0},     // projection_bias tensor
                    });
 
-
   lstm.SetInputToCellWeights({-0.49770179, -0.27711356, -0.09624726, 0.05100781,
                               0.04717243, 0.48944736, -0.38535351,
                               -0.17212132});
diff --git a/tensorflow/contrib/lite/kernels/pad.cc b/tensorflow/contrib/lite/kernels/pad.cc
index 569bf0fe8fc9964a1299911d248d53862c99cbdf..48114e5a4069abf864a996141c7b0906301d9809 100644
--- a/tensorflow/contrib/lite/kernels/pad.cc
+++ b/tensorflow/contrib/lite/kernels/pad.cc
@@ -51,17 +51,14 @@ struct PadContext {
 // paddings data is present.
 TfLiteStatus ResizeOutputTensor(TfLiteContext* context,
                                 PadContext* op_context) {
-  // TODO(nupurgarg): Our current implementations rely on the inputs being 4D.
-  TF_LITE_ENSURE_EQ(context, op_context->dims, 4);
-
   // Ensures the paddings array is dims x 2.
   TF_LITE_ENSURE_EQ(context, SizeOfDimension(op_context->paddings, 0),
                     op_context->dims);
   TF_LITE_ENSURE_EQ(context, SizeOfDimension(op_context->paddings, 1), 2);
 
   // Determines the size of the output tensor.
-  const TfLiteIntArray* input_size = op_context->input->dims;
-  TfLiteIntArray* output_size = TfLiteIntArrayCreate(op_context->dims);
+  TfLiteIntArray* input_size = op_context->input->dims;
+  TfLiteIntArray* output_size = TfLiteIntArrayCopy(input_size);
   const int32* paddings_data = GetTensorData<int32>(op_context->paddings);
 
   for (int idx = 0; idx < op_context->dims; ++idx) {
@@ -85,11 +82,13 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   PadContext op_context(context, node);
   TF_LITE_ENSURE_EQ(context, op_context.input->type, op_context.output->type);
 
-  // TODO(nupurgarg): Create wrapper functions for dynamic tensor logic.
+  // TODO(nupurgarg): Our current implementations rely on the inputs being 4D.
+  TF_LITE_ENSURE_EQ(context, op_context.dims, 4);
+
   // Exit early if paddings is a non-const tensor. Set output tensor to
   // dynamic so output size can be determined in Eval.
-  if (op_context.paddings->allocation_type != kTfLiteMmapRo) {
-    op_context.output->allocation_type = kTfLiteDynamic;
+  if (!IsConstantTensor(op_context.paddings)) {
+    SetTensorToDynamic(op_context.output);
     return kTfLiteOk;
   }
   return ResizeOutputTensor(context, &op_context);
@@ -100,7 +99,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   PadContext op_context(context, node);
 
   // Resize the output tensor if the output tensor is dynamic.
-  if (op_context.output->allocation_type == kTfLiteDynamic) {
+  if (IsDynamicTensor(op_context.output)) {
     TF_LITE_ENSURE_OK(context, ResizeOutputTensor(context, &op_context));
     TfLiteTensorRealloc(op_context.output->bytes, op_context.output);
   }
@@ -178,9 +177,7 @@ TfLiteRegistration* Register_PAD_GENERIC_OPT() {
   return &r;
 }
 
-TfLiteRegistration* Register_PAD() {
-  return Register_PAD_GENERIC_OPT();
-}
+TfLiteRegistration* Register_PAD() { return Register_PAD_GENERIC_OPT(); }
 
 }  // namespace builtin
 }  // namespace ops
diff --git a/tensorflow/contrib/lite/kernels/register.cc b/tensorflow/contrib/lite/kernels/register.cc
index f605deaa5b4a3a8572c4be16cb1d301dbc49e5ba..1fb779fd5174a255b7d34322c57dc084f68d8c3f 100644
--- a/tensorflow/contrib/lite/kernels/register.cc
+++ b/tensorflow/contrib/lite/kernels/register.cc
@@ -31,6 +31,7 @@ TfLiteRegistration* Register_CONV_2D();
 TfLiteRegistration* Register_DEPTHWISE_CONV_2D();
 TfLiteRegistration* Register_SVDF();
 TfLiteRegistration* Register_RNN();
+TfLiteRegistration* Register_BIDIRECTIONAL_SEQUENCE_RNN();
 TfLiteRegistration* Register_UNIDIRECTIONAL_SEQUENCE_RNN();
 TfLiteRegistration* Register_EMBEDDING_LOOKUP();
 TfLiteRegistration* Register_EMBEDDING_LOOKUP_SPARSE();
@@ -73,6 +74,8 @@ BuiltinOpResolver::BuiltinOpResolver() {
   AddBuiltin(BuiltinOperator_DEPTHWISE_CONV_2D, Register_DEPTHWISE_CONV_2D());
   AddBuiltin(BuiltinOperator_SVDF, Register_SVDF());
   AddBuiltin(BuiltinOperator_RNN, Register_RNN());
+  AddBuiltin(BuiltinOperator_BIDIRECTIONAL_SEQUENCE_RNN,
+             Register_BIDIRECTIONAL_SEQUENCE_RNN());
   AddBuiltin(BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_RNN,
              Register_UNIDIRECTIONAL_SEQUENCE_RNN());
   AddBuiltin(BuiltinOperator_EMBEDDING_LOOKUP, Register_EMBEDDING_LOOKUP());
diff --git a/tensorflow/contrib/lite/kernels/space_to_batch_nd.cc b/tensorflow/contrib/lite/kernels/space_to_batch_nd.cc
index 2e22d0db56a233bf554c57cf86275832ce941a18..e2e1873f770fad889137b43d87585602162819f7 100644
--- a/tensorflow/contrib/lite/kernels/space_to_batch_nd.cc
+++ b/tensorflow/contrib/lite/kernels/space_to_batch_nd.cc
@@ -33,17 +33,16 @@ enum KernelType {
   kGenericOptimized,
 };
 
-// Inputs specified in the 2nd tensor (block_shape) and 3rd tensor (paddings)
-// are ignored. Only use the `block_shape` and `paddings` specified in params.
-// TODO(nupurgarg): Support inputs as tensors in SpaceToBatchND.
 struct SpaceToBatchNDContext {
   SpaceToBatchNDContext(TfLiteContext* context, TfLiteNode* node) {
-    params = reinterpret_cast<TfLiteSpaceToBatchNDParams*>(node->builtin_data);
     input = GetInput(context, node, 0);
+    block_shape = GetInput(context, node, 1);
+    paddings = GetInput(context, node, 2);
     output = GetOutput(context, node, 0);
   }
-  TfLiteSpaceToBatchNDParams* params;
   TfLiteTensor* input;
+  TfLiteTensor* block_shape;
+  TfLiteTensor* paddings;
   TfLiteTensor* output;
 };
 
@@ -51,32 +50,29 @@ struct SpaceToBatchNDContext {
 // The 4D array need to have exactly 2 spatial dimensions.
 // TODO(nupurgarg): Support arbitrary dimension in SpaceToBatchND.
 const int kInputDimensionNum = 4;
-const int kOutputDimensionNum = 4;
+const int kBlockSizeDimensionNum = 1;
 const int kSpatialDimensionNum = 2;
-const int kPaddingDimensionNum = 4;
 
-TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
-  TF_LITE_ENSURE(context, NumInputs(node) >= 1 && NumInputs(node) <= 3);
-  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+TfLiteStatus ResizeOutputTensor(TfLiteContext* context,
+                                SpaceToBatchNDContext* op_context) {
+  TfLiteIntArray* input_size = op_context->input->dims;
+  const int32* block_shape = GetTensorData<int32>(op_context->block_shape);
+  const int32* paddings_data = GetTensorData<int32>(op_context->paddings);
 
-  SpaceToBatchNDContext op_context(context, node);
-  TF_LITE_ENSURE_EQ(context, NumDimensions(op_context.input),
-                    kInputDimensionNum);
-  TF_LITE_ENSURE_EQ(context, op_context.params->num_spatial_dimensions,
+  TF_LITE_ENSURE_EQ(context, NumDimensions(op_context->block_shape),
+                    kBlockSizeDimensionNum);
+  TF_LITE_ENSURE_EQ(context, op_context->block_shape->dims->data[0],
+                    kSpatialDimensionNum);
+  TF_LITE_ENSURE_EQ(context, NumDimensions(op_context->paddings),
                     kSpatialDimensionNum);
-  TF_LITE_ENSURE_EQ(context, op_context.input->type, op_context.output->type);
-
-  const TfLiteIntArray* input_size = op_context.input->dims;
-  const int* block_shape = op_context.params->block_shape;
 
-  TfLiteIntArray* output_size = TfLiteIntArrayCreate(kOutputDimensionNum);
+  TfLiteIntArray* output_size = TfLiteIntArrayCopy(input_size);
 
   // Ensures the input height and width (with padding) is a multiple of block
   // shape height and width.
   for (int dim = 0; dim < kSpatialDimensionNum; ++dim) {
-    int final_dim_size =
-        (input_size->data[dim + 1] + op_context.params->before_paddings[dim] +
-         op_context.params->after_paddings[dim]);
+    int final_dim_size = (input_size->data[dim + 1] + paddings_data[dim * 2] +
+                          paddings_data[dim * 2 + 1]);
     TF_LITE_ENSURE_EQ(context, final_dim_size % block_shape[dim], 0);
     output_size->data[dim + 1] = final_dim_size / block_shape[dim];
   }
@@ -88,33 +84,44 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   output_size->data[0] = output_batch_size;
   output_size->data[3] = output_channel_size;
 
-  return context->ResizeTensor(context, op_context.output, output_size);
+  return context->ResizeTensor(context, op_context->output, output_size);
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 3);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  SpaceToBatchNDContext op_context(context, node);
+  TF_LITE_ENSURE_EQ(context, NumDimensions(op_context.input),
+                    kInputDimensionNum);
+  TF_LITE_ENSURE_EQ(context, op_context.input->type, op_context.output->type);
+
+  if (!IsConstantTensor(op_context.block_shape) ||
+      !IsConstantTensor(op_context.paddings)) {
+    SetTensorToDynamic(op_context.output);
+    return kTfLiteOk;
+  }
+  return ResizeOutputTensor(context, &op_context);
 }
 
 template <KernelType kernel_type>
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   SpaceToBatchNDContext op_context(context, node);
 
-  int block_shape_dims_array[1] = {kSpatialDimensionNum};
-  Dims<4> block_shape_dims = GetTensorDims(block_shape_dims_array, 1);
-
-  // Initialize padding array in the format accepted by the kernel code.
-  // TODO(nupurgarg): Make kernel code accept padding array format that is
-  // consistent with Pad operation (i.e. before_paddings and after_paddings).
-  TfLiteIntArray* padding_data = TfLiteIntArrayCreate(kPaddingDimensionNum);
-  padding_data->data[0] = op_context.params->before_paddings[0];
-  padding_data->data[1] = op_context.params->after_paddings[0];
-  padding_data->data[2] = op_context.params->before_paddings[1];
-  padding_data->data[3] = op_context.params->after_paddings[1];
-  int padding_dims_array[1] = {kPaddingDimensionNum};
-  Dims<4> padding_dims = GetTensorDims(padding_dims_array, 1);
-
-#define TF_LITE_SPACE_TO_BATCH_ND(type, scalar)                          \
-  type::SpaceToBatchND(GetTensorData<scalar>(op_context.input),          \
-                       GetTensorDims(op_context.input),                  \
-                       op_context.params->block_shape, block_shape_dims, \
-                       padding_data->data, padding_dims,                 \
-                       GetTensorData<scalar>(op_context.output),         \
+  // Resize the output tensor if the output tensor is dynamic.
+  if (IsDynamicTensor(op_context.output)) {
+    TF_LITE_ENSURE_OK(context, ResizeOutputTensor(context, &op_context));
+    TfLiteTensorRealloc(op_context.output->bytes, op_context.output);
+  }
+
+#define TF_LITE_SPACE_TO_BATCH_ND(type, scalar)                        \
+  type::SpaceToBatchND(GetTensorData<scalar>(op_context.input),        \
+                       GetTensorDims(op_context.input),                \
+                       GetTensorData<int32_t>(op_context.block_shape), \
+                       GetTensorDims(op_context.block_shape),          \
+                       GetTensorData<int32_t>(op_context.paddings),    \
+                       GetTensorDims(op_context.paddings),             \
+                       GetTensorData<scalar>(op_context.output),       \
                        GetTensorDims(op_context.output))
   switch (op_context.input->type) {  // Already know in/out types are same.
     case kTfLiteFloat32:
@@ -151,8 +158,6 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       return kTfLiteError;
   }
 #undef TF_LITE_SPACE_TO_BATCH_ND
-
-  TfLiteIntArrayFree(padding_data);
   return kTfLiteOk;
 }
 
diff --git a/tensorflow/contrib/lite/kernels/space_to_batch_nd_test.cc b/tensorflow/contrib/lite/kernels/space_to_batch_nd_test.cc
index 45a6aef73d05b57a7f9a7fc6f58c3971c6e03118..92a4a037d5873e608ee7bdbdfc5eaa5e9b62bc8c 100644
--- a/tensorflow/contrib/lite/kernels/space_to_batch_nd_test.cc
+++ b/tensorflow/contrib/lite/kernels/space_to_batch_nd_test.cc
@@ -26,41 +26,81 @@ using ::testing::ElementsAreArray;
 
 class SpaceToBatchNDOpModel : public SingleOpModel {
  public:
-  SpaceToBatchNDOpModel(std::initializer_list<int> input_shape,
-                        std::initializer_list<int> block_shape,
-                        std::initializer_list<int> before_paddings,
-                        std::initializer_list<int> after_paddings) {
-    input_ = AddInput(TensorType_FLOAT32);
-    output_ = AddOutput(TensorType_FLOAT32);
-    SetBuiltinOp(BuiltinOperator_SPACE_TO_BATCH_ND,
-                 BuiltinOptions_SpaceToBatchNDOptions,
-                 CreateSpaceToBatchNDOptions(
-                     builder_, builder_.CreateVector<int>(block_shape),
-                     builder_.CreateVector<int>(before_paddings),
-                     builder_.CreateVector<int>(after_paddings))
-                     .Union());
-    BuildInterpreter({input_shape});
-  }
-
   void SetInput(std::initializer_list<float> data) {
     PopulateTensor<float>(input_, data);
   }
 
+  void SetBlockShape(std::initializer_list<int> data) {
+    PopulateTensor<int>(block_shape_, data);
+  }
+
+  void SetPaddings(std::initializer_list<int> data) {
+    PopulateTensor<int>(paddings_, data);
+  }
+
   std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
   std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
 
- private:
+ protected:
   int input_;
+  int block_shape_;
+  int paddings_;
   int output_;
 };
 
+// Tests case where block_shape and paddings are const tensors.
+//
+// Example usage is as follows:
+//    SpaceToBatchNDOpConstModel m(input_shape, block_shape, paddings);
+//    m.SetInput(input_data);
+//    m.Invoke();
+class SpaceToBatchNDOpConstModel : public SpaceToBatchNDOpModel {
+ public:
+  SpaceToBatchNDOpConstModel(std::initializer_list<int> input_shape,
+                             std::initializer_list<int> block_shape,
+                             std::initializer_list<int> paddings) {
+    input_ = AddInput(TensorType_FLOAT32);
+    block_shape_ = AddConstInput(TensorType_INT32, block_shape, {2});
+    paddings_ = AddConstInput(TensorType_INT32, paddings, {2, 2});
+    output_ = AddOutput(TensorType_FLOAT32);
+
+    SetBuiltinOp(BuiltinOperator_SPACE_TO_BATCH_ND,
+                 BuiltinOptions_SpaceToBatchNDOptions,
+                 CreateSpaceToBatchNDOptions(builder_).Union());
+    BuildInterpreter({input_shape});
+  }
+};
+
+// Tests case where block_shape and paddings are non-const tensors.
+//
+// Example usage is as follows:
+//    SpaceToBatchNDOpDynamicModel m(input_shape);
+//    m.SetInput(input_data);
+//    m.SetBlockShape(block_shape);
+//    m.SetPaddings(paddings);
+//    m.Invoke();
+class SpaceToBatchNDOpDynamicModel : public SpaceToBatchNDOpModel {
+ public:
+  SpaceToBatchNDOpDynamicModel(std::initializer_list<int> input_shape) {
+    input_ = AddInput(TensorType_FLOAT32);
+    block_shape_ = AddInput(TensorType_INT32);
+    paddings_ = AddInput(TensorType_INT32);
+    output_ = AddOutput(TensorType_FLOAT32);
+
+    SetBuiltinOp(BuiltinOperator_SPACE_TO_BATCH_ND,
+                 BuiltinOptions_SpaceToBatchNDOptions,
+                 CreateSpaceToBatchNDOptions(builder_).Union());
+    BuildInterpreter({input_shape, {2}, {2, 2}});
+  }
+};
+
 TEST(SpaceToBatchNDOpTest, InvalidShapeTest) {
-  EXPECT_DEATH(SpaceToBatchNDOpModel({1, 3, 3, 1}, {2, 2}, {0, 0}, {0, 0}),
+  EXPECT_DEATH(SpaceToBatchNDOpConstModel({1, 3, 3, 1}, {2, 2}, {0, 0, 0, 0}),
                "Cannot allocate tensors");
 }
 
-TEST(SpaceToBatchNDOpTest, SimpleTest) {
-  SpaceToBatchNDOpModel m({1, 4, 4, 1}, {2, 2}, {0, 0}, {0, 0});
+TEST(SpaceToBatchNDOpTest, SimpleConstTest) {
+  SpaceToBatchNDOpConstModel m({1, 4, 4, 1}, {2, 2}, {0, 0, 0, 0});
   m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({4, 2, 2, 1}));
@@ -68,17 +108,39 @@ TEST(SpaceToBatchNDOpTest, SimpleTest) {
                                                13, 15, 6, 8, 14, 16}));
 }
 
-TEST(SpaceToBatchNDOpTest, MultipleInputBatches) {
-  SpaceToBatchNDOpModel m({2, 2, 4, 1}, {2, 2}, {0, 0}, {0, 0});
+TEST(SpaceToBatchNDOpTest, SimpleDynamicTest) {
+  SpaceToBatchNDOpDynamicModel m({1, 4, 4, 1});
   m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
+  m.SetBlockShape({2, 2});
+  m.SetPaddings({0, 0, 0, 0});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({4, 2, 2, 1}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 3, 9, 11, 2, 4, 10, 12, 5, 7,
+                                               13, 15, 6, 8, 14, 16}));
+}
+
+TEST(SpaceToBatchNDOpTest, MultipleInputBatchesConstTest) {
+  SpaceToBatchNDOpConstModel m({2, 2, 4, 1}, {2, 2}, {0, 0, 0, 0});
+  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({8, 1, 2, 1}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 3, 9, 11, 2, 4, 10, 12, 5, 7,
+                                               13, 15, 6, 8, 14, 16}));
+}
+
+TEST(SpaceToBatchNDOpTest, MultipleInputBatchesDynamicTest) {
+  SpaceToBatchNDOpDynamicModel m({2, 2, 4, 1});
+  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
+  m.SetBlockShape({2, 2});
+  m.SetPaddings({0, 0, 0, 0});
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({8, 1, 2, 1}));
   EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 3, 9, 11, 2, 4, 10, 12, 5, 7,
                                                13, 15, 6, 8, 14, 16}));
 }
 
-TEST(SpaceToBatchNDOpTest, SimplePadding) {
-  SpaceToBatchNDOpModel m({1, 5, 2, 1}, {3, 2}, {1, 2}, {0, 0});
+TEST(SpaceToBatchNDOpTest, SimplePaddingConstTest) {
+  SpaceToBatchNDOpConstModel m({1, 5, 2, 1}, {3, 2}, {1, 0, 2, 0});
   m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10});
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({6, 2, 2, 1}));
@@ -88,9 +150,36 @@ TEST(SpaceToBatchNDOpTest, SimplePadding) {
                              }));
 }
 
-TEST(SpaceToBatchNDOpTest, ComplexPadding) {
-  SpaceToBatchNDOpModel m({1, 4, 2, 1}, {3, 2}, {1, 2}, {1, 4});
+TEST(SpaceToBatchNDOpTest, SimplePaddingDynamicTest) {
+  SpaceToBatchNDOpDynamicModel m({1, 5, 2, 1});
+  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10});
+  m.SetBlockShape({3, 2});
+  m.SetPaddings({1, 0, 2, 0});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({6, 2, 2, 1}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({
+                                 0, 0, 0, 5, 0, 0, 0, 6, 0, 1, 0, 7,
+                                 0, 2, 0, 8, 0, 3, 0, 9, 0, 4, 0, 10,
+                             }));
+}
+
+TEST(SpaceToBatchNDOpTest, ComplexPaddingConstTest) {
+  SpaceToBatchNDOpConstModel m({1, 4, 2, 1}, {3, 2}, {1, 1, 2, 4});
+  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({6, 2, 4, 1}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({
+                                 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0,
+                                 0, 1, 0, 0, 0, 7, 0, 0, 0, 2, 0, 0, 0, 8, 0, 0,
+                                 0, 3, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0,
+                             }));
+}
+
+TEST(SpaceToBatchNDOpTest, ComplexPaddingDynamicTest) {
+  SpaceToBatchNDOpDynamicModel m({1, 4, 2, 1});
   m.SetInput({1, 2, 3, 4, 5, 6, 7, 8});
+  m.SetBlockShape({3, 2});
+  m.SetPaddings({1, 1, 2, 4});
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({6, 2, 4, 1}));
   EXPECT_THAT(m.GetOutput(), ElementsAreArray({
diff --git a/tensorflow/contrib/lite/kernels/strided_slice.cc b/tensorflow/contrib/lite/kernels/strided_slice.cc
index 91ba4a9b7851c35a5138f4ccea307c810a4731a1..c4ffdf79d3aa7d47b9747bdf4208f8317d9fd22e 100644
--- a/tensorflow/contrib/lite/kernels/strided_slice.cc
+++ b/tensorflow/contrib/lite/kernels/strided_slice.cc
@@ -57,65 +57,6 @@ struct StridedSliceContext {
   int dims;
 };
 
-TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
-  TF_LITE_ENSURE_EQ(context, NumInputs(node), 4);
-  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
-
-  StridedSliceContext op_context(context, node);
-
-  // Ensure validity of input tensor and its dimension
-  TF_LITE_ENSURE_EQ(context, NumDimensions(op_context.begin), 1);
-  TF_LITE_ENSURE_EQ(context, NumDimensions(op_context.end), 1);
-  TF_LITE_ENSURE_EQ(context, NumDimensions(op_context.strides), 1);
-  TF_LITE_ENSURE_EQ(context, op_context.input->type, op_context.output->type);
-  // Only INT32 begin/end/strides are supported
-  // TODO(soroosh) add support for INT64
-  TF_LITE_ENSURE_EQ(context, op_context.begin->type, kTfLiteInt32);
-  TF_LITE_ENSURE_EQ(context, op_context.end->type, kTfLiteInt32);
-  TF_LITE_ENSURE_EQ(context, op_context.strides->type, kTfLiteInt32);
-  TF_LITE_ENSURE_MSG(context, op_context.dims <= 4,
-                     "StridedSlice op only supports 1D-4D input arrays.");
-
-  // TODO(soroosh): add the following missing functionalities
-  TF_LITE_ENSURE_MSG(context, op_context.params->ellipsis_mask == 0,
-                     "ellipsis_mask is not implemented yet.");
-  TF_LITE_ENSURE_MSG(context, op_context.params->new_axis_mask == 0,
-                     "new_axis_mask is not implemented yet.");
-  TF_LITE_ENSURE_MSG(context, op_context.params->shrink_axis_mask == 0,
-                     "shrink_axis_mask is not implemented yet.");
-
-  // TODO(soroosh): optimize for constant tensors to do allocation in Prepare
-  op_context.output->allocation_type = kTfLiteDynamic;
-  return kTfLiteOk;
-}  // namespace strided_slice
-
-// TODO(soroosh): consolidate with BytesRequired in interpreter.h
-TfLiteStatus BytesRequired(TfLiteContext* context, TfLiteType type,
-                           const int* dims, int dims_size, size_t* bytes) {
-  // TODO(aselle): Check for overflow here using overflow.h in TensorFlow
-  // MultiplyWithoutOverflow.
-  TF_LITE_ENSURE(context, bytes != nullptr);
-  size_t count = 1;
-  for (int k = 0; k < dims_size; k++) count *= dims[k];
-  switch (type) {
-    case kTfLiteFloat32:
-      *bytes = sizeof(float) * count;
-      break;
-    case kTfLiteInt32:
-      *bytes = sizeof(int32_t) * count;
-      break;
-    case kTfLiteUInt8:
-      *bytes = sizeof(uint8_t) * count;
-      break;
-    case kTfLiteInt64:
-      *bytes = sizeof(int64_t) * count;
-      break;
-    default:
-      return kTfLiteError;
-  }
-  return kTfLiteOk;
-}
-
 // Reverse order of bits in the mask to match the expected order in kernel
 inline int ReverseMaskBits(int mask, int num_dimensions) {
   int out = 0;
@@ -146,40 +87,111 @@ inline int32_t ClampedIndex(int32_t index, int dim, bool pos_stride) {
                           std::min(std::max(index, -dim), dim - 1), dim));
 }
 
+inline int32_t GetBeginValueAtIndex(StridedSliceContext* op_context, int idx) {
+  const int dim = op_context->input->dims->data[idx];
+  const bool pos_stride = GetTensorData<int32_t>(op_context->strides)[idx] > 0;
+  return op_context->params->begin_mask & (1 << idx)
+             ? pos_stride ? 0 : dim - 1
+             : ClampedIndex(GetTensorData<int32_t>(op_context->begin)[idx], dim,
+                            pos_stride);
+}
+
+inline int32_t GetEndValueAtIndex(StridedSliceContext* op_context, int idx) {
+  const int dim = op_context->input->dims->data[idx];
+  const bool pos_stride = GetTensorData<int32_t>(op_context->strides)[idx] > 0;
+  return op_context->params->end_mask & (1 << idx)
+             ? pos_stride ? dim : -1
+             : ClampedIndex(GetTensorData<int32_t>(op_context->end)[idx], dim,
+                            pos_stride);
+}
+
+// Processes the indexing tensors (begin, end and strides) to resize the
+// output tensor. This function is callable from both Prepare() and Eval() as
+// long as the caller ensures the indexing tensors are present.
+TfLiteStatus ResizeOutputTensor(TfLiteContext* context,
+                                StridedSliceContext* op_context) {
+  std::vector<int> output_shape_vector;
+
+  for (int idx = op_context->dims - 1; idx >= 0; --idx) {
+    int32_t stride = GetTensorData<int32_t>(op_context->strides)[idx];
+    TF_LITE_ENSURE_MSG(context, stride != 0, "stride value has to be non-zero");
+
+    int32_t begin = GetBeginValueAtIndex(op_context, idx);
+    int32_t end = GetEndValueAtIndex(op_context, idx);
+
+    // This is valid for both positive and negative strides
+    int32_t dim_shape = ceil((end - begin) / static_cast<float>(stride));
+    dim_shape = dim_shape < 0 ? 0 : dim_shape;
+    if (!(op_context->params->shrink_axis_mask & (1 << idx))) {
+      output_shape_vector.push_back(dim_shape);
+    }
+  }
+
+  TfLiteIntArray* output_shape =
+      TfLiteIntArrayCreate(output_shape_vector.size());
+
+  std::reverse_copy(output_shape_vector.begin(), output_shape_vector.end(),
+                    output_shape->data);
+
+  TF_LITE_ENSURE_STATUS(
+      context->ResizeTensor(context, op_context->output, output_shape));
+
+  return kTfLiteOk;
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 4);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  StridedSliceContext op_context(context, node);
+
+  // Ensure validity of input tensor and its dimension
+  TF_LITE_ENSURE_EQ(context, NumDimensions(op_context.begin), 1);
+  TF_LITE_ENSURE_EQ(context, NumDimensions(op_context.end), 1);
+  TF_LITE_ENSURE_EQ(context, NumDimensions(op_context.strides), 1);
+  TF_LITE_ENSURE_EQ(context, op_context.input->type, op_context.output->type);
+  // Only INT32 begin/end/strides are supported
+  // TODO(soroosh) add support for INT64
+  TF_LITE_ENSURE_EQ(context, op_context.begin->type, kTfLiteInt32);
+  TF_LITE_ENSURE_EQ(context, op_context.end->type, kTfLiteInt32);
+  TF_LITE_ENSURE_EQ(context, op_context.strides->type, kTfLiteInt32);
+  TF_LITE_ENSURE_MSG(context, op_context.dims <= 4,
+                     "StridedSlice op only supports 1D-4D input arrays.");
+
+  // TODO(soroosh): add the following missing functionalities
+  TF_LITE_ENSURE_MSG(context, op_context.params->ellipsis_mask == 0,
+                     "ellipsis_mask is not implemented yet.");
+  TF_LITE_ENSURE_MSG(context, op_context.params->new_axis_mask == 0,
+                     "new_axis_mask is not implemented yet.");
+
+  // Postpone allocation of output if any of the indexing tensors is not
+  // constant
+  if (!(IsConstantTensor(op_context.begin) &&
+        IsConstantTensor(op_context.end) &&
+        IsConstantTensor(op_context.strides))) {
+    SetTensorToDynamic(op_context.output);
+    return kTfLiteOk;
+  }
+  return ResizeOutputTensor(context, &op_context);
+}
+
 template <KernelType kernel_type>
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   StridedSliceContext op_context(context, node);
 
-  std::vector<int> starts;
-  std::vector<int> stops;
-  std::vector<int> strides;
+  if (IsDynamicTensor(op_context.output)) {
+    TF_LITE_ENSURE_OK(context, ResizeOutputTensor(context, &op_context));
+    TfLiteTensorRealloc(op_context.output->bytes, op_context.output);
+  }
 
-  // Determine size of output tensor and map indices
-  TfLiteIntArray* output_shape = TfLiteIntArrayCreate(op_context.dims);
-  for (int idx = op_context.dims - 1; idx >= 0; --idx) {
-    int dim = op_context.input->dims->data[idx];
-    int32_t stride = GetTensorData<int32_t>(op_context.strides)[idx];
-    TF_LITE_ENSURE_MSG(context, stride != 0, "stride value has to be non-zero");
-    bool pos_stride = stride > 0;
-
-    int32_t begin =
-        op_context.params->begin_mask & (1 << idx)
-            ? pos_stride ? 0 : dim - 1
-            : ClampedIndex(GetTensorData<int32_t>(op_context.begin)[idx], dim,
-                           pos_stride);
-    int32_t end =
-        op_context.params->end_mask & (1 << idx)
-            ? pos_stride ? dim : -1
-            : ClampedIndex(GetTensorData<int32_t>(op_context.end)[idx], dim,
-                           pos_stride);
+  std::vector<int32_t> starts;
+  std::vector<int32_t> stops;
+  std::vector<int32_t> strides;
 
-    // This is valid for both positive and negative strides
-    output_shape->data[idx] = ceil((end - begin) / static_cast<float>(stride));
-    output_shape->data[idx] =
-        output_shape->data[idx] < 0 ? 0 : output_shape->data[idx];
-    starts.emplace_back(begin);
-    stops.emplace_back(end);
-    strides.emplace_back(stride);
+  for (int idx = op_context.dims - 1; idx >= 0; --idx) {
+    starts.emplace_back(GetBeginValueAtIndex(&op_context, idx));
+    stops.emplace_back(GetEndValueAtIndex(&op_context, idx));
+    strides.emplace_back(GetTensorData<int32_t>(op_context.strides)[idx]);
   }
 
   for (int i = op_context.dims; i < kMaxDim; i++) {
@@ -188,27 +200,19 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
     strides.emplace_back(1);
   }
 
-  TF_LITE_ENSURE_STATUS(
-      context->ResizeTensor(context, op_context.output, output_shape));
-
-  size_t required_bytes;
-  TF_LITE_ENSURE_OK(
-      context,
-      BytesRequired(context, op_context.output->type, output_shape->data,
-                    output_shape->size, &required_bytes));
-  TfLiteTensorRealloc(required_bytes, op_context.output);
-
   op_context.params->begin_mask =
       ReverseMaskBits(op_context.params->begin_mask, op_context.dims);
   op_context.params->end_mask =
       ReverseMaskBits(op_context.params->end_mask, op_context.dims);
-
-#define TF_LITE_STRIDED_SLICE(kernel_type, data_type)                 \
-  kernel_type::StridedSlice(                                          \
-      GetTensorData<data_type>(op_context.input),                     \
-      GetTensorDims(op_context.input), op_context.params->begin_mask, \
-      op_context.params->end_mask, starts, stops, strides,            \
-      GetTensorData<data_type>(op_context.output),                    \
+  op_context.params->shrink_axis_mask =
+      ReverseMaskBits(op_context.params->shrink_axis_mask, op_context.dims);
+
+#define TF_LITE_STRIDED_SLICE(kernel_type, data_type)                      \
+  kernel_type::StridedSlice(                                               \
+      GetTensorData<data_type>(op_context.input),                          \
+      GetTensorDims(op_context.input), op_context.params->begin_mask,      \
+      op_context.params->end_mask, op_context.params->shrink_axis_mask,    \
+      starts, stops, strides, GetTensorData<data_type>(op_context.output), \
       GetTensorDims(op_context.output))
 
   switch (op_context.input->type) {
diff --git a/tensorflow/contrib/lite/kernels/strided_slice_test.cc b/tensorflow/contrib/lite/kernels/strided_slice_test.cc
index cd4a364682c0e66b2ceec92c0b34461945caf779..5bc7dc353b4904bdb182ce029b9b7c654e4a5f33 100644
--- a/tensorflow/contrib/lite/kernels/strided_slice_test.cc
+++ b/tensorflow/contrib/lite/kernels/strided_slice_test.cc
@@ -79,8 +79,6 @@ TEST(StridedSliceOpTest, UnssupportedArgs) {
                "ellipsis_mask is not implemented yet.");
   EXPECT_DEATH(StridedSliceOpModel({3, 2}, {2}, {2}, {2}, 0, 0, 0, 1, 0),
                "new_axis_mask is not implemented yet.");
-  EXPECT_DEATH(StridedSliceOpModel({3, 2}, {2}, {2}, {2}, 0, 0, 0, 0, 1),
-               "shrink_axis_mask is not implemented yet.");
 }
 
 TEST(StridedSliceOpTest, In1D) {
@@ -213,6 +211,7 @@ TEST(StridedSliceOpTest, In1D_EndMask) {
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3}));
   EXPECT_THAT(m.GetOutput(), ElementsAreArray({2, 3, 4}));
 }
+
 TEST(StridedSliceOpTest, In1D_NegStride) {
   StridedSliceOpModel m({3}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2, 3});
@@ -234,6 +233,7 @@ TEST(StridedSliceOpTest, In1D_EvenLenStride2) {
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1}));
   EXPECT_THAT(m.GetOutput(), ElementsAreArray({1}));
 }
+
 TEST(StridedSliceOpTest, In1D_OddLenStride2) {
   StridedSliceOpModel m({3}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2, 3});
@@ -255,6 +255,7 @@ TEST(StridedSliceOpTest, In2D_Identity) {
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 3}));
   EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 2, 3, 4, 5, 6}));
 }
+
 TEST(StridedSliceOpTest, In2D) {
   StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2, 3, 4, 5, 6});
@@ -320,6 +321,7 @@ TEST(StridedSliceOpTest, In2D_NegStrideBeginMask) {
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 3}));
   EXPECT_THAT(m.GetOutput(), ElementsAreArray({6, 5, 4}));
 }
+
 TEST(StridedSliceOpTest, In2D_NegStrideEndMask) {
   StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 0, 2, 0, 0, 0);
   m.SetInput({1, 2, 3, 4, 5, 6});
@@ -354,6 +356,7 @@ TEST(StridedSliceOpTest, In3D_NegStride) {
   EXPECT_THAT(m.GetOutput(),
               ElementsAreArray({12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1}));
 }
+
 TEST(StridedSliceOpTest, In3D_Strided2) {
   StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
@@ -365,6 +368,159 @@ TEST(StridedSliceOpTest, In3D_Strided2) {
   EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 5}));
 }
 
+TEST(StridedSliceOpTest, In1D_ShrinkAxisMask1) {
+  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 1);
+  m.SetInput({1, 2, 3, 4});
+  m.SetBegin({1});
+  m.SetEnd({3});
+  m.SetStrides({1});
+  m.Invoke();
+  EXPECT_TRUE(m.GetOutputShape().empty());
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({2}));
+}
+
+TEST(StridedSliceOpTest, In1D_EmptyOutputShrinkAxisMask1) {
+  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 1);
+  m.SetInput({1, 2, 3, 4});
+  m.SetBegin({2});
+  m.SetEnd({1});
+  m.SetStrides({1});
+  m.Invoke();
+  EXPECT_TRUE(m.GetOutputShape().empty());
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({3}));
+}
+
+TEST(StridedSliceOpTest, In1D_BeginMaskShrinkAxisMask1) {
+  StridedSliceOpModel m({4}, {1}, {1}, {1}, 1, 0, 0, 0, 1);
+  m.SetInput({1, 2, 3, 4});
+  m.SetBegin({1});
+  m.SetEnd({3});
+  m.SetStrides({1});
+  m.Invoke();
+  EXPECT_TRUE(m.GetOutputShape().empty());
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1}));
+}
+
+TEST(StridedSliceOpTest, In1D_NegativeBeginNegativeStrideShrinkAxisMask1) {
+  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 1);
+  m.SetInput({1, 2, 3, 4});
+  m.SetBegin({-2});
+  m.SetEnd({-3});
+  m.SetStrides({-1});
+  m.Invoke();
+  EXPECT_TRUE(m.GetOutputShape().empty());
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({3}));
+}
+
+TEST(StridedSliceOpTest, In2D_ShrinkAxisMask1) {
+  StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 1);
+  m.SetInput({1, 2, 3, 4, 5, 6});
+  m.SetBegin({0, 0});
+  m.SetEnd({2, 3});
+  m.SetStrides({1, 1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 2, 3}));
+}
+
+TEST(StridedSliceOpTest, In2D_ShrinkAxisMask2) {
+  StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 2);
+  m.SetInput({1, 2, 3, 4, 5, 6});
+  m.SetBegin({0, 0});
+  m.SetEnd({2, 3});
+  m.SetStrides({1, 1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 4}));
+}
+
+TEST(StridedSliceOpTest, In2D_ShrinkAxisMask3) {
+  StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 3);
+  m.SetInput({1, 2, 3, 4, 5, 6});
+  m.SetBegin({0, 0});
+  m.SetEnd({2, 3});
+  m.SetStrides({1, 1});
+  m.Invoke();
+  EXPECT_TRUE(m.GetOutputShape().empty());
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1}));
+}
+
+TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis1) {
+  StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 1);
+  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+  m.SetBegin({0, 0, 0});
+  m.SetEnd({2, 3, 2});
+  m.SetStrides({1, 1, 1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 2}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 2, 3, 4, 5, 6}));
+}
+
+TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis2) {
+  StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 2);
+  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+  m.SetBegin({0, 0, 0});
+  m.SetEnd({2, 3, 2});
+  m.SetStrides({1, 1, 1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 2}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 2, 7, 8}));
+}
+
+TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis3) {
+  StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 3);
+  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+  m.SetBegin({0, 0, 0});
+  m.SetEnd({2, 3, 2});
+  m.SetStrides({1, 1, 1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 2}));
+}
+
+TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis4) {
+  StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 4);
+  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+  m.SetBegin({0, 0, 0});
+  m.SetEnd({2, 3, 2});
+  m.SetStrides({1, 1, 1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 3}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 3, 5, 7, 9, 11}));
+}
+
+TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis5) {
+  StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 5);
+  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+  m.SetBegin({0, 0, 0});
+  m.SetEnd({2, 3, 2});
+  m.SetStrides({1, 1, 1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 3, 5}));
+}
+
+TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis6) {
+  StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 6);
+  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+  m.SetBegin({0, 0, 0});
+  m.SetEnd({2, 3, 2});
+  m.SetStrides({1, 1, 1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 7}));
+}
+
+TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis7) {
+  StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 7);
+  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+  m.SetBegin({0, 0, 0});
+  m.SetEnd({2, 3, 2});
+  m.SetStrides({1, 1, 1});
+  m.Invoke();
+  EXPECT_TRUE(m.GetOutputShape().empty());
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1}));
+}
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/contrib/lite/kernels/svdf.cc b/tensorflow/contrib/lite/kernels/svdf.cc
index 72f705fe4242b01c1516c99d3500484e8729fd9a..c69755447d5093e25d408eb6dea80750937465e7 100644
--- a/tensorflow/contrib/lite/kernels/svdf.cc
+++ b/tensorflow/contrib/lite/kernels/svdf.cc
@@ -15,8 +15,8 @@ limitations under the License.
 #include <unistd.h>
 #include <cassert>
 #include <cmath>
-#include <cstdlib>
 #include <cstdio>
+#include <cstdlib>
 #include <iostream>
 #include <limits>
 
diff --git a/tensorflow/contrib/lite/kernels/svdf_test.cc b/tensorflow/contrib/lite/kernels/svdf_test.cc
index 4de2ceaf053df31a4bc857fb250db416c071e80f..0f166dc69b95f3459388135b3a6c4d9b73a31cb4 100644
--- a/tensorflow/contrib/lite/kernels/svdf_test.cc
+++ b/tensorflow/contrib/lite/kernels/svdf_test.cc
@@ -14,8 +14,8 @@ limitations under the License.
 ==============================================================================*/
 // Unit test for TFLite SVDF op.
 
-#include <vector>
 #include <iomanip>
+#include <vector>
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
diff --git a/tensorflow/contrib/lite/kernels/test_util.cc b/tensorflow/contrib/lite/kernels/test_util.cc
index 3a58e7ec321f649a6cae4cc0969807c2c74c6529..6f56aa6bf38781e860e33e8ac3b6a0bb8b50bb01 100644
--- a/tensorflow/contrib/lite/kernels/test_util.cc
+++ b/tensorflow/contrib/lite/kernels/test_util.cc
@@ -172,11 +172,14 @@ void SingleOpModel::BuildInterpreter(
 
   auto* model = GetModel(builder_.GetBufferPointer());
 
-  ops::builtin::BuiltinOpResolver builtins;
-  for (const auto& reg : custom_registrations_) {
-    builtins.AddCustom(reg.first.data(), reg.second());
+  if (!resolver_) {
+    auto resolver = new ops::builtin::BuiltinOpResolver();
+    for (const auto& reg : custom_registrations_) {
+      resolver->AddCustom(reg.first.data(), reg.second());
+    }
+    resolver_ = std::unique_ptr<OpResolver>(resolver);
   }
-  InterpreterBuilder(model, builtins)(&interpreter_);
+  InterpreterBuilder(model, *resolver_)(&interpreter_);
 
   CHECK(interpreter_ != nullptr);
 
diff --git a/tensorflow/contrib/lite/kernels/test_util.h b/tensorflow/contrib/lite/kernels/test_util.h
index cc445299ff9f0b75610c7ff38f28facbbbe5587d..7d476ba1eaffbb24fb77390c0e71c32d60b6411e 100644
--- a/tensorflow/contrib/lite/kernels/test_util.h
+++ b/tensorflow/contrib/lite/kernels/test_util.h
@@ -85,6 +85,23 @@ struct TensorData {
   int32_t zero_point;
 };
 
+class SingleOpResolver : public OpResolver {
+ public:
+  SingleOpResolver(const BuiltinOperator op, TfLiteRegistration* registration)
+      : op_(op), registration_(registration) {}
+  TfLiteRegistration* FindOp(BuiltinOperator op) const override {
+    if (op == op_) {
+      return registration_;
+    }
+    return nullptr;
+  }
+  TfLiteRegistration* FindOp(const char* op) const override { return nullptr; }
+
+ private:
+  const BuiltinOperator op_;
+  TfLiteRegistration* registration_;
+};
+
 class SingleOpModel {
  public:
   SingleOpModel() {}
@@ -178,11 +195,16 @@ class SingleOpModel {
     return result;
   }
 
+  void SetResolver(std::unique_ptr<OpResolver> resolver) {
+    resolver_ = std::move(resolver);
+  }
+
  protected:
   int32_t GetTensorSize(int index) const;
 
   flatbuffers::FlatBufferBuilder builder_;
   std::unique_ptr<tflite::Interpreter> interpreter_;
+  std::unique_ptr<OpResolver> resolver_;
 
  private:
   int AddTensor(TensorData t, std::initializer_list<int> data);
@@ -197,6 +219,36 @@ class SingleOpModel {
   std::map<string, std::function<TfLiteRegistration*()>> custom_registrations_;
 };
 
+// Base class for single op unit tests.
+// The tests are parameterized to test multiple kernels for a single op.
+// The parameters are strings like "optimized" and "reference" to have better
+// readability in test reports.
+//
+// To use this class:
+// * Define a constant map from strings to TfLiteRegistration.
+// * Implement a test class that inherits SingleOpTest.
+// * Instantiate the test cases with SingleOpTest::GetKernelTags helper
+//   function.
+// * Call GetRegistration to get the TfLiteRegistration to be used before
+//   building the interpreter.
+class SingleOpTest : public ::testing::TestWithParam<string> {
+ public:
+  static std::vector<string> GetKernelTags(
+      const std::map<string, TfLiteRegistration*>& kernel_map) {
+    std::vector<string> tags;
+    for (auto it : kernel_map) {
+      tags.push_back(it.first);
+    }
+    return tags;
+  }
+
+ protected:
+  virtual const std::map<string, TfLiteRegistration*>& GetKernelMap() = 0;
+  TfLiteRegistration* GetRegistration() {
+    return GetKernelMap().at(GetParam());
+  }
+};
+
 // Strings have a special implementation that is in test_util.cc
 template <>
 std::vector<string> SingleOpModel::ExtractVector(int index);
diff --git a/tensorflow/contrib/lite/kernels/transpose.cc b/tensorflow/contrib/lite/kernels/transpose.cc
index 75d8136b6a26efd805d9fc8e9db26dce2cfcfcb1..093814bc4494041809d6e82965519e38a8997f49 100644
--- a/tensorflow/contrib/lite/kernels/transpose.cc
+++ b/tensorflow/contrib/lite/kernels/transpose.cc
@@ -31,60 +31,78 @@ enum KernelType {
   kReference,
 };
 
-// TODO(nupurgarg): Permutation arrays represented as a tensor are ignored. Only
-// use the `perm` specified in `params`.
 struct TransposeContext {
   TransposeContext(TfLiteContext* context, TfLiteNode* node) {
-    params = reinterpret_cast<TfLiteTransposeParams*>(node->builtin_data);
     input = GetInput(context, node, 0);
+    perm = GetInput(context, node, 1);
     output = GetOutput(context, node, 0);
   }
-  TfLiteTransposeParams* params;
   TfLiteTensor* input;
+  TfLiteTensor* perm;
   TfLiteTensor* output;
 };
 
-TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
-  TF_LITE_ENSURE(context, NumInputs(node) == 1 || NumInputs(node) == 2);
-  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+TfLiteStatus ResizeOutputTensor(TfLiteContext* context,
+                                TransposeContext* op_context) {
+  int dims = NumDimensions(op_context->input);
+  const int* perm_data = GetTensorData<int32_t>(op_context->perm);
 
-  TransposeContext op_context(context, node);
-  int dims = NumDimensions(op_context.input);
-
-  // Ensure validity of input tensor and permutation array.
-  TF_LITE_ENSURE_EQ(context, op_context.input->type, op_context.output->type);
-  TF_LITE_ENSURE_EQ(context, dims, op_context.params->num_dimensions);
-  TF_LITE_ENSURE_MSG(context, dims <= 4,
-                     "Transpose op only supports 1D-4D input arrays.");
+  // Ensure validity of the permutations tensor as a 1D tensor.
+  TF_LITE_ENSURE_EQ(context, NumDimensions(op_context->perm), 1);
+  TF_LITE_ENSURE_EQ(context, op_context->perm->dims->data[0], dims);
   for (int idx = 0; idx < dims; ++idx) {
-    TF_LITE_ENSURE_MSG(context,
-                       op_context.params->perm[idx] >= 0 &&
-                           op_context.params->perm[idx] < dims,
+    TF_LITE_ENSURE_MSG(context, (perm_data[idx] >= 0 && perm_data[idx] < dims),
                        "Transpose op permutations array is out of bounds.");
   }
 
   // Determine size of output tensor.
-  const TfLiteIntArray* input_size = op_context.input->dims;
-  TfLiteIntArray* output_size = TfLiteIntArrayCreate(dims);
+  TfLiteIntArray* input_size = op_context->input->dims;
+  TfLiteIntArray* output_size = TfLiteIntArrayCopy(input_size);
   for (int idx = 0; idx < dims; ++idx) {
-    output_size->data[idx] = input_size->data[op_context.params->perm[idx]];
+    output_size->data[idx] = input_size->data[perm_data[idx]];
   }
 
-  return context->ResizeTensor(context, op_context.output, output_size);
+  return context->ResizeTensor(context, op_context->output, output_size);
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  TransposeContext op_context(context, node);
+
+  // Ensure validity of input tensor.
+  TF_LITE_ENSURE_MSG(context, NumDimensions(op_context.input) <= 4,
+                     "Transpose op only supports 1D-4D input arrays.");
+  TF_LITE_ENSURE_EQ(context, op_context.input->type, op_context.output->type);
+
+  if (!IsConstantTensor(op_context.perm)) {
+    SetTensorToDynamic(op_context.output);
+    return kTfLiteOk;
+  }
+  return ResizeOutputTensor(context, &op_context);
 }
 
 template <KernelType kernel_type>
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   TransposeContext op_context(context, node);
 
+  // Resize the output tensor if the output tensor is dynamic.
+  if (IsDynamicTensor(op_context.output)) {
+    TF_LITE_ENSURE_OK(context, ResizeOutputTensor(context, &op_context));
+    TfLiteTensorRealloc(op_context.output->bytes, op_context.output);
+  }
+
   // Reverse the permuted axes and convert to 4D due to the way Dims are
   // constructed in GetTensorDims.
+  const int* perm_data = GetTensorData<int32_t>(op_context.perm);
+  const int size = op_context.perm->dims->data[0];
   const int kOutputDimensionNum = 4;
   int reversed_perm[kOutputDimensionNum];
-  int size = op_context.params->num_dimensions;
+
   for (int output_k = 0, input_k = size - 1; output_k < size;
        ++output_k, --input_k) {
-    reversed_perm[output_k] = size - op_context.params->perm[input_k] - 1;
+    reversed_perm[output_k] = size - perm_data[input_k] - 1;
   }
   for (int k = size; k < kOutputDimensionNum; ++k) {
     reversed_perm[k] = k;
diff --git a/tensorflow/contrib/lite/kernels/transpose_test.cc b/tensorflow/contrib/lite/kernels/transpose_test.cc
index 7f5832cd5fa3d502b52bf5554111b45136b588ae..337bc144b967392523bf784603cca4c1b968cdf2 100644
--- a/tensorflow/contrib/lite/kernels/transpose_test.cc
+++ b/tensorflow/contrib/lite/kernels/transpose_test.cc
@@ -127,61 +127,124 @@ TEST(TransposeTest, TestRefOps4D) {
 
 class TransposeOpModel : public SingleOpModel {
  public:
-  TransposeOpModel(std::initializer_list<int> input_shape,
-                   std::initializer_list<int> perm) {
-    input_ = AddInput(TensorType_FLOAT32);
-    output_ = AddOutput(TensorType_FLOAT32);
-    SetBuiltinOp(
-        BuiltinOperator_TRANSPOSE, BuiltinOptions_TransposeOptions,
-        CreateTransposeOptions(builder_, builder_.CreateVector<int>(perm))
-            .Union());
-    BuildInterpreter({input_shape});
-  }
-
   void SetInput(std::initializer_list<float> data) {
     PopulateTensor<float>(input_, data);
   }
 
+  void SetPerm(std::initializer_list<int> data) {
+    PopulateTensor<int>(perm_, data);
+  }
+
   std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
   std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
 
- private:
+ protected:
   int input_;
+  int perm_;
   int output_;
 };
 
+// Tests case where perm is a const tensor.
+//
+// Example usage is as follows:
+//    SpaceToBatchNDOpConstModel m(input_shape, perm_shape, perm_data);
+//    m.SetInput(input_data);
+//    m.Invoke();
+class TransposeOpConstModel : public TransposeOpModel {
+ public:
+  TransposeOpConstModel(std::initializer_list<int> input_shape,
+                        std::initializer_list<int> perm_shape,
+                        std::initializer_list<int> perm) {
+    input_ = AddInput(TensorType_FLOAT32);
+    perm_ = AddConstInput(TensorType_INT32, perm, perm_shape);
+    output_ = AddOutput(TensorType_FLOAT32);
+    SetBuiltinOp(BuiltinOperator_TRANSPOSE, BuiltinOptions_TransposeOptions,
+                 CreateTransposeOptions(builder_).Union());
+    BuildInterpreter({input_shape});
+  }
+};
+
+// Tests case where perm is a non-const tensor.
+//
+// Example usage is as follows:
+//    TransposeOpDynamicModel m(input_shape, perm_shape);
+//    m.SetInput(input_data);
+//    m.SetPerm(perm_data);
+//    m.Invoke();
+class TransposeOpDynamicModel : public TransposeOpModel {
+ public:
+  TransposeOpDynamicModel(std::initializer_list<int> input_shape,
+                          std::initializer_list<int> perm_shape) {
+    input_ = AddInput(TensorType_FLOAT32);
+    perm_ = AddInput(TensorType_INT32);
+    output_ = AddOutput(TensorType_FLOAT32);
+    SetBuiltinOp(BuiltinOperator_TRANSPOSE, BuiltinOptions_TransposeOptions,
+                 CreateTransposeOptions(builder_).Union());
+    BuildInterpreter({input_shape, perm_shape});
+  }
+};
+
 TEST(TransposeTest, TestUnequalPermSize) {
-  EXPECT_DEATH(TransposeOpModel({1, 3, 3, 1}, {2, 2}),
-               "dims != op_context.params->num_dimensions");
+  EXPECT_DEATH(TransposeOpConstModel({1, 3, 3, 1}, {2}, {2, 2}), "2 != 4");
 }
 
 TEST(TransposeTest, TestPermOutOfBounds) {
-  EXPECT_DEATH(TransposeOpModel({1, 3, 3, 1}, {0, -1, -2, -3}),
+  EXPECT_DEATH(TransposeOpConstModel({1, 3, 3, 1}, {4}, {0, -1, -2, -3}),
                "Transpose op permutations array is out of bounds.");
-  EXPECT_DEATH(TransposeOpModel({1, 3, 3, 1}, {0, 1, 2, 4}),
+  EXPECT_DEATH(TransposeOpConstModel({1, 3, 3, 1}, {4}, {0, 1, 2, 4}),
                "Transpose op permutations array is out of bounds.");
 }
 
-TEST(TransposeTest, Test1DInputTensor) {
-  TransposeOpModel m({3}, {0});
+TEST(TransposeTest, Test1DInputConstTensor) {
+  TransposeOpConstModel m({3}, {1}, {0});
   m.SetInput({1, 2, 3});
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3}));
   EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 2, 3}));
 }
 
-TEST(TransposeTest, Test2DInputTensor) {
-  TransposeOpModel m({3, 2}, {1, 0});
+TEST(TransposeTest, Test1DInputDynamicTensor) {
+  TransposeOpDynamicModel m({3}, {1});
+  m.SetInput({1, 2, 3});
+  m.SetPerm({0});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 2, 3}));
+}
+
+TEST(TransposeTest, Test2DInputConstTensor) {
+  TransposeOpConstModel m({3, 2}, {2}, {1, 0});
+  m.SetInput({0, 1, 2, 3, 4, 5});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 3}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({0, 2, 4, 1, 3, 5}));
+}
+
+TEST(TransposeTest, Test2DInputDynamicTensor) {
+  TransposeOpDynamicModel m({3, 2}, {2});
   m.SetInput({0, 1, 2, 3, 4, 5});
+  m.SetPerm({1, 0});
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 3}));
   EXPECT_THAT(m.GetOutput(), ElementsAreArray({0, 2, 4, 1, 3, 5}));
 }
 
-TEST(TransposeTest, Test3DInputTensor) {
-  TransposeOpModel m({2, 3, 4}, {2, 0, 1});
+TEST(TransposeTest, Test3DInputConstTensor) {
+  TransposeOpConstModel m({2, 3, 4}, {3}, {2, 0, 1});
+  m.SetInput({0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11,
+              12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({4, 2, 3}));
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray({0, 4, 8,  12, 16, 20, 1, 5, 9,  13, 17, 21,
+                                2, 6, 10, 14, 18, 22, 3, 7, 11, 15, 19, 23}));
+}
+
+TEST(TransposeTest, Test3DInputDynamicTensor) {
+  TransposeOpDynamicModel m({2, 3, 4}, {3});
   m.SetInput({0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11,
               12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23});
+  m.SetPerm({2, 0, 1});
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({4, 2, 3}));
   EXPECT_THAT(m.GetOutput(),
@@ -190,28 +253,64 @@ TEST(TransposeTest, Test3DInputTensor) {
 }
 
 TEST(TransposeTest, Test5DInputTensor) {
-  EXPECT_DEATH(TransposeOpModel({1, 2, 3, 4, 5}, {0, 1, 2, 3, 4}),
+  EXPECT_DEATH(TransposeOpConstModel({1, 2, 3, 4, 5}, {5}, {0, 1, 2, 3, 4}),
                "Transpose op only supports 1D-4D input arrays.");
 }
 
-TEST(TransposeTest, SimpleTestNoReorder) {
-  TransposeOpModel m({1, 2, 3, 1}, {0, 1, 2, 3});
+TEST(TransposeTest, SimpleTestNoReorderConstTensor) {
+  TransposeOpConstModel m({1, 2, 3, 1}, {4}, {0, 1, 2, 3});
+  m.SetInput({1, 2, 3, 4, 5, 6});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2, 3, 1}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 2, 3, 4, 5, 6}));
+}
+
+TEST(TransposeTest, SimpleTestNoReorderDynamicTensor) {
+  TransposeOpDynamicModel m({1, 2, 3, 1}, {4});
   m.SetInput({1, 2, 3, 4, 5, 6});
+  m.SetPerm({0, 1, 2, 3});
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2, 3, 1}));
   EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 2, 3, 4, 5, 6}));
 }
 
-TEST(TransposeTest, SimpleTestWithReorder) {
-  TransposeOpModel m({1, 2, 3, 1}, {2, 1, 3, 0});
+TEST(TransposeTest, SimpleTestWithReorderConstTensor) {
+  TransposeOpConstModel m({1, 2, 3, 1}, {4}, {2, 1, 3, 0});
   m.SetInput({1, 2, 3, 4, 5, 6});
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 2, 1, 1}));
   EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 4, 2, 5, 3, 6}));
 }
 
-TEST(TransposeTest, ComplexTestWithReorder) {
-  TransposeOpModel m({2, 3, 4, 5}, {2, 0, 1, 3});
+TEST(TransposeTest, ComplexTestWithReorderConstTensor) {
+  TransposeOpConstModel m({2, 3, 4, 5}, {4}, {2, 0, 1, 3});
+  m.SetInput({0,   1,   2,   3,   4,   5,   6,   7,   8,   9,   10,  11,
+              12,  13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,
+              24,  25,  26,  27,  28,  29,  30,  31,  32,  33,  34,  35,
+              36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,
+              48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,
+              60,  61,  62,  63,  64,  65,  66,  67,  68,  69,  70,  71,
+              72,  73,  74,  75,  76,  77,  78,  79,  80,  81,  82,  83,
+              84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,
+              96,  97,  98,  99,  100, 101, 102, 103, 104, 105, 106, 107,
+              108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({4, 2, 3, 5}));
+  auto result = ElementsAreArray(
+      {0,  1,  2,  3,  4,  20, 21, 22, 23, 24, 40,  41,  42,  43,  44,
+       60, 61, 62, 63, 64, 80, 81, 82, 83, 84, 100, 101, 102, 103, 104,
+       5,  6,  7,  8,  9,  25, 26, 27, 28, 29, 45,  46,  47,  48,  49,
+       65, 66, 67, 68, 69, 85, 86, 87, 88, 89, 105, 106, 107, 108, 109,
+       10, 11, 12, 13, 14, 30, 31, 32, 33, 34, 50,  51,  52,  53,  54,
+       70, 71, 72, 73, 74, 90, 91, 92, 93, 94, 110, 111, 112, 113, 114,
+       15, 16, 17, 18, 19, 35, 36, 37, 38, 39, 55,  56,  57,  58,  59,
+       75, 76, 77, 78, 79, 95, 96, 97, 98, 99, 115, 116, 117, 118, 119});
+  EXPECT_THAT(m.GetOutput(), result);
+}
+
+TEST(TransposeTest, ComplexTestWithReorderDynamicTensor) {
+  TransposeOpDynamicModel m({2, 3, 4, 5}, {4});
   m.SetInput({0,   1,   2,   3,   4,   5,   6,   7,   8,   9,   10,  11,
               12,  13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,
               24,  25,  26,  27,  28,  29,  30,  31,  32,  33,  34,  35,
@@ -222,6 +321,7 @@ TEST(TransposeTest, ComplexTestWithReorder) {
               84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,
               96,  97,  98,  99,  100, 101, 102, 103, 104, 105, 106, 107,
               108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119});
+  m.SetPerm({2, 0, 1, 3});
   m.Invoke();
 
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({4, 2, 3, 5}));
diff --git a/tensorflow/contrib/lite/kernels/unidirectional_sequence_rnn.cc b/tensorflow/contrib/lite/kernels/unidirectional_sequence_rnn.cc
index f5f1ec2cf3f45ae730b849b18e2b85fac50159c7..7ce87e4deb81b95ae08b0c55397873ae4582eec6 100644
--- a/tensorflow/contrib/lite/kernels/unidirectional_sequence_rnn.cc
+++ b/tensorflow/contrib/lite/kernels/unidirectional_sequence_rnn.cc
@@ -15,8 +15,8 @@ limitations under the License.
 #include <unistd.h>
 #include <cassert>
 #include <cmath>
-#include <cstdlib>
 #include <cstdio>
+#include <cstdlib>
 #include <iostream>
 #include <limits>
 
@@ -82,8 +82,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   output_size_array->data[0] = (time_major) ? max_time : batch_size;
   output_size_array->data[1] = (time_major) ? batch_size : max_time;
   output_size_array->data[2] = num_units;
-  TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, output,
-                                                   output_size_array));
+  TF_LITE_ENSURE_OK(context,
+                    context->ResizeTensor(context, output, output_size_array));
 
   return kTfLiteOk;
 }
diff --git a/tensorflow/contrib/lite/kernels/unidirectional_sequence_rnn_test.cc b/tensorflow/contrib/lite/kernels/unidirectional_sequence_rnn_test.cc
index 82c680ec3d8656004d721c8498292677cb061b6b..7e32969763b59620dc3534708f965750680002d2 100644
--- a/tensorflow/contrib/lite/kernels/unidirectional_sequence_rnn_test.cc
+++ b/tensorflow/contrib/lite/kernels/unidirectional_sequence_rnn_test.cc
@@ -14,8 +14,8 @@ limitations under the License.
 ==============================================================================*/
 // Unit test for TFLite Sequential RNN op.
 
-#include <vector>
 #include <iomanip>
+#include <vector>
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
@@ -120,8 +120,7 @@ static float rnn_golden_output[] = {
 
     0.415153,   0.210318,   0,         0,         0,         0,
     0,          2.02616,    0,         0.728256,  0.84183,   0.0907453,
-    0.628881,   3.58099,    1.49974,   0
-};
+    0.628881,   3.58099,    1.49974,   0};
 
 class UnidirectionalRNNOpModel : public SingleOpModel {
  public:
diff --git a/tensorflow/contrib/lite/lib_package/create_ios_frameworks.sh b/tensorflow/contrib/lite/lib_package/create_ios_frameworks.sh
new file mode 100755
index 0000000000000000000000000000000000000000..b58ae266017caf8781c28331f49a8f5bc1550767
--- /dev/null
+++ b/tensorflow/contrib/lite/lib_package/create_ios_frameworks.sh
@@ -0,0 +1,81 @@
+#!/bin/bash -x
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+set -e
+
+echo "Starting"
+TFLITE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/.."
+
+TMP_DIR=$(mktemp -d)
+echo "Package dir: " $TMP_DIR
+FW_DIR=$TMP_DIR/tensorflow_lite_ios_frameworks
+FW_DIR_TFLITE=$FW_DIR/tensorflow_lite.framework
+FW_DIR_TFLITE_HDRS=$FW_DIR_TFLITE/Headers
+
+echo "Creating target Headers directories"
+mkdir -p $FW_DIR_TFLITE_HDRS
+
+echo "Headers, populating: TensorFlow Lite"
+cd $TFLITE_DIR/../../..
+
+find tensorflow/contrib/lite -name '*.h' \
+    -not -path 'tensorflow/contrib/lite/downloads/*' \
+    -not -path 'tensorflow/contrib/lite/examples/*' \
+    -not -path 'tensorflow/contrib/lite/gen/*' \
+    -not -path 'tensorflow/contrib/lite/toco/*' \
+    -not -path 'tensorflow/contrib/lite/nnapi/*' \
+    -not -path 'tensorflow/contrib/lite/java/*' \
+    | tar -cf $FW_DIR_TFLITE_HDRS/tmp.tar -T -
+cd $FW_DIR_TFLITE_HDRS
+tar xf tmp.tar
+rm -f tmp.tar
+
+echo "Headers, populating: Flatbuffer"
+cd $TFLITE_DIR/downloads/flatbuffers/include/
+find . -name '*.h' | tar -cf $FW_DIR_TFLITE_HDRS/tmp.tar -T -
+cd $FW_DIR_TFLITE_HDRS
+tar xf tmp.tar
+rm -f tmp.tar
+
+cd $TFLITE_DIR/../../..
+echo "Generate master LICENSE file and copy to target"
+bazel build //tensorflow/tools/lib_package:clicenses_generate
+cp $TFLITE_DIR/../../../bazel-genfiles/tensorflow/tools/lib_package/include/tensorflow/c/LICENSE \
+   $FW_DIR_TFLITE
+
+echo "Copying static libraries"
+cp $TFLITE_DIR/gen/lib/libtensorflow-lite.a \
+   $FW_DIR_TFLITE/tensorflow_lite
+
+# This is required, otherwise they interfere with the documentation of the
+# pod at cocoapods.org.
+echo "Remove all README files"
+cd $FW_DIR_TFLITE_HDRS
+find . -type f -name README\* -exec rm -f {} \;
+find . -type f -name readme\* -exec rm -f {} \;
+
+TARGET_GEN_LOCATION="$TFLITE_DIR/gen/ios_frameworks"
+echo "Moving results to target: " $TARGET_GEN_LOCATION
+cd $FW_DIR
+zip -q -r tensorflow_lite.framework.zip tensorflow_lite.framework -x .DS_Store
+rm -rf $TARGET_GEN_LOCATION
+mkdir -p $TARGET_GEN_LOCATION
+cp -r tensorflow_lite.framework.zip $TARGET_GEN_LOCATION
+
+echo "Cleaning up"
+rm -rf $TMP_DIR
+
+echo "Finished"
diff --git a/tensorflow/contrib/lite/model.cc b/tensorflow/contrib/lite/model.cc
index 303a10af03e582d5e4e641c15072e1c9d594e1f4..b36bfcef843cc3b7a9647192c18dfb0dbf6307e9 100644
--- a/tensorflow/contrib/lite/model.cc
+++ b/tensorflow/contrib/lite/model.cc
@@ -30,17 +30,6 @@ limitations under the License.
 
 namespace tflite {
 
-namespace {
-inline const tflite::Model* VerifyAndGetModel(const void* buf, size_t len) {
-  ::flatbuffers::Verifier verifier(static_cast<const uint8_t*>(buf), len);
-  if (VerifyModelBuffer(verifier)) {
-    return ::tflite::GetModel(buf);
-  } else {
-    return nullptr;
-  }
-}
-}  // namespace
-
 const char* kEmptyTensorName = "";
 
 std::unique_ptr<FlatBufferModel> FlatBufferModel::BuildFromFile(
@@ -82,7 +71,7 @@ FlatBufferModel::FlatBufferModel(const char* filename, bool mmap_file,
   }
   if (!allocation_->valid() || !CheckModelIdentifier()) return;
 
-  model_ = VerifyAndGetModel(allocation_->base(), allocation_->bytes());
+  model_ = ::tflite::GetModel(allocation_->base());
 }
 
 bool FlatBufferModel::CheckModelIdentifier() const {
@@ -103,7 +92,7 @@ FlatBufferModel::FlatBufferModel(const char* ptr, size_t num_bytes,
   allocation_ = new MemoryAllocation(ptr, num_bytes, error_reporter);
   if (!allocation_->valid()) return;
 
-  model_ = VerifyAndGetModel(allocation_->base(), allocation_->bytes());
+  model_ = ::tflite::GetModel(allocation_->base());
 }
 
 FlatBufferModel::FlatBufferModel(const Model* model,
@@ -339,6 +328,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
       builtin_data = reinterpret_cast<void*>(params);
       break;
     }
+    case BuiltinOperator_BIDIRECTIONAL_SEQUENCE_RNN:
     case BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_RNN: {
       TfLiteSequenceRNNParams* params = MallocPOD<TfLiteSequenceRNNParams>();
       if (auto* sequence_rnn_params =
@@ -476,6 +466,11 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
       break;
     }
     case BuiltinOperator_RESIZE_BILINEAR: {
+      auto* params = MallocPOD<TfLiteResizeBilinearParams>();
+      if (auto* schema_params =
+              op->builtin_options_as_ResizeBilinearOptions()) {
+      }
+      builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_PAD: {
@@ -521,62 +516,18 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
       break;
     }
     case BuiltinOperator_SPACE_TO_BATCH_ND: {
-      auto* params = MallocPOD<TfLiteSpaceToBatchNDParams>();
-      if (auto* schema_params =
-              op->builtin_options_as_SpaceToBatchNDOptions()) {
-        const auto& block_shape = schema_params->block_shape();
-        FlatBufferIntVectorToArray(sizeof(params->block_shape), block_shape,
-                                   params->block_shape, error_reporter);
-        const auto& before_paddings = schema_params->before_paddings();
-        FlatBufferIntVectorToArray(sizeof(params->before_paddings),
-                                   before_paddings, params->before_paddings,
-                                   error_reporter);
-        const auto& after_paddings = schema_params->after_paddings();
-        FlatBufferIntVectorToArray(sizeof(params->after_paddings),
-                                   after_paddings, params->after_paddings,
-                                   error_reporter);
-        params->num_spatial_dimensions = block_shape->Length();
-      }
-      builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_BATCH_TO_SPACE_ND: {
-      auto* params = MallocPOD<TfLiteBatchToSpaceNDParams>();
-      if (auto* schema_params =
-              op->builtin_options_as_BatchToSpaceNDOptions()) {
-        const auto& block_shape = schema_params->block_shape();
-        FlatBufferIntVectorToArray(sizeof(params->block_shape), block_shape,
-                                   params->block_shape, error_reporter);
-        const auto& before_crops = schema_params->before_crops();
-        FlatBufferIntVectorToArray(sizeof(params->before_crops), before_crops,
-                                   params->before_crops, error_reporter);
-        const auto& after_crops = schema_params->after_crops();
-        FlatBufferIntVectorToArray(sizeof(params->after_crops), after_crops,
-                                   params->after_crops, error_reporter);
-        params->num_spatial_dimensions = block_shape->Length();
-      }
-      builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_TRANSPOSE: {
-      auto* params = MallocPOD<TfLiteTransposeParams>();
-      if (auto* schema_params = op->builtin_options_as_TransposeOptions()) {
-        const auto& perm = schema_params->perm();
-        FlatBufferIntVectorToArray(sizeof(params->perm), perm, params->perm,
-                                   error_reporter);
-        params->num_dimensions = perm->Length();
-      }
-      builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_MEAN: {
       auto* params = MallocPOD<TfLiteMeanParams>();
       if (auto* schema_params = op->builtin_options_as_MeanOptions()) {
-        const auto& axis = schema_params->axis();
-        FlatBufferIntVectorToArray(sizeof(params->axis), axis, params->axis,
-                                   error_reporter);
         params->keep_dims = schema_params->keep_dims();
-        params->num_axis_dimensions = axis->Length();
       }
       builtin_data = reinterpret_cast<void*>(params);
       break;
diff --git a/tensorflow/contrib/lite/model_test.cc b/tensorflow/contrib/lite/model_test.cc
index 5330c8f594593655b2a8776cf6b399c0d16cdc19..66f22fd66a9ae0d35553a1f780ef73a5c5994c99 100644
--- a/tensorflow/contrib/lite/model_test.cc
+++ b/tensorflow/contrib/lite/model_test.cc
@@ -20,7 +20,6 @@ limitations under the License.
 #include <sys/stat.h>
 #include <sys/types.h>
 #include <unistd.h>
-#include <string>
 
 #include "tensorflow/contrib/lite/model.h"
 
@@ -247,14 +246,6 @@ TEST(BasicFlatBufferModel, TestNullErrorReporter) {
   ASSERT_NE(interpreter->Invoke(), kTfLiteOk);
 }
 
-// Test what happens if we cannot bind any of the ops.
-TEST(BasicFlatBufferModel, TestBuildModelFromCorruptedData) {
-  std::string corrupted_data = "123";
-  auto model = FlatBufferModel::BuildFromBuffer(corrupted_data.c_str(),
-                                                corrupted_data.length());
-  ASSERT_FALSE(model);
-}
-
 // Test that loading model directly from a Model flatbuffer works.
 TEST(BasicFlatBufferModel, TestBuildFromModel) {
   TestErrorReporter reporter;
diff --git a/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h b/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h
index 7019c29959fc02f4f84d1e4c8cf280751e585de0..76032771af2c8e099aed498b2071816646f3b606 100644
--- a/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h
+++ b/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h
@@ -1571,7 +1571,7 @@ inline int ANeuralNetworksModel_addOperation(ANeuralNetworksModel* model,
 }
 
 /**
- * Specfifies which operands will be the model's inputs and outputs.
+ * Specifies which operands will be the model's inputs and outputs.
  *
  * An operand cannot be used for both input and output. Doing so will
  * return an error.
diff --git a/tensorflow/contrib/lite/nnapi_delegate.cc b/tensorflow/contrib/lite/nnapi_delegate.cc
index d5b9319407a461c571411c44ae702c137c914fa9..da9ceec2f1401745ba477824bf494ee5b0ee1187 100644
--- a/tensorflow/contrib/lite/nnapi_delegate.cc
+++ b/tensorflow/contrib/lite/nnapi_delegate.cc
@@ -319,6 +319,7 @@ void AddOpsAndParams(tflite::Interpreter* interpreter,
       case tflite::BuiltinOperator_SVDF:
       case tflite::BuiltinOperator_HASHTABLE_LOOKUP:
       case tflite::BuiltinOperator_RNN:
+      case tflite::BuiltinOperator_BIDIRECTIONAL_SEQUENCE_RNN:
       case tflite::BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_RNN:
       case tflite::BuiltinOperator_EMBEDDING_LOOKUP:
       case tflite::BuiltinOperator_EMBEDDING_LOOKUP_SPARSE:
diff --git a/tensorflow/contrib/lite/python/BUILD b/tensorflow/contrib/lite/python/BUILD
index 3d6a3ec0fd4c673f601254b19452bbf8b9454e27..2d8c49b7d7a5ae5c180f100a399a1870679c455f 100644
--- a/tensorflow/contrib/lite/python/BUILD
+++ b/tensorflow/contrib/lite/python/BUILD
@@ -13,6 +13,7 @@ py_library(
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
     deps = [
+        ":op_hint",
         "//tensorflow/contrib/lite/toco:model_flags_proto_py",
         "//tensorflow/contrib/lite/toco:toco_flags_proto_py",
         "//tensorflow/contrib/lite/toco/python:tensorflow_wrap_toco",
@@ -20,6 +21,17 @@ py_library(
     ],
 )
 
+py_library(
+    name = "op_hint",
+    srcs = ["op_hint.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/contrib/framework:framework_py",
+        "//tensorflow/python:platform",
+    ],
+)
+
 py_test(
     name = "lite_test",
     srcs = ["lite_test.py"],
@@ -27,6 +39,7 @@ py_test(
     tags = ["no_oss"],
     deps = [
         ":lite",
+        ":op_hint",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:dtypes",
diff --git a/tensorflow/contrib/lite/python/lite.py b/tensorflow/contrib/lite/python/lite.py
index 3c369774beda57cca3bc1ea0ab9a9ad619841e7e..5d2f21653762a405a57288a7ba38323e5e42b3e1 100644
--- a/tensorflow/contrib/lite/python/lite.py
+++ b/tensorflow/contrib/lite/python/lite.py
@@ -18,16 +18,21 @@ EXPERIMENTAL: APIs here are unstable and likely to change without notice.
 
 @@toco_convert
 @@toco_convert_protos
+@@OpHint
+@@convert_op_hints_to_stubs
 
 """
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
-
 import os
 import subprocess
 import tempfile
 
+# pylint: disable=unused-import
+from tensorflow.contrib.lite.python.op_hint import convert_op_hints_to_stubs
+from tensorflow.contrib.lite.python.op_hint import OpHint
+# pylint: enable=unused-import
 from tensorflow.contrib.lite.toco import model_flags_pb2 as _model_flags_pb2
 from tensorflow.contrib.lite.toco import toco_flags_pb2 as _toco_flags_pb2
 from tensorflow.contrib.lite.toco import types_pb2 as _types_pb2
diff --git a/tensorflow/contrib/lite/python/lite_test.py b/tensorflow/contrib/lite/python/lite_test.py
index 7d55f3fe6fe41a5d9e4e57c7a8e664bba6887fc7..b8b4510188bee867b32ffde714b27f41a1df778a 100644
--- a/tensorflow/contrib/lite/python/lite_test.py
+++ b/tensorflow/contrib/lite/python/lite_test.py
@@ -18,10 +18,14 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib.lite.python import lite
+from tensorflow.contrib.lite.python.op_hint import _tensor_name_base as _tensor_name_base
 from tensorflow.python.client import session
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import test_util
+from tensorflow.python.framework.graph_util_impl import _bfs_for_reachable_nodes
+from tensorflow.python.framework.graph_util_impl import _extract_graph_summary
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
 
@@ -35,7 +39,8 @@ class LiteTest(test_util.TensorFlowTestCase):
     # Try running on valid graph
     result = lite.toco_convert(sess.graph_def, [in_tensor], [out_tensor])
     self.assertTrue(result)
-    # TODO(aselle): remove tests that fail.
+    # TODO(aselle): remove tests that fail (we must get TOCO to not fatal
+    # all the time).
     # Try running on identity graph (known fail)
     # with self.assertRaisesRegexp(RuntimeError, "!model->operators.empty()"):
     #   result = lite.toco_convert(sess.graph_def, [in_tensor], [in_tensor])
@@ -51,5 +56,116 @@ class LiteTest(test_util.TensorFlowTestCase):
                                quantized_input_stats=[(0., 1.)])
     self.assertTrue(result)
 
+
+class LiteTestOpHint(test_util.TensorFlowTestCase):
+  """Test the hint to stub functionality."""
+
+  def _getGraphOpTypes(self, graphdef, output_nodes):
+    """Returns used op types in `graphdef` reachable from `output_nodes`.
+
+    This is used to check that after the stub transformation the expected
+    nodes are there. Typically use this with self.assertCountEqual(...).
+
+    NOTE: this is not a exact test that the graph is the correct output, but
+      it balances compact expressibility of test with sanity checking.
+
+    Args:
+      graphdef: TensorFlow proto graphdef.
+      output_nodes: A list of output node names that we need to reach.
+
+    Returns:
+      A set of node types reachable from `output_nodes`.
+    """
+    name_to_input_name, name_to_node, _ = (
+        _extract_graph_summary(graphdef))
+    # Find all nodes that are needed by the outputs
+    used_node_names = _bfs_for_reachable_nodes(output_nodes, name_to_input_name)
+    return set([name_to_node[node_name].op for node_name in used_node_names])
+
+  def _countIdentities(self, nodes):
+    """Count the number of "Identity" op types in the list of proto nodes.
+
+    Args:
+      nodes: NodeDefs of the graph.
+
+    Returns:
+      The number of nodes with op type "Identity" found.
+    """
+    return len([x for x in nodes if x.op == "Identity"])
+
+  def testSwishLiteHint(self):
+    """Makes a custom op swish and makes sure it gets converted as a unit."""
+    image = array_ops.constant([1., 2., 3., 4.])
+    swish_scale = array_ops.constant(1.0)
+
+    def _swish(input_tensor, scale):
+      custom = lite.OpHint("cool_activation")
+      input_tensor, scale = custom.add_inputs(input_tensor, scale)
+      output = math_ops.sigmoid(input_tensor) * input_tensor * scale
+      output, = custom.add_outputs(output)
+      return output
+    output = array_ops.identity(_swish(image, swish_scale), name="ModelOutput")
+
+    with self.test_session() as sess:
+      # check if identities have been put into the graph (2 input, 1 output,
+      # and 1 final output).
+      self.assertEqual(self._countIdentities(sess.graph_def.node), 4)
+
+      stubbed_graphdef = lite.convert_op_hints_to_stubs(sess)
+
+      self.assertCountEqual(
+          self._getGraphOpTypes(
+              stubbed_graphdef, output_nodes=[_tensor_name_base(output)]),
+          ["cool_activation", "Const", "Identity"])
+
+  def testScaleAndBiasAndIdentity(self):
+    """This tests a scaled add which has 3 inputs and 2 outputs."""
+    a = array_ops.constant(1.)
+    x = array_ops.constant([2., 3.])
+    b = array_ops.constant([4., 5.])
+
+    def _scaled_and_bias_and_identity(a, x, b):
+      custom = lite.OpHint("scale_and_bias_and_identity")
+      a, x, b = custom.add_inputs(a, x, b)
+      return custom.add_outputs(a * x + b, x)
+    output = array_ops.identity(_scaled_and_bias_and_identity(a, x, b),
+                                name="ModelOutput")
+
+    with self.test_session() as sess:
+      # make sure one identity for each input (3) and output (2) => 3 + 2 = 5
+      # +1 for the final output
+      self.assertEqual(self._countIdentities(sess.graph_def.node), 6)
+
+      stubbed_graphdef = lite.convert_op_hints_to_stubs(sess)
+
+      self.assertCountEqual(
+          self._getGraphOpTypes(
+              stubbed_graphdef, output_nodes=[_tensor_name_base(output)]),
+          ["scale_and_bias_and_identity", "Const", "Identity", "Pack"])
+
+  def testTwoFunctions(self):
+    """Tests if two functions are converted correctly."""
+    a = array_ops.constant([1.])
+    b = array_ops.constant([1.])
+    def _double_values(x):
+      custom = lite.OpHint("add_test")
+      x = custom.add_inputs(x)
+      output = math_ops.multiply(x, x)
+      output, = custom.add_outputs(output)
+      return output
+    output = array_ops.identity(
+        math_ops.add(_double_values(a), _double_values(b)), name="ModelOutput")
+
+    with self.test_session() as sess:
+      # make sure one identity for each input (2) and output (2) => 2 + 2
+      # +1 for the final output
+      self.assertEqual(self._countIdentities(sess.graph_def.node), 5)
+      stubbed_graphdef = lite.convert_op_hints_to_stubs(sess)
+      self.assertCountEqual(
+          self._getGraphOpTypes(
+              stubbed_graphdef, output_nodes=[_tensor_name_base(output)]),
+          ["add_test", "Const", "Identity", "Add"])
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/lite/python/op_hint.py b/tensorflow/contrib/lite/python/op_hint.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c587e38b16dc3011fc7c8bef4eec4d0ea99ec21
--- /dev/null
+++ b/tensorflow/contrib/lite/python/op_hint.py
@@ -0,0 +1,291 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Define tflite op hints (intrinsic operations).
+
+This essentially allows defining a TensorFlow API for tflite operations in
+Python with hints on how they are represented in TensorFlow Lite. This basically
+is a form of tflite intrinsic. It wraps a subpart of a TensorFlow execution
+graph and is useful for LSTMs and other complicated TensorFlow constructions
+that are difficult to pattern match in TOCO, but are represented by a single
+accelerated tflite op.
+
+Example:
+  def tflite_cool_activation(input):
+    # A cool activation function.
+    custom = tf.contrib.lite.OpHint("cool_activation")
+    input = custom.add_inputs(input)
+    output = tf.sigmoid(input) * input
+    custom.add_outputs(output)
+    return output
+
+  image = tf.placeholder(tf.float32, (1, 16, 16, 1))
+  output = tf.identity(tflite_cool_activation(image))
+
+  session = tf.Session()
+
+  graphdef_to_convert = tf.contrib.lite.convert_op_hints_to_stubs(session)
+  tflite_graph = tf.contrib.lite.toco_convert(graphdef_to_convert,
+                                              [image], [output])
+                                              [image], [output])
+  with open("/tmp/graph.fb", "wb") as fp:
+    fp.write(tflite_graph)
+
+How does it work?:
+
+OpHint is a helper that you use when defining a vanilla python function.
+It allows you to wrap arguments with tf.identities with some custom attributes.
+These attributes allow you to find the original block of ops that was created.
+For example, if you use cool_activation above you essentially get:
+
+a_input = tf.identity()
+result = tf.multiply(tf.sigmoid(a_input), a_input)
+output = tf.identity()
+
+a_input, output are identities that have parameters representing
+what argument they are, what the name of the function they should turn into
+in tf lite as well as a guid that uniquely identifies a particular invocation.
+
+Once you have built your whole tensorflow graph, you can run it and train it
+as usual, but after you have done that, you need to convert the graph into
+a form that replaces these subgraphs wrapped in identities to stub ops. These
+ops don't actually exist in the normal TensorFlow runtime, but will be
+understood by toco later.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections as _collections
+import itertools as _itertools
+import uuid as _uuid
+
+from tensorflow.contrib import framework as _framework
+from tensorflow.python.framework import ops as _ops
+from tensorflow.python.ops import array_ops as _array_ops
+from tensorflow.python.util.all_util import remove_undocumented
+
+
+class OpHint(object):
+  """A class that helps build tflite function invocations.
+
+  It allows you to take a bunch of TensorFlow ops and annotate the construction
+  such that toco knows how to convert it to tflite. This embeds a pseudo
+  function in a TensorFlow graph. This allows embedding high-level API usage
+  information in a lower level TensorFlow implementation so that an alternative
+  implementation can be substituted later.
+
+  Essentially, any "input" into this pseudo op is fed into an identity, and
+  attributes are added to that input before being used by the constituent ops
+  that make up the pseudo op. A similar process is done to any output that
+  is to be exported from the current op.
+
+  TODO(aselle): When TensorFlow functions functionality works for arbitrary
+  constructs, this mechanism can be retired and changed to use python defun's.
+  """
+
+  # Attr constants that are used for representation in the GraphDef
+  FUNCTION_NAME_ATTR = "_tflite_function_name"
+  FUNCTION_UUID_ATTR = "_tflite_function_uuid"
+  FUNCTION_INPUT_INDEX_ATTR = "_tflite_function_input_index"
+  FUNCTION_OUTPUT_INDEX_ATTR = "_tflite_function_output_index"
+
+  def __init__(self, function_name, **kwargs):
+    """Create a OpHint.
+
+    Args:
+      function_name: Name of the function (the custom op name in tflite)
+      **kwargs: Keyword arguments of any constant attributes for the function.
+    """
+    self._function_name = function_name
+    self._unique_function_id = _uuid.uuid1().hex  # TODO(aselle): Unique enough?
+    self._curr_input_index = 0
+    self._curr_output_index = 0
+    self._attrs_to_store_later = kwargs
+    self._stored_attrs = False
+
+  def _setattr(self, dest_op, name, value):
+    tensor_value = _ops.convert_to_tensor(value)
+    dest_op.op.node_def.attr[name].tensor.CopyFrom(
+        tensor_value.op.node_def.attr["value"].tensor)
+
+  def add_inputs(self, *args):
+    """Add a sequence of inputs to the function invocation.
+
+    Args:
+      *args: List of inputs to be converted (should be Tf.Tensor).
+    Returns:
+      Wrapped inputs (identity standins that have additional metadata). These
+      are also are also tf.Tensor's.
+    """
+
+    def augmented_identity(arg):
+      identity_op = _array_ops.identity(arg)
+      attr = identity_op.op.node_def.attr
+      attr[OpHint.FUNCTION_NAME_ATTR].s = self._function_name
+      attr[OpHint.FUNCTION_UUID_ATTR].s = self._unique_function_id
+      attr[OpHint.FUNCTION_INPUT_INDEX_ATTR].i = self._curr_input_index
+      self._curr_input_index += 1
+      return identity_op
+
+    return [augmented_identity(arg) for arg in args]
+
+  def add_outputs(self, *args):
+    """Add a sequence of outputs to the function invocation.
+
+    Args:
+      *args: List of outputs to be converted (should be tf.Tensor).
+    Returns:
+      Wrapped outputs (identity standins that have additional metadata). These
+      are also tf.Tensor's.
+    """
+
+    def augmented_identity(arg):
+      identity_op = _array_ops.identity(arg)
+      attr = identity_op.op.node_def.attr
+      attr[OpHint.FUNCTION_NAME_ATTR].s = self._function_name
+      attr[OpHint.FUNCTION_UUID_ATTR].s = self._unique_function_id
+      attr[OpHint.FUNCTION_OUTPUT_INDEX_ATTR].i = self._curr_output_index
+      self._curr_output_index += 1
+      return identity_op
+
+    wrapped_outputs = [augmented_identity(arg) for arg in args]
+
+    if not self._stored_attrs:
+      for key, value in self._attrs_to_store_later.iteritems():
+        self._setattr(wrapped_outputs[0], "_tflite_attr_" + key, value)
+      self._stored_attrs = True
+
+    return wrapped_outputs
+
+
+class _LiteFuncCall(object):
+  """Represent a TensorFlow Lite custom function.
+
+  This is uses to accumulate found hints in the graphdef into a single
+  conceptual unit.
+
+  Properties:
+    self.inputs: inputs to the op (hash from index # to argument)
+    self.outputs: outputs to the op (hash from index # to argument)
+    self.function_name: the tflite custom op name to use
+    self.uuid: a unique call id for this particular call  (i.e.
+      multiple function calls would have the same function_name but different
+      uuids.
+    self.params: A param name to key value for op constant data. I.e. for
+      axis on a reduction, strides on a convolution, etc.
+  """
+
+  def __init__(self):
+    self.inputs = {}
+    self.outputs = {}
+    self.function_name = None
+    self.uuid = None
+    self.params = {}
+
+  def __str__(self):
+    return "tflite function %s call %s\n\tinputs: %r\n\toutputs: %r" % (
+        self.function_name, self.uuid, self.inputs, self.outputs)
+
+
+def _find_all_hints_in_graph_def(session):
+  """Look at the current default graph and return a list of LiteFuncCall objs.
+
+  Args:
+    session: A TensorFlow session that contains the graph to convert.
+  Returns:
+    a list of `LifeFuncCall` objects in the form
+
+  """
+  func_calls = _collections.defaultdict(_LiteFuncCall)
+  seen_ops = set()
+
+  for op in session.graph.get_operations():
+    for operand in _itertools.chain(op.inputs, op.outputs):
+      if operand in seen_ops:
+        continue
+      seen_ops.add(operand)
+      attr = operand.op.node_def.attr
+      uuid = attr[OpHint.FUNCTION_UUID_ATTR].s
+      if OpHint.FUNCTION_UUID_ATTR not in attr:
+        continue
+      call_def = func_calls[uuid]
+      call_def.uuid = uuid
+      if OpHint.FUNCTION_UUID_ATTR in attr:
+        call_def.function_name = attr[OpHint.FUNCTION_NAME_ATTR].s
+        if OpHint.FUNCTION_INPUT_INDEX_ATTR in attr:
+          call_def.inputs[attr[OpHint.FUNCTION_INPUT_INDEX_ATTR].i] = operand
+        if OpHint.FUNCTION_OUTPUT_INDEX_ATTR in attr:
+          call_def.outputs[attr[OpHint.FUNCTION_OUTPUT_INDEX_ATTR].i] = operand
+
+      for a in attr:
+        if a.startswith("_tflite_attr_"):
+          # TODO(aselle): Remember the attribute tensors so we can put them
+          # in collapse.
+          call_def.params[a.replace("_tflite_attr_,", "")] = attr[a].tensor
+
+  return func_calls
+
+
+def _tensor_name_base(full_tensor_name):
+  """Removes the device assignment code from a tensor.
+
+  e.g. _tensor_name_base("foo:3") => "foo"
+
+  Args:
+    full_tensor_name: A tensor name that is annotated with a device placement
+      (this is what tensor flow introspection gives).
+  Returns:
+    A name without any device assignment.
+  """
+  return full_tensor_name.name.split(":")[0]
+
+
+def convert_op_hints_to_stubs(session):
+  """Converts a graphdef with LiteOp hints into stub operations.
+
+  This is used to prepare for toco conversion of complex intrinsic usages.
+
+  Args:
+    session: A TensorFlow session that contains the graph to convert.
+  Returns:
+    A new graphdef with all ops contained in OpHints being replaced by
+    a single op call with the right parameters.
+  """
+  hints = _find_all_hints_in_graph_def(session)
+  current_graph_def = session.graph_def
+  for call in hints.values():
+    input_names = [None] * len(call.inputs)
+    output_names = [None] * len(call.outputs)
+    output_dtypes = [None] * len(call.outputs)
+    output_quantized = False
+    for input_index, tensor in call.inputs.items():
+      input_names[input_index] = _tensor_name_base(tensor)
+    for output_index, tensor in call.outputs.items():
+      output_names[output_index] = _tensor_name_base(tensor)
+      output_dtypes[output_index] = tensor.dtype.as_datatype_enum
+    # TODO(aselle): Support quantized flag properly
+    current_graph_def = _framework.fuse_op(
+        current_graph_def, input_names, output_names, output_dtypes,
+        output_quantized, call.uuid, call.function_name)
+    for node in current_graph_def.node:
+      if node.name == call.uuid:
+        for param, tensor in call.params.items():
+          node.attr[param].tensor.CopyFrom(tensor)
+  return current_graph_def
+
+
+_allowed_symbols = ["OpHint", "convert_op_hints_to_stubs"]
+remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/lite/schema/schema.fbs b/tensorflow/contrib/lite/schema/schema.fbs
index ec202cd4073f152e1b2f4d5efd443615e901afc6..c0b220e872176af46035f2ace8defca17e59ea1d 100644
--- a/tensorflow/contrib/lite/schema/schema.fbs
+++ b/tensorflow/contrib/lite/schema/schema.fbs
@@ -119,6 +119,7 @@ enum BuiltinOperator : byte {
   SQUEEZE = 43,
   UNIDIRECTIONAL_SEQUENCE_LSTM = 44,
   STRIDED_SLICE = 45,
+  BIDIRECTIONAL_SEQUENCE_RNN = 46,
 }
 
 // Options for the builtin operators.
@@ -224,6 +225,12 @@ table SequenceRNNOptions {
   fused_activation_function:ActivationFunctionType;
 }
 
+// An implementation of TensorFlow bidrectional_dynamic_rnn with RNNCell.
+table BidirectionalSequenceRNNOptions {
+  time_major:bool;
+  fused_activation_function:ActivationFunctionType;
+}
+
 // An implementation of TensorFlow fully_connected (a.k.a Dense) layer.
 table FullyConnectedOptions {
   fused_activation_function:ActivationFunctionType;
@@ -282,15 +289,9 @@ table ReshapeOptions {
 }
 
 table SpaceToBatchNDOptions {
-  block_shape:[int];
-  before_paddings:[int];
-  after_paddings:[int];
 }
 
 table BatchToSpaceNDOptions {
-  block_shape:[int];
-  before_crops:[int];
-  after_crops:[int];
 }
 
 table SkipGramOptions {
@@ -326,11 +327,9 @@ table GatherOptions {
 }
 
 table TransposeOptions {
-  perm:[int];
 }
 
 table MeanOptions {
-  axis:[int];
   keep_dims: bool;
 }
 
diff --git a/tensorflow/contrib/lite/schema/schema_generated.h b/tensorflow/contrib/lite/schema/schema_generated.h
old mode 100644
new mode 100755
index c04a73a2bf00807442967499cceaaee941e54278..29f3a17be7aef71fbd19fdf2dd390421b724b53a
--- a/tensorflow/contrib/lite/schema/schema_generated.h
+++ b/tensorflow/contrib/lite/schema/schema_generated.h
@@ -51,6 +51,9 @@ struct RNNOptionsT;
 struct SequenceRNNOptions;
 struct SequenceRNNOptionsT;
 
+struct BidirectionalSequenceRNNOptions;
+struct BidirectionalSequenceRNNOptionsT;
+
 struct FullyConnectedOptions;
 struct FullyConnectedOptionsT;
 
@@ -211,11 +214,12 @@ enum BuiltinOperator {
   BuiltinOperator_SQUEEZE = 43,
   BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM = 44,
   BuiltinOperator_STRIDED_SLICE = 45,
+  BuiltinOperator_BIDIRECTIONAL_SEQUENCE_RNN = 46,
   BuiltinOperator_MIN = BuiltinOperator_ADD,
-  BuiltinOperator_MAX = BuiltinOperator_STRIDED_SLICE
+  BuiltinOperator_MAX = BuiltinOperator_BIDIRECTIONAL_SEQUENCE_RNN
 };
 
-inline BuiltinOperator (&EnumValuesBuiltinOperator())[43] {
+inline BuiltinOperator (&EnumValuesBuiltinOperator())[44] {
   static BuiltinOperator values[] = {
       BuiltinOperator_ADD,
       BuiltinOperator_AVERAGE_POOL_2D,
@@ -259,7 +263,8 @@ inline BuiltinOperator (&EnumValuesBuiltinOperator())[43] {
       BuiltinOperator_DIV,
       BuiltinOperator_SQUEEZE,
       BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM,
-      BuiltinOperator_STRIDED_SLICE};
+      BuiltinOperator_STRIDED_SLICE,
+      BuiltinOperator_BIDIRECTIONAL_SEQUENCE_RNN};
   return values;
 }
 
@@ -310,6 +315,7 @@ inline const char **EnumNamesBuiltinOperator() {
                                 "SQUEEZE",
                                 "UNIDIRECTIONAL_SEQUENCE_LSTM",
                                 "STRIDED_SLICE",
+                                "BIDIRECTIONAL_SEQUENCE_RNN",
                                 nullptr};
   return names;
 }
@@ -2005,6 +2011,85 @@ flatbuffers::Offset<SequenceRNNOptions> CreateSequenceRNNOptions(
     flatbuffers::FlatBufferBuilder &_fbb, const SequenceRNNOptionsT *_o,
     const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
+struct BidirectionalSequenceRNNOptionsT : public flatbuffers::NativeTable {
+  typedef BidirectionalSequenceRNNOptions TableType;
+  bool time_major;
+  ActivationFunctionType fused_activation_function;
+  BidirectionalSequenceRNNOptionsT()
+      : time_major(false),
+        fused_activation_function(ActivationFunctionType_NONE) {}
+};
+
+struct BidirectionalSequenceRNNOptions FLATBUFFERS_FINAL_CLASS
+    : private flatbuffers::Table {
+  typedef BidirectionalSequenceRNNOptionsT NativeTableType;
+  enum { VT_TIME_MAJOR = 4, VT_FUSED_ACTIVATION_FUNCTION = 6 };
+  bool time_major() const { return GetField<uint8_t>(VT_TIME_MAJOR, 0) != 0; }
+  ActivationFunctionType fused_activation_function() const {
+    return static_cast<ActivationFunctionType>(
+        GetField<int8_t>(VT_FUSED_ACTIVATION_FUNCTION, 0));
+  }
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           VerifyField<uint8_t>(verifier, VT_TIME_MAJOR) &&
+           VerifyField<int8_t>(verifier, VT_FUSED_ACTIVATION_FUNCTION) &&
+           verifier.EndTable();
+  }
+  BidirectionalSequenceRNNOptionsT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      BidirectionalSequenceRNNOptionsT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<BidirectionalSequenceRNNOptions> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb,
+      const BidirectionalSequenceRNNOptionsT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct BidirectionalSequenceRNNOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  void add_time_major(bool time_major) {
+    fbb_.AddElement<uint8_t>(BidirectionalSequenceRNNOptions::VT_TIME_MAJOR,
+                             static_cast<uint8_t>(time_major), 0);
+  }
+  void add_fused_activation_function(
+      ActivationFunctionType fused_activation_function) {
+    fbb_.AddElement<int8_t>(
+        BidirectionalSequenceRNNOptions::VT_FUSED_ACTIVATION_FUNCTION,
+        static_cast<int8_t>(fused_activation_function), 0);
+  }
+  explicit BidirectionalSequenceRNNOptionsBuilder(
+      flatbuffers::FlatBufferBuilder &_fbb)
+      : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  BidirectionalSequenceRNNOptionsBuilder &operator=(
+      const BidirectionalSequenceRNNOptionsBuilder &);
+  flatbuffers::Offset<BidirectionalSequenceRNNOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<BidirectionalSequenceRNNOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<BidirectionalSequenceRNNOptions>
+CreateBidirectionalSequenceRNNOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, bool time_major = false,
+    ActivationFunctionType fused_activation_function =
+        ActivationFunctionType_NONE) {
+  BidirectionalSequenceRNNOptionsBuilder builder_(_fbb);
+  builder_.add_fused_activation_function(fused_activation_function);
+  builder_.add_time_major(time_major);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<BidirectionalSequenceRNNOptions>
+CreateBidirectionalSequenceRNNOptions(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    const BidirectionalSequenceRNNOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
 struct FullyConnectedOptionsT : public flatbuffers::NativeTable {
   typedef FullyConnectedOptions TableType;
   ActivationFunctionType fused_activation_function;
@@ -2541,21 +2626,14 @@ flatbuffers::Offset<LSTMOptions> CreateLSTMOptions(
 
 struct ResizeBilinearOptionsT : public flatbuffers::NativeTable {
   typedef ResizeBilinearOptions TableType;
-  int32_t new_height;
-  int32_t new_width;
-  ResizeBilinearOptionsT() : new_height(0), new_width(0) {}
+  ResizeBilinearOptionsT() {}
 };
 
 struct ResizeBilinearOptions FLATBUFFERS_FINAL_CLASS
     : private flatbuffers::Table {
   typedef ResizeBilinearOptionsT NativeTableType;
-  enum { VT_NEW_HEIGHT = 4, VT_NEW_WIDTH = 6 };
-  int32_t new_height() const { return GetField<int32_t>(VT_NEW_HEIGHT, 0); }
-  int32_t new_width() const { return GetField<int32_t>(VT_NEW_WIDTH, 0); }
   bool Verify(flatbuffers::Verifier &verifier) const {
-    return VerifyTableStart(verifier) &&
-           VerifyField<int32_t>(verifier, VT_NEW_HEIGHT) &&
-           VerifyField<int32_t>(verifier, VT_NEW_WIDTH) && verifier.EndTable();
+    return VerifyTableStart(verifier) && verifier.EndTable();
   }
   ResizeBilinearOptionsT *UnPack(
       const flatbuffers::resolver_function_t *_resolver = nullptr) const;
@@ -2570,13 +2648,6 @@ struct ResizeBilinearOptions FLATBUFFERS_FINAL_CLASS
 struct ResizeBilinearOptionsBuilder {
   flatbuffers::FlatBufferBuilder &fbb_;
   flatbuffers::uoffset_t start_;
-  void add_new_height(int32_t new_height) {
-    fbb_.AddElement<int32_t>(ResizeBilinearOptions::VT_NEW_HEIGHT, new_height,
-                             0);
-  }
-  void add_new_width(int32_t new_width) {
-    fbb_.AddElement<int32_t>(ResizeBilinearOptions::VT_NEW_WIDTH, new_width, 0);
-  }
   explicit ResizeBilinearOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
       : fbb_(_fbb) {
     start_ = fbb_.StartTable();
@@ -2590,11 +2661,8 @@ struct ResizeBilinearOptionsBuilder {
 };
 
 inline flatbuffers::Offset<ResizeBilinearOptions> CreateResizeBilinearOptions(
-    flatbuffers::FlatBufferBuilder &_fbb, int32_t new_height = 0,
-    int32_t new_width = 0) {
+    flatbuffers::FlatBufferBuilder &_fbb) {
   ResizeBilinearOptionsBuilder builder_(_fbb);
-  builder_.add_new_width(new_width);
-  builder_.add_new_height(new_height);
   return builder_.Finish();
 }
 
@@ -2766,33 +2834,14 @@ flatbuffers::Offset<ReshapeOptions> CreateReshapeOptions(
 
 struct SpaceToBatchNDOptionsT : public flatbuffers::NativeTable {
   typedef SpaceToBatchNDOptions TableType;
-  std::vector<int32_t> block_shape;
-  std::vector<int32_t> before_paddings;
-  std::vector<int32_t> after_paddings;
   SpaceToBatchNDOptionsT() {}
 };
 
 struct SpaceToBatchNDOptions FLATBUFFERS_FINAL_CLASS
     : private flatbuffers::Table {
   typedef SpaceToBatchNDOptionsT NativeTableType;
-  enum { VT_BLOCK_SHAPE = 4, VT_BEFORE_PADDINGS = 6, VT_AFTER_PADDINGS = 8 };
-  const flatbuffers::Vector<int32_t> *block_shape() const {
-    return GetPointer<const flatbuffers::Vector<int32_t> *>(VT_BLOCK_SHAPE);
-  }
-  const flatbuffers::Vector<int32_t> *before_paddings() const {
-    return GetPointer<const flatbuffers::Vector<int32_t> *>(VT_BEFORE_PADDINGS);
-  }
-  const flatbuffers::Vector<int32_t> *after_paddings() const {
-    return GetPointer<const flatbuffers::Vector<int32_t> *>(VT_AFTER_PADDINGS);
-  }
   bool Verify(flatbuffers::Verifier &verifier) const {
-    return VerifyTableStart(verifier) &&
-           VerifyOffset(verifier, VT_BLOCK_SHAPE) &&
-           verifier.Verify(block_shape()) &&
-           VerifyOffset(verifier, VT_BEFORE_PADDINGS) &&
-           verifier.Verify(before_paddings()) &&
-           VerifyOffset(verifier, VT_AFTER_PADDINGS) &&
-           verifier.Verify(after_paddings()) && verifier.EndTable();
+    return VerifyTableStart(verifier) && verifier.EndTable();
   }
   SpaceToBatchNDOptionsT *UnPack(
       const flatbuffers::resolver_function_t *_resolver = nullptr) const;
@@ -2807,18 +2856,6 @@ struct SpaceToBatchNDOptions FLATBUFFERS_FINAL_CLASS
 struct SpaceToBatchNDOptionsBuilder {
   flatbuffers::FlatBufferBuilder &fbb_;
   flatbuffers::uoffset_t start_;
-  void add_block_shape(
-      flatbuffers::Offset<flatbuffers::Vector<int32_t>> block_shape) {
-    fbb_.AddOffset(SpaceToBatchNDOptions::VT_BLOCK_SHAPE, block_shape);
-  }
-  void add_before_paddings(
-      flatbuffers::Offset<flatbuffers::Vector<int32_t>> before_paddings) {
-    fbb_.AddOffset(SpaceToBatchNDOptions::VT_BEFORE_PADDINGS, before_paddings);
-  }
-  void add_after_paddings(
-      flatbuffers::Offset<flatbuffers::Vector<int32_t>> after_paddings) {
-    fbb_.AddOffset(SpaceToBatchNDOptions::VT_AFTER_PADDINGS, after_paddings);
-  }
   explicit SpaceToBatchNDOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
       : fbb_(_fbb) {
     start_ = fbb_.StartTable();
@@ -2832,62 +2869,25 @@ struct SpaceToBatchNDOptionsBuilder {
 };
 
 inline flatbuffers::Offset<SpaceToBatchNDOptions> CreateSpaceToBatchNDOptions(
-    flatbuffers::FlatBufferBuilder &_fbb,
-    flatbuffers::Offset<flatbuffers::Vector<int32_t>> block_shape = 0,
-    flatbuffers::Offset<flatbuffers::Vector<int32_t>> before_paddings = 0,
-    flatbuffers::Offset<flatbuffers::Vector<int32_t>> after_paddings = 0) {
+    flatbuffers::FlatBufferBuilder &_fbb) {
   SpaceToBatchNDOptionsBuilder builder_(_fbb);
-  builder_.add_after_paddings(after_paddings);
-  builder_.add_before_paddings(before_paddings);
-  builder_.add_block_shape(block_shape);
   return builder_.Finish();
 }
 
-inline flatbuffers::Offset<SpaceToBatchNDOptions>
-CreateSpaceToBatchNDOptionsDirect(
-    flatbuffers::FlatBufferBuilder &_fbb,
-    const std::vector<int32_t> *block_shape = nullptr,
-    const std::vector<int32_t> *before_paddings = nullptr,
-    const std::vector<int32_t> *after_paddings = nullptr) {
-  return tflite::CreateSpaceToBatchNDOptions(
-      _fbb, block_shape ? _fbb.CreateVector<int32_t>(*block_shape) : 0,
-      before_paddings ? _fbb.CreateVector<int32_t>(*before_paddings) : 0,
-      after_paddings ? _fbb.CreateVector<int32_t>(*after_paddings) : 0);
-}
-
 flatbuffers::Offset<SpaceToBatchNDOptions> CreateSpaceToBatchNDOptions(
     flatbuffers::FlatBufferBuilder &_fbb, const SpaceToBatchNDOptionsT *_o,
     const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
 struct BatchToSpaceNDOptionsT : public flatbuffers::NativeTable {
   typedef BatchToSpaceNDOptions TableType;
-  std::vector<int32_t> block_shape;
-  std::vector<int32_t> before_crops;
-  std::vector<int32_t> after_crops;
   BatchToSpaceNDOptionsT() {}
 };
 
 struct BatchToSpaceNDOptions FLATBUFFERS_FINAL_CLASS
     : private flatbuffers::Table {
   typedef BatchToSpaceNDOptionsT NativeTableType;
-  enum { VT_BLOCK_SHAPE = 4, VT_BEFORE_CROPS = 6, VT_AFTER_CROPS = 8 };
-  const flatbuffers::Vector<int32_t> *block_shape() const {
-    return GetPointer<const flatbuffers::Vector<int32_t> *>(VT_BLOCK_SHAPE);
-  }
-  const flatbuffers::Vector<int32_t> *before_crops() const {
-    return GetPointer<const flatbuffers::Vector<int32_t> *>(VT_BEFORE_CROPS);
-  }
-  const flatbuffers::Vector<int32_t> *after_crops() const {
-    return GetPointer<const flatbuffers::Vector<int32_t> *>(VT_AFTER_CROPS);
-  }
   bool Verify(flatbuffers::Verifier &verifier) const {
-    return VerifyTableStart(verifier) &&
-           VerifyOffset(verifier, VT_BLOCK_SHAPE) &&
-           verifier.Verify(block_shape()) &&
-           VerifyOffset(verifier, VT_BEFORE_CROPS) &&
-           verifier.Verify(before_crops()) &&
-           VerifyOffset(verifier, VT_AFTER_CROPS) &&
-           verifier.Verify(after_crops()) && verifier.EndTable();
+    return VerifyTableStart(verifier) && verifier.EndTable();
   }
   BatchToSpaceNDOptionsT *UnPack(
       const flatbuffers::resolver_function_t *_resolver = nullptr) const;
@@ -2902,18 +2902,6 @@ struct BatchToSpaceNDOptions FLATBUFFERS_FINAL_CLASS
 struct BatchToSpaceNDOptionsBuilder {
   flatbuffers::FlatBufferBuilder &fbb_;
   flatbuffers::uoffset_t start_;
-  void add_block_shape(
-      flatbuffers::Offset<flatbuffers::Vector<int32_t>> block_shape) {
-    fbb_.AddOffset(BatchToSpaceNDOptions::VT_BLOCK_SHAPE, block_shape);
-  }
-  void add_before_crops(
-      flatbuffers::Offset<flatbuffers::Vector<int32_t>> before_crops) {
-    fbb_.AddOffset(BatchToSpaceNDOptions::VT_BEFORE_CROPS, before_crops);
-  }
-  void add_after_crops(
-      flatbuffers::Offset<flatbuffers::Vector<int32_t>> after_crops) {
-    fbb_.AddOffset(BatchToSpaceNDOptions::VT_AFTER_CROPS, after_crops);
-  }
   explicit BatchToSpaceNDOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
       : fbb_(_fbb) {
     start_ = fbb_.StartTable();
@@ -2927,29 +2915,11 @@ struct BatchToSpaceNDOptionsBuilder {
 };
 
 inline flatbuffers::Offset<BatchToSpaceNDOptions> CreateBatchToSpaceNDOptions(
-    flatbuffers::FlatBufferBuilder &_fbb,
-    flatbuffers::Offset<flatbuffers::Vector<int32_t>> block_shape = 0,
-    flatbuffers::Offset<flatbuffers::Vector<int32_t>> before_crops = 0,
-    flatbuffers::Offset<flatbuffers::Vector<int32_t>> after_crops = 0) {
+    flatbuffers::FlatBufferBuilder &_fbb) {
   BatchToSpaceNDOptionsBuilder builder_(_fbb);
-  builder_.add_after_crops(after_crops);
-  builder_.add_before_crops(before_crops);
-  builder_.add_block_shape(block_shape);
   return builder_.Finish();
 }
 
-inline flatbuffers::Offset<BatchToSpaceNDOptions>
-CreateBatchToSpaceNDOptionsDirect(
-    flatbuffers::FlatBufferBuilder &_fbb,
-    const std::vector<int32_t> *block_shape = nullptr,
-    const std::vector<int32_t> *before_crops = nullptr,
-    const std::vector<int32_t> *after_crops = nullptr) {
-  return tflite::CreateBatchToSpaceNDOptions(
-      _fbb, block_shape ? _fbb.CreateVector<int32_t>(*block_shape) : 0,
-      before_crops ? _fbb.CreateVector<int32_t>(*before_crops) : 0,
-      after_crops ? _fbb.CreateVector<int32_t>(*after_crops) : 0);
-}
-
 flatbuffers::Offset<BatchToSpaceNDOptions> CreateBatchToSpaceNDOptions(
     flatbuffers::FlatBufferBuilder &_fbb, const BatchToSpaceNDOptionsT *_o,
     const flatbuffers::rehasher_function_t *_rehasher = nullptr);
@@ -3324,19 +3294,13 @@ flatbuffers::Offset<GatherOptions> CreateGatherOptions(
 
 struct TransposeOptionsT : public flatbuffers::NativeTable {
   typedef TransposeOptions TableType;
-  std::vector<int32_t> perm;
   TransposeOptionsT() {}
 };
 
 struct TransposeOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
   typedef TransposeOptionsT NativeTableType;
-  enum { VT_PERM = 4 };
-  const flatbuffers::Vector<int32_t> *perm() const {
-    return GetPointer<const flatbuffers::Vector<int32_t> *>(VT_PERM);
-  }
   bool Verify(flatbuffers::Verifier &verifier) const {
-    return VerifyTableStart(verifier) && VerifyOffset(verifier, VT_PERM) &&
-           verifier.Verify(perm()) && verifier.EndTable();
+    return VerifyTableStart(verifier) && verifier.EndTable();
   }
   TransposeOptionsT *UnPack(
       const flatbuffers::resolver_function_t *_resolver = nullptr) const;
@@ -3351,9 +3315,6 @@ struct TransposeOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
 struct TransposeOptionsBuilder {
   flatbuffers::FlatBufferBuilder &fbb_;
   flatbuffers::uoffset_t start_;
-  void add_perm(flatbuffers::Offset<flatbuffers::Vector<int32_t>> perm) {
-    fbb_.AddOffset(TransposeOptions::VT_PERM, perm);
-  }
   explicit TransposeOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
       : fbb_(_fbb) {
     start_ = fbb_.StartTable();
@@ -3367,41 +3328,27 @@ struct TransposeOptionsBuilder {
 };
 
 inline flatbuffers::Offset<TransposeOptions> CreateTransposeOptions(
-    flatbuffers::FlatBufferBuilder &_fbb,
-    flatbuffers::Offset<flatbuffers::Vector<int32_t>> perm = 0) {
+    flatbuffers::FlatBufferBuilder &_fbb) {
   TransposeOptionsBuilder builder_(_fbb);
-  builder_.add_perm(perm);
   return builder_.Finish();
 }
 
-inline flatbuffers::Offset<TransposeOptions> CreateTransposeOptionsDirect(
-    flatbuffers::FlatBufferBuilder &_fbb,
-    const std::vector<int32_t> *perm = nullptr) {
-  return tflite::CreateTransposeOptions(
-      _fbb, perm ? _fbb.CreateVector<int32_t>(*perm) : 0);
-}
-
 flatbuffers::Offset<TransposeOptions> CreateTransposeOptions(
     flatbuffers::FlatBufferBuilder &_fbb, const TransposeOptionsT *_o,
     const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
 struct MeanOptionsT : public flatbuffers::NativeTable {
   typedef MeanOptions TableType;
-  std::vector<int32_t> axis;
   bool keep_dims;
   MeanOptionsT() : keep_dims(false) {}
 };
 
 struct MeanOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
   typedef MeanOptionsT NativeTableType;
-  enum { VT_AXIS = 4, VT_KEEP_DIMS = 6 };
-  const flatbuffers::Vector<int32_t> *axis() const {
-    return GetPointer<const flatbuffers::Vector<int32_t> *>(VT_AXIS);
-  }
+  enum { VT_KEEP_DIMS = 4 };
   bool keep_dims() const { return GetField<uint8_t>(VT_KEEP_DIMS, 0) != 0; }
   bool Verify(flatbuffers::Verifier &verifier) const {
-    return VerifyTableStart(verifier) && VerifyOffset(verifier, VT_AXIS) &&
-           verifier.Verify(axis()) &&
+    return VerifyTableStart(verifier) &&
            VerifyField<uint8_t>(verifier, VT_KEEP_DIMS) && verifier.EndTable();
   }
   MeanOptionsT *UnPack(
@@ -3417,9 +3364,6 @@ struct MeanOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
 struct MeanOptionsBuilder {
   flatbuffers::FlatBufferBuilder &fbb_;
   flatbuffers::uoffset_t start_;
-  void add_axis(flatbuffers::Offset<flatbuffers::Vector<int32_t>> axis) {
-    fbb_.AddOffset(MeanOptions::VT_AXIS, axis);
-  }
   void add_keep_dims(bool keep_dims) {
     fbb_.AddElement<uint8_t>(MeanOptions::VT_KEEP_DIMS,
                              static_cast<uint8_t>(keep_dims), 0);
@@ -3437,22 +3381,12 @@ struct MeanOptionsBuilder {
 };
 
 inline flatbuffers::Offset<MeanOptions> CreateMeanOptions(
-    flatbuffers::FlatBufferBuilder &_fbb,
-    flatbuffers::Offset<flatbuffers::Vector<int32_t>> axis = 0,
-    bool keep_dims = false) {
+    flatbuffers::FlatBufferBuilder &_fbb, bool keep_dims = false) {
   MeanOptionsBuilder builder_(_fbb);
-  builder_.add_axis(axis);
   builder_.add_keep_dims(keep_dims);
   return builder_.Finish();
 }
 
-inline flatbuffers::Offset<MeanOptions> CreateMeanOptionsDirect(
-    flatbuffers::FlatBufferBuilder &_fbb,
-    const std::vector<int32_t> *axis = nullptr, bool keep_dims = false) {
-  return tflite::CreateMeanOptions(
-      _fbb, axis ? _fbb.CreateVector<int32_t>(*axis) : 0, keep_dims);
-}
-
 flatbuffers::Offset<MeanOptions> CreateMeanOptions(
     flatbuffers::FlatBufferBuilder &_fbb, const MeanOptionsT *_o,
     const flatbuffers::rehasher_function_t *_rehasher = nullptr);
@@ -5098,6 +5032,56 @@ inline flatbuffers::Offset<SequenceRNNOptions> CreateSequenceRNNOptions(
                                           _fused_activation_function);
 }
 
+inline BidirectionalSequenceRNNOptionsT *
+BidirectionalSequenceRNNOptions::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new BidirectionalSequenceRNNOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void BidirectionalSequenceRNNOptions::UnPackTo(
+    BidirectionalSequenceRNNOptionsT *_o,
+    const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+  {
+    auto _e = time_major();
+    _o->time_major = _e;
+  };
+  {
+    auto _e = fused_activation_function();
+    _o->fused_activation_function = _e;
+  };
+}
+
+inline flatbuffers::Offset<BidirectionalSequenceRNNOptions>
+BidirectionalSequenceRNNOptions::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    const BidirectionalSequenceRNNOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateBidirectionalSequenceRNNOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<BidirectionalSequenceRNNOptions>
+CreateBidirectionalSequenceRNNOptions(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    const BidirectionalSequenceRNNOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const BidirectionalSequenceRNNOptionsT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
+  auto _time_major = _o->time_major;
+  auto _fused_activation_function = _o->fused_activation_function;
+  return tflite::CreateBidirectionalSequenceRNNOptions(
+      _fbb, _time_major, _fused_activation_function);
+}
+
 inline FullyConnectedOptionsT *FullyConnectedOptions::UnPack(
     const flatbuffers::resolver_function_t *_resolver) const {
   auto _o = new FullyConnectedOptionsT();
@@ -5457,14 +5441,6 @@ inline void ResizeBilinearOptions::UnPackTo(
     const flatbuffers::resolver_function_t *_resolver) const {
   (void)_o;
   (void)_resolver;
-  {
-    auto _e = new_height();
-    _o->new_height = _e;
-  };
-  {
-    auto _e = new_width();
-    _o->new_width = _e;
-  };
 }
 
 inline flatbuffers::Offset<ResizeBilinearOptions> ResizeBilinearOptions::Pack(
@@ -5484,9 +5460,7 @@ inline flatbuffers::Offset<ResizeBilinearOptions> CreateResizeBilinearOptions(
     const flatbuffers::rehasher_function_t *__rehasher;
   } _va = {&_fbb, _o, _rehasher};
   (void)_va;
-  auto _new_height = _o->new_height;
-  auto _new_width = _o->new_width;
-  return tflite::CreateResizeBilinearOptions(_fbb, _new_height, _new_width);
+  return tflite::CreateResizeBilinearOptions(_fbb);
 }
 
 inline CallOptionsT *CallOptions::UnPack(
@@ -5616,33 +5590,6 @@ inline void SpaceToBatchNDOptions::UnPackTo(
     const flatbuffers::resolver_function_t *_resolver) const {
   (void)_o;
   (void)_resolver;
-  {
-    auto _e = block_shape();
-    if (_e) {
-      _o->block_shape.resize(_e->size());
-      for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) {
-        _o->block_shape[_i] = _e->Get(_i);
-      }
-    }
-  };
-  {
-    auto _e = before_paddings();
-    if (_e) {
-      _o->before_paddings.resize(_e->size());
-      for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) {
-        _o->before_paddings[_i] = _e->Get(_i);
-      }
-    }
-  };
-  {
-    auto _e = after_paddings();
-    if (_e) {
-      _o->after_paddings.resize(_e->size());
-      for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) {
-        _o->after_paddings[_i] = _e->Get(_i);
-      }
-    }
-  };
 }
 
 inline flatbuffers::Offset<SpaceToBatchNDOptions> SpaceToBatchNDOptions::Pack(
@@ -5662,14 +5609,7 @@ inline flatbuffers::Offset<SpaceToBatchNDOptions> CreateSpaceToBatchNDOptions(
     const flatbuffers::rehasher_function_t *__rehasher;
   } _va = {&_fbb, _o, _rehasher};
   (void)_va;
-  auto _block_shape =
-      _o->block_shape.size() ? _fbb.CreateVector(_o->block_shape) : 0;
-  auto _before_paddings =
-      _o->before_paddings.size() ? _fbb.CreateVector(_o->before_paddings) : 0;
-  auto _after_paddings =
-      _o->after_paddings.size() ? _fbb.CreateVector(_o->after_paddings) : 0;
-  return tflite::CreateSpaceToBatchNDOptions(_fbb, _block_shape,
-                                             _before_paddings, _after_paddings);
+  return tflite::CreateSpaceToBatchNDOptions(_fbb);
 }
 
 inline BatchToSpaceNDOptionsT *BatchToSpaceNDOptions::UnPack(
@@ -5684,33 +5624,6 @@ inline void BatchToSpaceNDOptions::UnPackTo(
     const flatbuffers::resolver_function_t *_resolver) const {
   (void)_o;
   (void)_resolver;
-  {
-    auto _e = block_shape();
-    if (_e) {
-      _o->block_shape.resize(_e->size());
-      for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) {
-        _o->block_shape[_i] = _e->Get(_i);
-      }
-    }
-  };
-  {
-    auto _e = before_crops();
-    if (_e) {
-      _o->before_crops.resize(_e->size());
-      for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) {
-        _o->before_crops[_i] = _e->Get(_i);
-      }
-    }
-  };
-  {
-    auto _e = after_crops();
-    if (_e) {
-      _o->after_crops.resize(_e->size());
-      for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) {
-        _o->after_crops[_i] = _e->Get(_i);
-      }
-    }
-  };
 }
 
 inline flatbuffers::Offset<BatchToSpaceNDOptions> BatchToSpaceNDOptions::Pack(
@@ -5730,14 +5643,7 @@ inline flatbuffers::Offset<BatchToSpaceNDOptions> CreateBatchToSpaceNDOptions(
     const flatbuffers::rehasher_function_t *__rehasher;
   } _va = {&_fbb, _o, _rehasher};
   (void)_va;
-  auto _block_shape =
-      _o->block_shape.size() ? _fbb.CreateVector(_o->block_shape) : 0;
-  auto _before_crops =
-      _o->before_crops.size() ? _fbb.CreateVector(_o->before_crops) : 0;
-  auto _after_crops =
-      _o->after_crops.size() ? _fbb.CreateVector(_o->after_crops) : 0;
-  return tflite::CreateBatchToSpaceNDOptions(_fbb, _block_shape, _before_crops,
-                                             _after_crops);
+  return tflite::CreateBatchToSpaceNDOptions(_fbb);
 }
 
 inline SkipGramOptionsT *SkipGramOptions::UnPack(
@@ -5999,15 +5905,6 @@ inline void TransposeOptions::UnPackTo(
     const flatbuffers::resolver_function_t *_resolver) const {
   (void)_o;
   (void)_resolver;
-  {
-    auto _e = perm();
-    if (_e) {
-      _o->perm.resize(_e->size());
-      for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) {
-        _o->perm[_i] = _e->Get(_i);
-      }
-    }
-  };
 }
 
 inline flatbuffers::Offset<TransposeOptions> TransposeOptions::Pack(
@@ -6027,8 +5924,7 @@ inline flatbuffers::Offset<TransposeOptions> CreateTransposeOptions(
     const flatbuffers::rehasher_function_t *__rehasher;
   } _va = {&_fbb, _o, _rehasher};
   (void)_va;
-  auto _perm = _o->perm.size() ? _fbb.CreateVector(_o->perm) : 0;
-  return tflite::CreateTransposeOptions(_fbb, _perm);
+  return tflite::CreateTransposeOptions(_fbb);
 }
 
 inline MeanOptionsT *MeanOptions::UnPack(
@@ -6042,15 +5938,6 @@ inline void MeanOptions::UnPackTo(
     MeanOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
   (void)_o;
   (void)_resolver;
-  {
-    auto _e = axis();
-    if (_e) {
-      _o->axis.resize(_e->size());
-      for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) {
-        _o->axis[_i] = _e->Get(_i);
-      }
-    }
-  };
   {
     auto _e = keep_dims();
     _o->keep_dims = _e;
@@ -6074,9 +5961,8 @@ inline flatbuffers::Offset<MeanOptions> CreateMeanOptions(
     const flatbuffers::rehasher_function_t *__rehasher;
   } _va = {&_fbb, _o, _rehasher};
   (void)_va;
-  auto _axis = _o->axis.size() ? _fbb.CreateVector(_o->axis) : 0;
   auto _keep_dims = _o->keep_dims;
-  return tflite::CreateMeanOptions(_fbb, _axis, _keep_dims);
+  return tflite::CreateMeanOptions(_fbb, _keep_dims);
 }
 
 inline SqueezeOptionsT *SqueezeOptions::UnPack(
diff --git a/tensorflow/contrib/lite/testing/BUILD b/tensorflow/contrib/lite/testing/BUILD
index 50e8ca75f8efd600d4773b83cd2c8de11c9d13ca..7f84a0ab9bb5b9fbacba1728f8bfe4df25f13f86 100644
--- a/tensorflow/contrib/lite/testing/BUILD
+++ b/tensorflow/contrib/lite/testing/BUILD
@@ -197,7 +197,7 @@ cc_binary(
 
 tf_cc_test(
     name = "generated_examples_zip_test",
-    size = "medium",
+    size = "large",
     srcs = ["generated_examples_zip_test.cc"],
     args = [
         "--zip_files_dir=tensorflow/contrib/lite/testing/optest",
diff --git a/tensorflow/contrib/lite/testing/generate_examples.py b/tensorflow/contrib/lite/testing/generate_examples.py
index a639351657835a1e7d17466e70277e8bf40bc0f9..b2227a7c98f6d6d65b5198637319408cfadee800 100644
--- a/tensorflow/contrib/lite/testing/generate_examples.py
+++ b/tensorflow/contrib/lite/testing/generate_examples.py
@@ -94,7 +94,8 @@ KNOWN_BUGS = {
     r"softmax.*input_shape=\[1,3,4,3\]": "67749831",
     # SpaceToDepth only supports float32.
     r"space_to_depth.*(float16|int32|uint8|int64)": "68018134",
-    # BatchToSpaceND doesn't support cropping.
+    # BatchToSpaceND doesn't support cropping. This catches test cases with
+    # const tensors as crops.
     r"batch_to_space_nd.*crops=\[\[1,1\],\[1,1\]\]": "70594634",
     # BatchToSpaceND only supports 4D tensors.
     r"batch_to_space_nd.*input_shape=\[8,2,2,2,1,1\]": "70594733",
@@ -694,6 +695,7 @@ def make_mean_tests(zip_path):
           [2, 1], [2, 1, 0], [2, 0, 1], -1, -2, -3, [1, -1], [0, -1], [-1, 0],
           [-1, -2, -3], [0, 0, 0], [2, 2, 0], [1, 0, -3, -3]
       ],
+      "const_axis": [True, False],
       "keep_dims": [True, False],
   }, {
       "input_dtype": [tf.float32, tf.int32, tf.int64],
@@ -704,6 +706,7 @@ def make_mean_tests(zip_path):
           -3, -4, [0, -2], [2, 3, -1, 0], [3, 1, 2, -3], [3, -4], [2, 2, 2],
           [2, 2, 3], [-3, -3, -4], [-3, 2, 1]
       ],
+      "const_axis": [True, False],
       "keep_dims": [True, False],
   }]
 
@@ -713,17 +716,31 @@ def make_mean_tests(zip_path):
         dtype=parameters["input_dtype"],
         name="input",
         shape=parameters["input_shape"])
+
+    # Get axis as either a placeholder or constants.
+    if parameters["const_axis"]:
+      axis = parameters["axis"]
+      input_tensors = [input_tensor]
+    else:
+      if isinstance(parameters["axis"], list):
+        shape = [len(parameters["axis"])]
+      else:
+        shape = [0]  # shape for None or integers.
+      axis = tf.placeholder(dtype=tf.int32, name="axis", shape=shape)
+      input_tensors = [input_tensor, axis]
+
     out = tf.reduce_mean(
-        input_tensor,
-        axis=parameters["axis"],
-        keep_dims=parameters["keep_dims"])
-    return [input_tensor], [out]
+        input_tensor, axis=axis, keep_dims=parameters["keep_dims"])
+    return input_tensors, [out]
 
   def build_inputs(parameters, sess, inputs, outputs):
-    input_values = create_tensor_data(parameters["input_dtype"],
-                                      parameters["input_shape"])
-    return [input_values], sess.run(
-        outputs, feed_dict=dict(zip(inputs, [input_values])))
+    values = [
+        create_tensor_data(parameters["input_dtype"], parameters["input_shape"])
+    ]
+    if not parameters["const_axis"]:
+      if parameters["axis"]:
+        values.append(np.array(parameters["axis"]))
+    return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
 
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
@@ -1318,12 +1335,16 @@ def make_space_to_batch_nd_tests(zip_path):
           "input_shape": [[1, 2, 2, 3], [2, 2, 4, 1]],
           "block_shape": [[1, 3], [2, 2]],
           "paddings": [[[0, 0], [0, 0]], [[0, 0], [2, 0]], [[1, 1], [1, 1]]],
+          "constant_block_shape": [True, False],
+          "constant_paddings": [True, False],
       },
       {
           "dtype": [tf.float32],
           "input_shape": [[2, 3, 7, 3]],
           "block_shape": [[1, 3], [2, 2]],
           "paddings": [[[0, 0], [2, 0]], [[1, 0], [1, 0]]],
+          "constant_block_shape": [True, False],
+          "constant_paddings": [True, False],
       },
       # Non-4D use case: 1 bath dimension, 3 spatial dimensions, 2 others.
       {
@@ -1331,23 +1352,47 @@ def make_space_to_batch_nd_tests(zip_path):
           "input_shape": [[1, 4, 4, 4, 1, 1]],
           "block_shape": [[2, 2, 2]],
           "paddings": [[[0, 0], [0, 0], [0, 0]]],
+          "constant_block_shape": [True, False],
+          "constant_paddings": [True, False],
       },
   ]
 
   def build_graph(parameters):
+    """Build a space_to_batch graph given `parameters`."""
     input_tensor = tf.placeholder(
         dtype=parameters["dtype"],
         name="input",
         shape=parameters["input_shape"])
-    out = tf.space_to_batch_nd(input_tensor, parameters["block_shape"],
-                               parameters["paddings"])
-    return [input_tensor], [out]
+    input_tensors = [input_tensor]
+
+    # Get block_shape either as a const or as a placeholder (tensor).
+    if parameters["constant_block_shape"]:
+      block_shape = parameters["block_shape"]
+    else:
+      shape = [len(parameters["block_shape"])]
+      block_shape = tf.placeholder(dtype=tf.int32, name="shape", shape=shape)
+      input_tensors.append(block_shape)
+
+    # Get paddings either as a const or as a placeholder (tensor).
+    if parameters["constant_paddings"]:
+      paddings = parameters["paddings"]
+    else:
+      shape = [len(parameters["paddings"]), 2]
+      paddings = tf.placeholder(dtype=tf.int32, name="paddings", shape=shape)
+      input_tensors.append(paddings)
+
+    out = tf.space_to_batch_nd(input_tensor, block_shape, paddings)
+    return input_tensors, [out]
 
   def build_inputs(parameters, sess, inputs, outputs):
-    input_values = create_tensor_data(parameters["dtype"],
-                                      parameters["input_shape"])
-    return [input_values], sess.run(
-        outputs, feed_dict=dict(zip(inputs, [input_values])))
+    values = [
+        create_tensor_data(parameters["dtype"], parameters["input_shape"])
+    ]
+    if not parameters["constant_block_shape"]:
+      values.append(np.array(parameters["block_shape"]))
+    if not parameters["constant_paddings"]:
+      values.append(np.array(parameters["paddings"]))
+    return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
 
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
@@ -1361,6 +1406,8 @@ def make_batch_to_space_nd_tests(zip_path):
           "input_shape": [[12, 2, 2, 1]],
           "block_shape": [[1, 4], [2, 2], [3, 4]],
           "crops": [[[0, 0], [0, 0]], [[1, 1], [1, 1]]],
+          "constant_block_shape": [True, False],
+          "constant_crops": [True, False],
       },
       # Non-4D use case: 1 bath dimension, 3 spatial dimensions, 2 others.
       {
@@ -1368,23 +1415,47 @@ def make_batch_to_space_nd_tests(zip_path):
           "input_shape": [[8, 2, 2, 2, 1, 1]],
           "block_shape": [[2, 2, 2]],
           "crops": [[[0, 0], [0, 0], [0, 0]]],
+          "constant_block_shape": [True, False],
+          "constant_crops": [True, False],
       },
   ]
 
   def build_graph(parameters):
+    """Build a batch_to_space graph given `parameters`."""
     input_tensor = tf.placeholder(
         dtype=parameters["dtype"],
         name="input",
         shape=parameters["input_shape"])
-    out = tf.batch_to_space_nd(input_tensor, parameters["block_shape"],
-                               parameters["crops"])
-    return [input_tensor], [out]
+    input_tensors = [input_tensor]
+
+    # Get block_shape either as a const or as a placeholder (tensor).
+    if parameters["constant_block_shape"]:
+      block_shape = parameters["block_shape"]
+    else:
+      shape = [len(parameters["block_shape"])]
+      block_shape = tf.placeholder(dtype=tf.int32, name="shape", shape=shape)
+      input_tensors.append(block_shape)
+
+    # Get crops either as a const or as a placeholder (tensor).
+    if parameters["constant_crops"]:
+      crops = parameters["crops"]
+    else:
+      shape = [len(parameters["crops"]), 2]
+      crops = tf.placeholder(dtype=tf.int32, name="crops", shape=shape)
+      input_tensors.append(crops)
+
+    out = tf.batch_to_space_nd(input_tensor, block_shape, crops)
+    return input_tensors, [out]
 
   def build_inputs(parameters, sess, inputs, outputs):
-    input_values = create_tensor_data(parameters["dtype"],
-                                      parameters["input_shape"])
-    return [input_values], sess.run(
-        outputs, feed_dict=dict(zip(inputs, [input_values])))
+    values = [
+        create_tensor_data(parameters["dtype"], parameters["input_shape"])
+    ]
+    if not parameters["constant_block_shape"]:
+      values.append(np.array(parameters["block_shape"]))
+    if not parameters["constant_crops"]:
+      values.append(np.array(parameters["crops"]))
+    return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
 
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
@@ -1397,29 +1468,44 @@ def make_transpose_tests(zip_path):
       "dtype": [tf.int32, tf.int64, tf.float32],
       "input_shape": [[2, 2, 3]],
       "perm": [[0, 1, 2], [0, 2, 1]],
+      "constant_perm": [True, False],
   }, {
       "dtype": [tf.float32],
       "input_shape": [[1, 2, 3, 4]],
       "perm": [[0, 1, 2, 3], [3, 0, 1, 2]],
+      "constant_perm": [True, False],
   }, {
       "dtype": [tf.float32],
       "input_shape": [[1, 2, 3, 4, 5]],
       "perm": [[0, 1, 2, 3, 4]],
+      "constant_perm": [True, False],
   }]
 
   def build_graph(parameters):
+    """Build a transpose graph given `parameters`."""
     input_tensor = tf.placeholder(
         dtype=parameters["dtype"],
         name="input",
         shape=parameters["input_shape"])
-    out = tf.transpose(input_tensor, perm=parameters["perm"])
-    return [input_tensor], [out]
+
+    if parameters["constant_perm"]:
+      perm = parameters["perm"]
+      input_tensors = [input_tensor]
+    else:
+      shape = [len(parameters["perm"]), 2]
+      perm = tf.placeholder(dtype=tf.int32, name="perm", shape=shape)
+      input_tensors = [input_tensor, perm]
+
+    out = tf.transpose(input_tensor, perm=perm)
+    return input_tensors, [out]
 
   def build_inputs(parameters, sess, inputs, outputs):
-    input_values = create_tensor_data(parameters["dtype"],
-                                      parameters["input_shape"])
-    return [input_values], sess.run(
-        outputs, feed_dict=dict(zip(inputs, [input_values])))
+    values = [
+        create_tensor_data(parameters["dtype"], parameters["input_shape"])
+    ]
+    if not parameters["constant_perm"]:
+      values.append(np.array(parameters["perm"]))
+    return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
 
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
@@ -1474,9 +1560,11 @@ def make_strided_slice_tests(zip_path):
           "input_shape": [[12, 2, 2, 5]],
           "begin": [[0, 0, 0, 0], [1, 0, 1, 0]],
           "end": [[8, 2, 2, 3], [12, 2, 2, 5]],
-          "strides": [None, [1, 1, 1, 1], [2, 1, 3, 1]],
-          "begin_mask": [None, 0, 1, 2, 8],
-          "end_mask": [None, 0, 1, 2, 8],
+          "strides": [None, [2, 1, 3, 1]],
+          "begin_mask": [None, 1, 8],
+          "end_mask": [None, 1, 8],
+          "shrink_axis_mask": [None, 1, 8, 11, 15, -1],
+          "constant_indices": [False, True],
       },
       # 2-D
       {
@@ -1485,9 +1573,11 @@ def make_strided_slice_tests(zip_path):
           "input_shape": [[2, 3]],
           "begin": [[0, 0], [1, 0]],
           "end": [[2, 3], [2, 2]],
-          "strides": [None, [1, 1], [2, 2]],
-          "begin_mask": [None, 0, 1, 2],
-          "end_mask": [None, 0, 1, 2],
+          "strides": [None, [2, 2]],
+          "begin_mask": [None, 1, 2],
+          "end_mask": [None, 1, 2],
+          "shrink_axis_mask": [None, 1, 2, 3, -1],
+          "constant_indices": [False, True],
       },
       # Negative strides
       {
@@ -1497,8 +1587,10 @@ def make_strided_slice_tests(zip_path):
           "begin": [[0, -1]],
           "end": [[2, -3]],
           "strides": [[1, -1]],
-          "begin_mask": [None, 0, 1, 2],
-          "end_mask": [None, 0, 1, 2],
+          "begin_mask": [None, 1, 2],
+          "end_mask": [None, 1, 2],
+          "shrink_axis_mask": [None, 1, 2, 3, -1],
+          "constant_indices": [False],
       },
   ]
 
@@ -1508,23 +1600,29 @@ def make_strided_slice_tests(zip_path):
         dtype=parameters["dtype"],
         name="input",
         shape=parameters["input_shape"])
-    begin = tf.placeholder(
-        dtype=parameters["index_type"],
-        name="begin",
-        shape=[len(parameters["input_shape"])])
-    end = tf.placeholder(
-        dtype=parameters["index_type"],
-        name="end",
-        shape=[len(parameters["input_shape"])])
-    strides = (
-        tf.placeholder(
-            dtype=parameters["index_type"],
-            name="strides",
-            shape=[len(parameters["input_shape"])])
-        if parameters["strides"] is not None else None)
-    tensors = [input_tensor, begin, end]
-    if strides is not None:
-      tensors.append(strides)
+    if parameters["constant_indices"]:
+      begin = parameters["begin"]
+      end = parameters["end"]
+      strides = parameters["strides"]
+      tensors = [input_tensor]
+    else:
+      begin = tf.placeholder(
+          dtype=parameters["index_type"],
+          name="begin",
+          shape=[len(parameters["input_shape"])])
+      end = tf.placeholder(
+          dtype=parameters["index_type"],
+          name="end",
+          shape=[len(parameters["input_shape"])])
+      strides = (
+          tf.placeholder(
+              dtype=parameters["index_type"],
+              name="strides",
+              shape=[len(parameters["input_shape"])])
+          if parameters["strides"] is not None else None)
+      tensors = [input_tensor, begin, end]
+      if strides is not None:
+        tensors.append(strides)
     out = tf.strided_slice(
         input_tensor,
         begin,
@@ -1539,14 +1637,17 @@ def make_strided_slice_tests(zip_path):
     input_values = create_tensor_data(parameters["dtype"],
                                       parameters["input_shape"])
     index_type = _TF_TYPE_INFO[parameters["index_type"]][0]
-    begin_values = np.array(parameters["begin"]).astype(index_type)
-    end_values = np.array(parameters["end"]).astype(index_type)
-    stride_values = (
-        np.array(parameters["strides"]).astype(index_type)
-        if parameters["strides"] is not None else None)
-    values = [input_values, begin_values, end_values]
-    if stride_values is not None:
-      values.append(stride_values)
+    values = [input_values]
+    if not parameters["constant_indices"]:
+      begin_values = np.array(parameters["begin"]).astype(index_type)
+      end_values = np.array(parameters["end"]).astype(index_type)
+      stride_values = (
+          np.array(parameters["strides"]).astype(index_type)
+          if parameters["strides"] is not None else None)
+      values.append(begin_values)
+      values.append(end_values)
+      if stride_values is not None:
+        values.append(stride_values)
 
     return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
 
diff --git a/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc b/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc
index 41652a07d21fbf022cb66a4022706cfee02d2c09..e8b425a5925adf8c36c615f8b0c98ea20fa39a69 100644
--- a/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc
+++ b/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc
@@ -47,9 +47,7 @@ tensorflow::Env* env = tensorflow::Env::Default();
 // Key is a substring of the test name and value is a bug number.
 // TODO(ahentz): make sure we clean this list up frequently.
 std::map<string, string> kBrokenTests = {
-    // Add doesn't support broadcasting.
-    {R"(^\/adda.*input_shape_1=\[1,3,4,3\],input_shape_2=\[3\])", "68500195"},
-    {R"(^\/mula.*input_shape_1=\[1,3,4,3\],input_shape_2=\[3\])", "68500195"},
+    // Sub and Div don't support broadcasting.
     {R"(^\/diva.*input_shape_1=\[1,3,4,3\],input_shape_2=\[3\])", "68500195"},
     {R"(^\/suba.*input_shape_1=\[1,3,4,3\],input_shape_2=\[3\])", "68500195"},
 
@@ -67,7 +65,11 @@ std::map<string, string> kBrokenTests = {
     // L2Norm only supports tensors with 4D or fewer.
     {R"(^\/l2normdim=.*,epsilon=.*,input_shape=\[.,.,.,.,.*\])", "67963684"},
 
-    // SpaceToBatch only supports 4D tensors.
+    // BatchToSpaceND doesn't support cropping. This catches test cases with
+    // non-const tensors as crops.
+    {R"(^\/batch_to_space_nd.*crops=\[\[1,1\],\[1,1\]\])", "70594634"},
+
+    // SpaceToBatchND only supports 4D tensors.
     {R"(^\/space_to_batch_nd.*input_shape=\[1,4,4,4,1,1\])", "70848787"},
 
     // L2Norm only works for dim=-1.
@@ -92,7 +94,7 @@ std::map<string, string> kBrokenTests = {
     {R"(^\/resize_bilinearalign_corners=True,.*,size=\[5,6\])", "72401483"},
 
     // Transpose only supports 1D-4D input tensors.
-    {R"(^\/transposedtype=.*,input_shape=\[.,.,.,.,.\],perm=.*)", "71545879"},
+    {R"(^\/transpose.*input_shape=\[.,.,.,.,.\])", "71545879"},
 };
 
 // Allows test data to be unzipped into a temporary directory and makes
@@ -239,8 +241,7 @@ INSTANTIATE_TESTS(avg_pool)
 INSTANTIATE_TESTS(space_to_batch_nd)
 INSTANTIATE_TESTS(batch_to_space_nd)
 INSTANTIATE_TESTS(concat)
-// TODO(b/71642435) re-enable this test
-// INSTANTIATE_TESTS(constant)
+INSTANTIATE_TESTS(constant)
 INSTANTIATE_TESTS(control_dep)
 INSTANTIATE_TESTS(conv)
 INSTANTIATE_TESTS(depthwiseconv)
diff --git a/tensorflow/contrib/lite/toco/BUILD b/tensorflow/contrib/lite/toco/BUILD
index ad8f0e4a47e5710351133fe3d620992b97059083..20c156a93262568cf0c6c349b44fbf3d3afa5bc4 100644
--- a/tensorflow/contrib/lite/toco/BUILD
+++ b/tensorflow/contrib/lite/toco/BUILD
@@ -160,6 +160,7 @@ cc_library(
     ],
     deps = [
         # Placeholder for internal file dependency.
+        "@protobuf_archive//:protobuf_headers",
         "//tensorflow/core:framework_lite",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
@@ -172,6 +173,7 @@ cc_library(
         "graph_transformations/convert_expanddims_to_reshape.cc",
         "graph_transformations/convert_pure_conv_to_depthwise.cc",
         "graph_transformations/convert_reorder_axes.cc",
+        "graph_transformations/convert_trivial_addn_to_add.cc",
         "graph_transformations/convert_trivial_transpose_to_reshape.cc",
         "graph_transformations/create_im2col_arrays.cc",
         "graph_transformations/dequantize.cc",
@@ -203,6 +205,7 @@ cc_library(
         "graph_transformations/remove_trivial_quantized_activation_func.cc",
         "graph_transformations/remove_trivial_reshape.cc",
         "graph_transformations/remove_unused_op.cc",
+        "graph_transformations/reorder_activation_functions.cc",
         "graph_transformations/resolve_batch_normalization.cc",
         "graph_transformations/resolve_batch_to_space_nd_attributes.cc",
         "graph_transformations/resolve_constant_binary.cc",
diff --git a/tensorflow/contrib/lite/toco/allocate_transient_arrays.cc b/tensorflow/contrib/lite/toco/allocate_transient_arrays.cc
index 5961d30bf5403df7fa6228e05124479d118dd279..49cc1fc2aa365925cde86ceb658ff2b354d06911 100644
--- a/tensorflow/contrib/lite/toco/allocate_transient_arrays.cc
+++ b/tensorflow/contrib/lite/toco/allocate_transient_arrays.cc
@@ -158,9 +158,7 @@ std::size_t TransientArraySize(const Model& model, const string& array_name,
         LOG(FATAL)
             << "A RNN state array, " << array_name << ", still does not "
             << "have a known data type after all graph transformations have "
-            << "run. That's mostly a toco bug --- sorry. For now, you can "
-            << "work around this issue by adding manually_create:true in the "
-            << "--rnn_state description of this RNN state.";
+            << "run.";
       }
     }
     LOG(FATAL) << "An array, " << array_name << ", still does not "
diff --git a/tensorflow/contrib/lite/toco/args.h b/tensorflow/contrib/lite/toco/args.h
index 8004a1a37ae48468e9bf22785ec02f8de54bf236..b97a4720a7c4e69f8b69574475d19e0522cfe86d 100644
--- a/tensorflow/contrib/lite/toco/args.h
+++ b/tensorflow/contrib/lite/toco/args.h
@@ -208,6 +208,7 @@ struct ParsedModelFlags {
   Arg<bool> dump_graphviz_video = Arg<bool>(false);
   Arg<bool> allow_nonexistent_arrays = Arg<bool>(false);
   Arg<bool> allow_nonascii_arrays = Arg<bool>(false);
+  Arg<string> arrays_extra_info_file;
 };
 
 // Flags that describe the operation you would like to do (what conversion
diff --git a/tensorflow/contrib/lite/toco/export_tensorflow.cc b/tensorflow/contrib/lite/toco/export_tensorflow.cc
index 4fc01dbc20272eb863b0b22d6a1ef7b27c499981..4c70b01a9d009268e47b64380b74fe9e66125737 100644
--- a/tensorflow/contrib/lite/toco/export_tensorflow.cc
+++ b/tensorflow/contrib/lite/toco/export_tensorflow.cc
@@ -519,6 +519,18 @@ void ConvertAddOperator(const Model& model, const AddOperator& src_op,
   (*add_op->mutable_attr())["T"].set_type(DT_FLOAT);
 }
 
+void ConvertAddNOperator(const Model& model, const AddNOperator& src_op,
+                         GraphDef* tensorflow_graph) {
+  auto* add_op = tensorflow_graph->add_node();
+  add_op->set_op("AddN");
+  add_op->set_name(src_op.outputs[0]);
+  for (const auto& input : src_op.inputs) {
+    *add_op->add_input() = input;
+  }
+  (*add_op->mutable_attr())["N"].set_i(src_op.inputs.size());
+  (*add_op->mutable_attr())["T"].set_type(DT_FLOAT);
+}
+
 void ConvertMulOperator(const Model& model, const MulOperator& src_op,
                         GraphDef* tensorflow_graph) {
   auto* add_op = tensorflow_graph->add_node();
@@ -609,7 +621,8 @@ void ConvertSoftmaxOperator(const Model& model, const SoftmaxOperator& src_op,
                             GraphDef* tensorflow_graph) {
   string softmax_input;
   Operator* providing_op = GetOpWithOutput(model, src_op.inputs[0]);
-  if (providing_op->type == OperatorType::kTensorFlowReshape) {
+  if (providing_op != nullptr &&
+      providing_op->type == OperatorType::kTensorFlowReshape) {
     softmax_input = src_op.inputs[0];
   } else {
     // Insert a reshape operator that reduces the dimensions down to the 2 that
@@ -1406,6 +1419,9 @@ void ConvertOperator(const Model& model, const Operator& src_op,
   } else if (src_op.type == OperatorType::kAdd) {
     ConvertAddOperator(model, static_cast<const AddOperator&>(src_op),
                        tensorflow_graph);
+  } else if (src_op.type == OperatorType::kAddN) {
+    ConvertAddNOperator(model, static_cast<const AddNOperator&>(src_op),
+                        tensorflow_graph);
   } else if (src_op.type == OperatorType::kMul) {
     ConvertMulOperator(model, static_cast<const MulOperator&>(src_op),
                        tensorflow_graph);
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/convert_trivial_addn_to_add.cc b/tensorflow/contrib/lite/toco/graph_transformations/convert_trivial_addn_to_add.cc
new file mode 100644
index 0000000000000000000000000000000000000000..dcaaddbf3b5409f0fc3ddaf32e23b1e5eefb6565
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/convert_trivial_addn_to_add.cc
@@ -0,0 +1,51 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+// This pass will convert an AddN operator with only 2 inputs into a regular Add
+// operator, to which more optimizations may apply.
+bool ConvertTrivialAddNToAdd::Run(Model* model, std::size_t op_index) {
+  auto addn_it = model->operators.begin() + op_index;
+  if (addn_it->get()->type != OperatorType::kAddN) {
+    return false;
+  }
+  AddNOperator* addn_op = static_cast<AddNOperator*>(addn_it->get());
+  CHECK_GE(addn_op->inputs.size(), 2);
+  CHECK_EQ(addn_op->outputs.size(), 1);
+
+  // We only reduce AddN with N=2 to a regular Add.
+  if (addn_op->inputs.size() != 2) {
+    return false;
+  }
+
+  // Copy inputs & outputs to regular Add.
+  auto* add_op = new AddOperator;
+  add_op->inputs.push_back(addn_op->inputs[0]);
+  add_op->inputs.push_back(addn_op->inputs[1]);
+  add_op->outputs = addn_op->outputs;
+
+  // Replace the AddN operator in the graph.
+  const auto add_it = model->operators.emplace(addn_it, add_op);
+  addn_it = add_it + 1;
+  CHECK_EQ(addn_it->get(), addn_op);
+  model->operators.erase(addn_it);
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/fuse_activation_functions.cc b/tensorflow/contrib/lite/toco/graph_transformations/fuse_activation_functions.cc
index 88e59664ec427841df6f20686238feacef6a47e9..ab943f72d1dd87ae9ff4bd53a807cd4923a88c38 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/fuse_activation_functions.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/fuse_activation_functions.cc
@@ -68,12 +68,7 @@ bool FuseActivationFunctions::Run(Model* model, std::size_t op_index) {
     return false;
   }
 
-  // TODO(b/72172404): Great many ops don't support activation function
-  // fusing. Switch to a categorizing function instead.
-  if (op->type == OperatorType::kConcatenation ||
-      op->type == OperatorType::kSlice ||
-      op->type == OperatorType::kTensorFlowReshape ||
-      op->type == OperatorType::kTensorFlowSplit) {
+  if (!OperatorSupportsFusedActivation(op->type)) {
     AddMessageF(
         "Not fusing activation function because the %s op doesn't support it",
         LogName(*op));
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h b/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
index 4ac2265be98bc307dd47f3d22040a6f17893c4f1..cf90ebe99697fe8a40b4c707e70fdc5318123854 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
+++ b/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
@@ -114,6 +114,7 @@ void RunGraphTransformations(Model* model, const string& message,
 // List of all graph transformations
 DECLARE_GRAPH_TRANSFORMATION(ConvertExpandDimsToReshape)
 DECLARE_GRAPH_TRANSFORMATION(ConvertPureConvToDepthwise)
+DECLARE_GRAPH_TRANSFORMATION(ConvertTrivialAddNToAdd)
 DECLARE_GRAPH_TRANSFORMATION(ConvertTrivialTransposeToReshape)
 DECLARE_GRAPH_TRANSFORMATION(ConvertReorderAxes)
 DECLARE_GRAPH_TRANSFORMATION(EnsureBiasVectors)
@@ -143,6 +144,7 @@ DECLARE_GRAPH_TRANSFORMATION(ResolveConstantUnaryOperator)
 DECLARE_GRAPH_TRANSFORMATION(CreateIm2colArrays)
 DECLARE_GRAPH_TRANSFORMATION(DropIm2colArrays)
 DECLARE_GRAPH_TRANSFORMATION(ReadFakeQuantMinMax)
+DECLARE_GRAPH_TRANSFORMATION(ReorderActivationFunctions)
 DECLARE_GRAPH_TRANSFORMATION(ResolveReorderAxes)
 DECLARE_GRAPH_TRANSFORMATION(ResolveTensorFlowConcat)
 DECLARE_GRAPH_TRANSFORMATION(ResolveTensorFlowMatMul)
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/identify_lstm.cc b/tensorflow/contrib/lite/toco/graph_transformations/identify_lstm.cc
index 082820fddcf137238867239bbc4d4eed8158e307..c363b93394f0af7bcfc37c1e8be5f98aca6667ae 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/identify_lstm.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/identify_lstm.cc
@@ -16,7 +16,6 @@ limitations under the License.
 #include <string>
 #include <vector>
 
-#include "absl/strings/string_view.h"
 #include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
 #include "tensorflow/contrib/lite/toco/model.h"
 #include "tensorflow/contrib/lite/toco/tooling_util.h"
@@ -202,23 +201,6 @@ bool MatchOperatorInputs(const Operator& op, const Model& model,
   return true;
 }
 
-absl::string_view FindLongestCommonPrefix(absl::string_view a,
-                                          absl::string_view b) {
-  if (a.empty() || b.empty()) return absl::string_view();
-
-  const char* pa = a.data();
-  const char* pb = b.data();
-  size_t count = 0;
-  const ssize_t limit = std::min(a.size(), b.size());
-  while (count < limit && *pa == *pb) {
-    ++pa;
-    ++pb;
-    ++count;
-  }
-
-  return absl::string_view(a.data(), count);
-}
-
 }  // namespace
 
 bool IdentifyLstmCell::Run(Model* model, std::size_t op_index) {
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
index ff0a3bd8819dd6f3413cac11b0a64b727a37bd3d..7f26884bc150b6f2eb81058af29d40c7749882a3 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
@@ -406,6 +406,28 @@ void ProcessSimpleBinaryOperator(Model* model, Operator* op) {
                                   &output_array);
 }
 
+void ProcessAddNOperator(Model* model, Operator* op) {
+  // Yield until all input dims have been resolved.
+  //
+  // TODO(myenik): Since AddN does not support broadcasting, maybe we could
+  // actually use this to improve shape propagation by propagating the shape of
+  // one input to all other inputs once it is resolved instead of just the
+  // output, since all inputs must be the same size and shape for a well-formed
+  // graph.
+  for (const auto& input : op->inputs) {
+    const auto& input_array = model->GetArray(input);
+    if (!input_array.has_shape()) {
+      return;
+    }
+  }
+
+  // AddN does not support broadcasting, all inputs must be the same shape, so
+  // we just take the first input shape and apply it to the output.
+  const auto& input0_array = model->GetArray(op->inputs[0]);
+  auto& output_array = model->GetArray(op->outputs[0]);
+  output_array.copy_shape(input0_array.shape());
+}
+
 bool KeepDims(const Operator& op) {
   switch (op.type) {
     case OperatorType::kTensorFlowMin:
@@ -1098,7 +1120,8 @@ void ProcessStridedSliceOperator(Model* model, StridedSliceOperator* op) {
       stop += input_array.shape().dims(i);
     }
 
-    int dim_size = (stop - start) / op->strides[i];
+    int dim_size = ceil((stop - start) / static_cast<float>(op->strides[i]));
+    dim_size = dim_size < 0 ? 0 : dim_size;
     if (op->shrink_axis_mask & mask) {
       CHECK_EQ(dim_size, 1) << "Output size for an axis must compute to 1 when "
                                "shrinking that axis";
@@ -1282,6 +1305,9 @@ bool PropagateFixedSizes::Run(Model* model, std::size_t op_index) {
     case OperatorType::kTensorFlowGreaterEqual:
       ProcessSimpleBinaryOperator(model, op);
       break;
+    case OperatorType::kAddN:
+      ProcessAddNOperator(model, op);
+      break;
     case OperatorType::kConv:
       ProcessConvOperator(model, static_cast<ConvOperator*>(op));
       break;
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_binary.cc b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_binary.cc
index 8512e6bb5ada41766a0ab6a4c06de060b898b1b4..95a50c61794092b02e518d1f08d8cf4a668353a8 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_binary.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_binary.cc
@@ -89,14 +89,14 @@ bool RemoveTrivialBinaryOperator::Run(Model* model, std::size_t op_index) {
   const auto& constant_input_float_data =
       constant_input_array.GetBuffer<ArrayDataType::kFloat>().data;
   bool is_trivial = false;
-  if (binary_op->type != OperatorType::kAdd) {
+  if (binary_op->type == OperatorType::kAdd) {
     is_trivial = AreAllBufferElementsEqualTo(constant_input_float_data, 0.f);
-  } else if (binary_op->type != OperatorType::kSub) {
+  } else if (binary_op->type == OperatorType::kSub) {
     is_trivial = index_of_constant_input == 1 &&
                  AreAllBufferElementsEqualTo(constant_input_float_data, 0.f);
-  } else if (binary_op->type != OperatorType::kMul) {
+  } else if (binary_op->type == OperatorType::kMul) {
     is_trivial = AreAllBufferElementsEqualTo(constant_input_float_data, 1.f);
-  } else if (binary_op->type != OperatorType::kDiv) {
+  } else if (binary_op->type == OperatorType::kDiv) {
     is_trivial = index_of_constant_input == 1 &&
                  AreAllBufferElementsEqualTo(constant_input_float_data, 1.f);
   }
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/reorder_activation_functions.cc b/tensorflow/contrib/lite/toco/graph_transformations/reorder_activation_functions.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cabbc4d313be3069053f056eb0de45c37ba2e7a4
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/reorder_activation_functions.cc
@@ -0,0 +1,85 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/runtime/types.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+bool ReorderActivationFunctions::Run(Model* model, std::size_t op_index) {
+  const auto ac_it = model->operators.begin() + op_index;
+  std::unique_ptr<Operator>& ac_op = *ac_it;
+  DCHECK(ac_op);
+
+  if (ac_op->type != OperatorType::kRelu6 &&
+      ac_op->type != OperatorType::kRelu1 &&
+      ac_op->type != OperatorType::kRelu) {
+    return false;
+  }
+
+  auto exchange_it = FindOpWithOutput(*model, ac_op->inputs[0]);
+  if (exchange_it == model->operators.end()) return false;
+  // Find the op producing the array passed to this activation function
+  std::unique_ptr<Operator>& exchange_op = *exchange_it;
+  DCHECK(exchange_op);
+
+  if (exchange_op->type != OperatorType::kTensorFlowReshape) {
+    return false;
+  }
+
+  DCHECK_EQ(exchange_op->outputs[0], ac_op->inputs[0]);
+  const auto& exchange_op_input = exchange_op->inputs[0];
+  const auto& intermediate_array = exchange_op->outputs[0];
+  const auto& ac_op_output = ac_op->outputs[0];
+
+  int count_ops_consuming_output =
+      CountOpsWithInput(*model, intermediate_array);
+  DCHECK_GE(count_ops_consuming_output, 1);
+  if (count_ops_consuming_output > 1) {
+    AddMessageF(
+        "Not exchanging activation function with %s because it is consumed by "
+        "more than 1 other operator",
+        LogName(*exchange_op));
+    return false;
+  }
+
+  // Rewire by changing inputs, including all consumers.
+  Operator* consumer = GetFirstOpWithInput(*model, ac_op_output);
+  while (consumer) {
+    for (int i = 0; i < consumer->inputs.size(); ++i) {
+      if (consumer->inputs[i] == ac_op_output) {
+        consumer->inputs[i] = intermediate_array;
+      }
+    }
+    consumer = GetFirstOpWithInput(*model, ac_op_output);
+  }
+  ac_op->inputs[0] = exchange_op_input;
+  exchange_op->inputs[0] = ac_op_output;
+
+  // Finally, reorder operators.  Note that this only works when there are no
+  // other direct descendents of the exchange_op.
+  ac_op.swap(exchange_op);
+
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/import_tensorflow.cc b/tensorflow/contrib/lite/toco/import_tensorflow.cc
index e8f318cd4329272e2a76e61e1d8d07027fcd2408..9862dbe99d5c2d1e67718a43389ce905bd8eeb2e 100644
--- a/tensorflow/contrib/lite/toco/import_tensorflow.cc
+++ b/tensorflow/contrib/lite/toco/import_tensorflow.cc
@@ -173,7 +173,8 @@ void ImportFloatArray(const TensorProto& input_tensor, Array* output_array) {
   }
   auto& output_float_data =
       output_array->GetMutableBuffer<ArrayDataType::kFloat>().data;
-  output_float_data.resize(input_flat_size);
+  output_float_data.resize(RequiredBufferSizeForShape(output_array->shape()),
+                           0.f);
   if (input_tensor.float_val_size() == 1) {
     for (int i = 0; i < input_flat_size; i++) {
       output_float_data[i] = input_tensor.float_val(0);
@@ -203,7 +204,7 @@ void ImportQuint8Array(const TensorProto& input_tensor, Array* output_array) {
   }
   auto& output_int_data =
       output_array->GetMutableBuffer<ArrayDataType::kUint8>().data;
-  output_int_data.resize(input_flat_size);
+  output_int_data.resize(RequiredBufferSizeForShape(output_array->shape()), 0);
   if (input_tensor.int_val_size()) {
     for (int i = 0; i < input_tensor.int_val_size(); i++) {
       output_int_data[i] = input_tensor.int_val(i);
@@ -229,7 +230,7 @@ void ImportInt32Array(const TensorProto& input_tensor, Array* output_array) {
   }
   auto& output_int_data =
       output_array->GetMutableBuffer<ArrayDataType::kInt32>().data;
-  output_int_data.resize(input_flat_size);
+  output_int_data.resize(RequiredBufferSizeForShape(output_array->shape()), 0);
   if (input_tensor.int_val_size()) {
     for (int i = 0; i < input_tensor.int_val_size(); i++) {
       output_int_data[i] = input_tensor.int_val(i);
@@ -255,7 +256,7 @@ void ImportInt64Array(const TensorProto& input_tensor, Array* output_array) {
   }
   auto& output_int_data =
       output_array->GetMutableBuffer<ArrayDataType::kInt64>().data;
-  output_int_data.resize(input_flat_size);
+  output_int_data.resize(RequiredBufferSizeForShape(output_array->shape()), 0);
   if (input_tensor.int64_val_size()) {
     for (int i = 0; i < input_tensor.int64_val_size(); i++) {
       output_int_data[i] = input_tensor.int64_val(i);
@@ -281,7 +282,7 @@ void ImportStringArray(const TensorProto& input_tensor, Array* output_array) {
   }
   auto& output_string_data =
       output_array->GetMutableBuffer<ArrayDataType::kString>().data;
-  output_string_data.resize(input_flat_size);
+  output_string_data.resize(RequiredBufferSizeForShape(output_array->shape()));
   if (input_flat_size != input_tensor.string_val_size()) {
     LOG(FATAL) << "Input_content string_val doesn't have the right "
                   "dimensions for this string tensor.";
@@ -696,6 +697,19 @@ void ConvertAddOperator(const NodeDef& node,
   model->operators.emplace_back(op);
 }
 
+void ConvertAddNOperator(const NodeDef& node,
+                         const TensorFlowImportFlags& tf_import_flags,
+                         Model* model) {
+  CHECK_EQ(node.op(), "AddN");
+  const int num_inputs = GetInputsCount(node, tf_import_flags);
+  auto* op = new AddNOperator;
+  for (int i = 0; i < num_inputs; ++i) {
+    op->inputs.push_back(node.input(i));
+  }
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(op);
+}
+
 void ConvertMulOperator(const NodeDef& node,
                         const TensorFlowImportFlags& tf_import_flags,
                         Model* model) {
@@ -1862,6 +1876,8 @@ std::unique_ptr<Model> ImportTensorFlowGraphDef(
       ConvertSquareOperator(node, tf_import_flags, model);
     } else if (node.op() == "Add") {
       ConvertAddOperator(node, tf_import_flags, model);
+    } else if (node.op() == "AddN") {
+      ConvertAddNOperator(node, tf_import_flags, model);
     } else if (node.op() == "Mul") {
       ConvertMulOperator(node, tf_import_flags, model);
     } else if (node.op() == "Sub") {
diff --git a/tensorflow/contrib/lite/toco/model.h b/tensorflow/contrib/lite/toco/model.h
index b1b9b718bb1da6133d4f761db2ad1453feac84f3..6fba8f2629f785ffeb3ae37b80ec1d24c29d9d56 100644
--- a/tensorflow/contrib/lite/toco/model.h
+++ b/tensorflow/contrib/lite/toco/model.h
@@ -15,6 +15,7 @@ limitations under the License.
 #ifndef TENSORFLOW_CONTRIB_LITE_TOCO_MODEL_H_
 #define TENSORFLOW_CONTRIB_LITE_TOCO_MODEL_H_
 
+#include <functional>
 #include <initializer_list>
 #include <memory>
 #include <string>
@@ -32,6 +33,7 @@ enum class OperatorType {
   kNone,
   // General-purpose neural network operators.
   kAdd,
+  kAddN,
   kAveragePool,
   kBatchNormalization,
   kConv,
@@ -559,6 +561,16 @@ struct AddOperator : Operator {
   AddOperator() : Operator(OperatorType::kAdd) {}
 };
 
+// Element-wise addition operator for N inputs.
+//
+// Inputs:
+//   inputs[i]: The i-th array to add together to form the output.
+//
+// TensorFlow equivalent: AddN
+struct AddNOperator : Operator {
+  AddNOperator() : Operator(OperatorType::kAddN) {}
+};
+
 // Concatenation operator: concatenates its inputs
 // along the axis.
 //
diff --git a/tensorflow/contrib/lite/toco/model_cmdline_flags.cc b/tensorflow/contrib/lite/toco/model_cmdline_flags.cc
index 790b3443cef1c577e19bafc5e087ca42e6fce60a..4e2dec15a534607ef9207149a2e6061069eabcb1 100644
--- a/tensorflow/contrib/lite/toco/model_cmdline_flags.cc
+++ b/tensorflow/contrib/lite/toco/model_cmdline_flags.cc
@@ -148,6 +148,12 @@ bool ParseModelFlagsFromCommandLineFlags(
            "ranging from 32 to 127. This is disallowed by default so as to "
            "catch common copy-and-paste issues where invisible unicode "
            "characters are unwittingly added to these strings."),
+      Flag(
+          "arrays_extra_info_file", parsed_flags.arrays_extra_info_file.bind(),
+          parsed_flags.arrays_extra_info_file.default_value(),
+          "Path to an optional file containing a serialized ArraysExtraInfo "
+          "proto allowing to pass extra information about arrays not specified "
+          "in the input model file, such as extra MinMax information."),
   };
   bool asked_for_help =
       *argc == 2 && (!strcmp(argv[1], "--help") || !strcmp(argv[1], "-help"));
@@ -327,9 +333,6 @@ void ReadModelFlagsFromCommandLineFlags(
         CHECK(absl::SimpleAtoi(value, &size));
         CHECK_GT(size, 0);
         rnn_state_proto->set_size(size);
-      } else if (key == "manually_create") {
-        CHECK_EQ(absl::AsciiStrToLower(value), "true");
-        rnn_state_proto->set_manually_create(true);
       } else {
         LOG(FATAL) << "Unknown key '" << key << "' in --rnn_states";
       }
@@ -368,6 +371,15 @@ void ReadModelFlagsFromCommandLineFlags(
       parsed_model_flags.allow_nonascii_arrays.value());
   model_flags->set_allow_nonexistent_arrays(
       parsed_model_flags.allow_nonexistent_arrays.value());
+
+  if (parsed_model_flags.arrays_extra_info_file.specified()) {
+    string arrays_extra_info_file_contents;
+    port::file::GetContents(parsed_model_flags.arrays_extra_info_file.value(),
+                            &arrays_extra_info_file_contents,
+                            port::file::Defaults());
+    ParseFromStringEitherTextOrBinary(arrays_extra_info_file_contents,
+                                      model_flags->mutable_arrays_extra_info());
+  }
 }
 
 ParsedModelFlags* UncheckedGlobalParsedModelFlags(bool must_already_exist) {
diff --git a/tensorflow/contrib/lite/toco/model_flags.proto b/tensorflow/contrib/lite/toco/model_flags.proto
index 13fea29a07ed9ea75ebe1b9b046f2a68d814c649..e4b39b34e85e4d703c1b41cb68f8139abd1f6279 100644
--- a/tensorflow/contrib/lite/toco/model_flags.proto
+++ b/tensorflow/contrib/lite/toco/model_flags.proto
@@ -81,19 +81,26 @@ message RnnState {
   optional string state_array = 1;
   optional string back_edge_source_array = 2;
   optional bool discardable = 5;
-  // TODO(benoitjacob): drop the 'size' field. Should be redundant with
-  // --input_shapes and shapes propagation.
+  // size allows to specify a 1-D shape for the RNN state array.
+  // Will be expanded with 1's to fit the model.
+  // TODO(benoitjacob): should allow a generic, explicit shape.
   optional int32 size = 3;
-  // TODO(benoitjacob): manually_create is a temporary hack:
-  // due to discrepancies between the current toco dims tracking and
-  // TensorFlow shapes, for some models we need to manually create RNN state
-  // arrays with a specified shape.
-  // Maybe we should actually implement back-edges as operators of their own,
-  // which would remove the need for much special-casing, including here,
-  // we could probably consistently let PropagateFixedSizes handle state
-  // arrays.
-  // TODO(benoitjacob): should really drop manually_create now.
-  optional bool manually_create = 4;
+}
+
+// An ArraysExtraInfo message stores a collection of additional Information
+// about arrays in a model, complementing the information in the model itself.
+// It is intentionally a separate message so that it may be serialized and
+// passed separately from the model. See --arrays_extra_info_file.
+//
+// A typical use case is to manually specify MinMax for specific arrays in a
+// model that does not already contain such MinMax information.
+message ArraysExtraInfo {
+  message Entry {
+    optional string name = 1;
+    optional float min = 2;
+    optional float max = 3;
+  }
+  repeated Entry entries = 1;
 }
 
 // ModelFlags encodes properties of a model that, depending on the file
@@ -117,7 +124,7 @@ message RnnState {
 //   optional int32 input_dims = 11 [ default = 4];
 //   repeated int32 input_shape = 13;
 //
-// Next ID to USE: 18.
+// Next ID to USE: 19.
 message ModelFlags {
   // Information about the input arrays, i.e. the arrays from which input
   // activations will be read.
@@ -160,4 +167,8 @@ message ModelFlags {
   // catch common copy-and-paste issues where invisible unicode
   // characters are unwittingly added to these strings.
   optional bool allow_nonascii_arrays = 17;
+
+  // If set, this ArraysExtraInfo allows to pass extra information about arrays
+  // not specified in the input model file, such as extra MinMax information.
+  optional ArraysExtraInfo arrays_extra_info = 18;
 }
diff --git a/tensorflow/contrib/lite/toco/tensorflow_util.cc b/tensorflow/contrib/lite/toco/tensorflow_util.cc
index 82e2800ca2f5bb017f91b5bf43d8d3cd05e97b83..0e7e9c41a066581b14fe1b78f83d8d57b916be6c 100644
--- a/tensorflow/contrib/lite/toco/tensorflow_util.cc
+++ b/tensorflow/contrib/lite/toco/tensorflow_util.cc
@@ -51,7 +51,8 @@ void LogDumpGraphDef(int log_level, const string& message,
 BEGIN DUMP OF TENSORFLOW GRAPHDEF (%s)
 There are %d nodes.
 There are %zu different op types:
-)MSG", message, tf_graph.node_size(), ops.size());
+)MSG",
+                      message, tf_graph.node_size(), ops.size());
   for (const auto& op : ops) {
     toco::port::AppendF(&dump, "  %s\n", op);
   }
@@ -63,7 +64,8 @@ PROTO DUMP
 BEGIN NODE: name = %s
   op = %s
   inputs = [
-)MSG", node.name(), node.op());
+)MSG",
+                        node.name(), node.op());
     for (const auto& input : node.input()) {
       toco::port::AppendF(&dump, "    %s\n", input);
     }
diff --git a/tensorflow/contrib/lite/toco/tflite/BUILD b/tensorflow/contrib/lite/toco/tflite/BUILD
index 72c926656449da981abf6c11c03cd7c00a634ce7..a2b8145a67278c3ac0065f9551da6ffd1de60772 100644
--- a/tensorflow/contrib/lite/toco/tflite/BUILD
+++ b/tensorflow/contrib/lite/toco/tflite/BUILD
@@ -117,6 +117,7 @@ cc_library(
         ":types",
         "//tensorflow/contrib/lite/schema:schema_fbs",
         "//tensorflow/contrib/lite/toco:model",
+        "//tensorflow/contrib/lite/toco:tooling_util",
         "@flatbuffers",
     ],
 )
diff --git a/tensorflow/contrib/lite/toco/tflite/export.cc b/tensorflow/contrib/lite/toco/tflite/export.cc
index 391ef87029d019ab52af2716f72883f5f82f94d9..27719599708a7eb14f72a82f8e5d76b3b8af9dc4 100644
--- a/tensorflow/contrib/lite/toco/tflite/export.cc
+++ b/tensorflow/contrib/lite/toco/tflite/export.cc
@@ -26,6 +26,9 @@ namespace toco {
 
 namespace tflite {
 
+using flatbuffers::FlatBufferBuilder;
+using flatbuffers::Offset;
+using flatbuffers::Vector;
 using ::tflite::Buffer;
 using ::tflite::BuiltinOperator;
 using ::tflite::BuiltinOperator_CUSTOM;
@@ -39,9 +42,6 @@ using ::tflite::Operator;
 using ::tflite::OperatorCode;
 using ::tflite::SubGraph;
 using ::tflite::Tensor;
-using flatbuffers::FlatBufferBuilder;
-using flatbuffers::Offset;
-using flatbuffers::Vector;
 
 namespace {
 
diff --git a/tensorflow/contrib/lite/toco/tflite/import.cc b/tensorflow/contrib/lite/toco/tflite/import.cc
index bbf201fd288140d990b8f739adcd9244e1196072..5b1ab514b23248cd98e66847185d0e8b9fe2d6aa 100644
--- a/tensorflow/contrib/lite/toco/tflite/import.cc
+++ b/tensorflow/contrib/lite/toco/tflite/import.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include "tensorflow/contrib/lite/schema/schema_generated.h"
 #include "tensorflow/contrib/lite/toco/tflite/operator.h"
 #include "tensorflow/contrib/lite/toco/tflite/types.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
 
 namespace toco {
 
@@ -119,8 +120,16 @@ void ImportOperators(
     auto inputs = input_op->inputs();
     for (int i = 0; i < inputs->Length(); i++) {
       auto input_index = inputs->Get(i);
-      const string& input_name = tensors_table.at(input_index);
-      op->inputs.push_back(input_name);
+      // input_index == -1 indicates optional tensor.
+      if (input_index != -1) {
+        const string& input_name = tensors_table.at(input_index);
+        op->inputs.push_back(input_name);
+      } else {
+        const string& tensor_name =
+            toco::AvailableArrayName(*model, "OptionalTensor");
+        model->CreateOptionalArray(tensor_name);
+        op->inputs.push_back(tensor_name);
+      }
     }
     auto outputs = input_op->outputs();
     for (int i = 0; i < outputs->Length(); i++) {
diff --git a/tensorflow/contrib/lite/toco/tflite/operator.cc b/tensorflow/contrib/lite/toco/tflite/operator.cc
index d75d1fcc5b0c2678760d4b70d99682cffe56f216..461494fd99e8d09b3b9fa0add87be64a89faee26 100644
--- a/tensorflow/contrib/lite/toco/tflite/operator.cc
+++ b/tensorflow/contrib/lite/toco/tflite/operator.cc
@@ -140,25 +140,11 @@ class SpaceToBatchND
   flatbuffers::Offset<TfLiteOptions> WriteOptions(
       const TocoOperator& op,
       flatbuffers::FlatBufferBuilder* builder) const override {
-    auto block_shape = builder->CreateVector(op.block_shape);
-    auto before_paddings = builder->CreateVector(op.before_paddings);
-    auto after_paddings = builder->CreateVector(op.after_paddings);
-    return ::tflite::CreateSpaceToBatchNDOptions(
-        *builder, block_shape, before_paddings, after_paddings);
+    return ::tflite::CreateSpaceToBatchNDOptions(*builder);
   }
 
   void ReadOptions(const TfLiteOptions& options,
-                   TocoOperator* op) const override {
-    op->block_shape.insert(op->block_shape.end(),
-                           options.block_shape()->begin(),
-                           options.block_shape()->end());
-    op->before_paddings.insert(op->before_paddings.end(),
-                               options.before_paddings()->begin(),
-                               options.before_paddings()->end());
-    op->after_paddings.insert(op->after_paddings.end(),
-                              options.after_paddings()->begin(),
-                              options.after_paddings()->end());
-  }
+                   TocoOperator* op) const override {}
 };
 
 class Sub : public BuiltinOperator<SubOperator, ::tflite::SubOptions,
@@ -211,25 +197,11 @@ class BatchToSpaceND
   flatbuffers::Offset<TfLiteOptions> WriteOptions(
       const TocoOperator& op,
       flatbuffers::FlatBufferBuilder* builder) const override {
-    auto block_shape = builder->CreateVector(op.block_shape);
-    auto before_crops = builder->CreateVector(op.before_crops);
-    auto after_crops = builder->CreateVector(op.after_crops);
-    return ::tflite::CreateBatchToSpaceNDOptions(*builder, block_shape,
-                                                 before_crops, after_crops);
+    return ::tflite::CreateBatchToSpaceNDOptions(*builder);
   }
 
   void ReadOptions(const TfLiteOptions& options,
-                   TocoOperator* op) const override {
-    op->block_shape.insert(op->block_shape.end(),
-                           options.block_shape()->begin(),
-                           options.block_shape()->end());
-    op->before_crops.insert(op->before_crops.end(),
-                            options.before_crops()->begin(),
-                            options.before_crops()->end());
-    op->after_crops.insert(op->after_crops.end(),
-                           options.after_crops()->begin(),
-                           options.after_crops()->end());
-  }
+                   TocoOperator* op) const override {}
 };
 
 class Cast : public CustomOperator<CastOperator> {
@@ -478,8 +450,7 @@ class Pad : public BuiltinOperator<PadOperator, ::tflite::PadOptions,
   }
 
   void ReadOptions(const TfLiteOptions& options,
-                   TocoOperator* op) const override {
-  }
+                   TocoOperator* op) const override {}
 };
 
 class Reshape
@@ -546,15 +517,11 @@ class Transpose
   flatbuffers::Offset<TfLiteOptions> WriteOptions(
       const TocoOperator& op,
       flatbuffers::FlatBufferBuilder* builder) const override {
-    return ::tflite::CreateTransposeOptions(*builder,
-                                            builder->CreateVector(op.perm));
+    return ::tflite::CreateTransposeOptions(*builder);
   }
 
   void ReadOptions(const TfLiteOptions& options,
-                   TocoOperator* op) const override {
-    op->perm.insert(op->perm.end(), options.perm()->begin(),
-                    options.perm()->end());
-  }
+                   TocoOperator* op) const override {}
 };
 
 class Mean : public BuiltinOperator<MeanOperator, ::tflite::MeanOptions,
@@ -564,14 +531,11 @@ class Mean : public BuiltinOperator<MeanOperator, ::tflite::MeanOptions,
   flatbuffers::Offset<TfLiteOptions> WriteOptions(
       const TocoOperator& op,
       flatbuffers::FlatBufferBuilder* builder) const override {
-    auto axis = builder->CreateVector(op.axis);
-    return ::tflite::CreateMeanOptions(*builder, axis, op.keep_dims);
+    return ::tflite::CreateMeanOptions(*builder, op.keep_dims);
   }
 
   void ReadOptions(const TfLiteOptions& options,
                    TocoOperator* op) const override {
-    op->axis.insert(op->axis.end(), options.axis()->begin(),
-                    options.axis()->end());
     op->keep_dims = options.keep_dims();
   }
 };
@@ -807,6 +771,8 @@ std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList() {
 
   // There operators are supported by Toco, but not by TF Lite, and has no
   // attributes.
+  ops.emplace_back(
+      new SimpleOperator<AddNOperator>("ADDN", OperatorType::kAddN));
   ops.emplace_back(new SimpleOperator<NegOperator>("NEG", OperatorType::kNeg));
   ops.emplace_back(new SimpleOperator<TensorFlowRsqrtOperator>(
       "RSQRT", OperatorType::kTensorFlowRsqrt));
diff --git a/tensorflow/contrib/lite/toco/tflite/operator_test.cc b/tensorflow/contrib/lite/toco/tflite/operator_test.cc
index 9036a16d1c928702a71ccbe3fdad826fb037fcaf..6daa296282e381b8ff22cf75eb5e8fb87c68be9e 100644
--- a/tensorflow/contrib/lite/toco/tflite/operator_test.cc
+++ b/tensorflow/contrib/lite/toco/tflite/operator_test.cc
@@ -119,40 +119,12 @@ TEST_F(OperatorTest, BuiltinAdd) {
             output_toco_op->fused_activation_function);
 }
 
-TEST_F(OperatorTest, BuiltinSpaceToBatchND) {
-  SpaceToBatchNDOperator op;
-  op.block_shape = {2, 2};
-  op.before_paddings = {1, 2};
-  op.after_paddings = {3, 4};
-
-  auto output_toco_op = SerializeAndDeserialize(
-      GetOperator("SPACE_TO_BATCH_ND", OperatorType::kSpaceToBatchND), op);
-  EXPECT_EQ(op.block_shape, output_toco_op->block_shape);
-  EXPECT_EQ(op.before_paddings, output_toco_op->before_paddings);
-  EXPECT_EQ(op.after_paddings, output_toco_op->after_paddings);
-}
-
-TEST_F(OperatorTest, BuiltinBatchToSpaceND) {
-  BatchToSpaceNDOperator op;
-  op.block_shape = {2, 2};
-  op.before_crops = {1, 2};
-  op.after_crops = {3, 4};
-
-  auto output_toco_op = SerializeAndDeserialize(
-      GetOperator("BATCH_TO_SPACE_ND", OperatorType::kBatchToSpaceND), op);
-  EXPECT_EQ(op.block_shape, output_toco_op->block_shape);
-  EXPECT_EQ(op.before_crops, output_toco_op->before_crops);
-  EXPECT_EQ(op.after_crops, output_toco_op->after_crops);
-}
-
 TEST_F(OperatorTest, BuiltinMean) {
   MeanOperator op;
-  op.axis = {1, 2};
   op.keep_dims = false;
 
   auto output_toco_op =
       SerializeAndDeserialize(GetOperator("MEAN", OperatorType::kMean), op);
-  EXPECT_EQ(op.axis, output_toco_op->axis);
   EXPECT_EQ(op.keep_dims, output_toco_op->keep_dims);
 }
 
@@ -370,15 +342,6 @@ TEST_F(OperatorTest, Svdf) {
   EXPECT_EQ(op.rank, output_toco_op->rank);
 }
 
-TEST_F(OperatorTest, Transpose) {
-  TransposeOperator op;
-  op.perm = {0, 1, 2, 3};
-
-  auto output_toco_op = SerializeAndDeserialize(
-      GetOperator("TRANSPOSE", OperatorType::kTranspose), op);
-  EXPECT_EQ(op.perm, output_toco_op->perm);
-}
-
 TEST_F(OperatorTest, Squeeze) {
   SqueezeOperator op;
   op.squeeze_dims = {-2, -3, 4, 1, 4};
diff --git a/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc b/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc
index f8281f3a5725283d472e5e1a36e4d904b4dc1c49..c5a62fdb620ee7d6b7195f6e8e2bc3cb208feb10 100644
--- a/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc
+++ b/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc
@@ -44,9 +44,11 @@ bool ParseTocoFlagsFromCommandLineFlags(
            "For Protobuf formats, the binary format will be used."),
       Flag("input_format", parsed_flags.input_format.bind(),
            parsed_flags.input_format.default_value(),
-           "Input file format. One of: tensorflow_graphdef, "),
+           "Input file format. One of: TENSORFLOW_GRAPHDEF, TFLITE."),
       Flag("output_format", parsed_flags.output_format.bind(),
-           parsed_flags.output_format.default_value(), "Output file format."),
+           parsed_flags.output_format.default_value(),
+           "Output file format. "
+           "One of TENSORFLOW_GRAPHDEF, TFLITE, GRAPHVIZ_DOT."),
       Flag("default_ranges_min", parsed_flags.default_ranges_min.bind(),
            parsed_flags.default_ranges_min.default_value(),
            "If defined, will be used as the default value for the min bound "
@@ -58,11 +60,13 @@ bool ParseTocoFlagsFromCommandLineFlags(
       Flag("inference_type", parsed_flags.inference_type.bind(),
            parsed_flags.inference_type.default_value(),
            "Target data type of arrays in the output file (for input_arrays, "
-           "this may be overridden by inference_input_type)."),
+           "this may be overridden by inference_input_type). "
+           "One of FLOAT, QUANTIZED_UINT8."),
       Flag("inference_input_type", parsed_flags.inference_input_type.bind(),
            parsed_flags.inference_input_type.default_value(),
-           "Target data type of input arrays. If not specified, inference_type "
-           "is used."),
+           "Target data type of input arrays. "
+           "If not specified, inference_type is used. "
+           "One of FLOAT, QUANTIZED_UINT8."),
       Flag("input_type", parsed_flags.input_type.bind(),
            parsed_flags.input_type.default_value(),
            "Deprecated ambiguous flag that set both --input_data_types and "
@@ -76,35 +80,31 @@ bool ParseTocoFlagsFromCommandLineFlags(
 
       Flag("drop_fake_quant", parsed_flags.drop_fake_quant.bind(),
            parsed_flags.drop_fake_quant.default_value(),
-           "Ignore and discard FakeQuant nodes. For instance, that can be used "
-           "to "
+           "Ignore and discard FakeQuant nodes. For instance, to "
            "generate plain float code without fake-quantization from a "
-           "quantized "
-           "graph."),
+           "quantized graph."),
       Flag(
           "reorder_across_fake_quant",
           parsed_flags.reorder_across_fake_quant.bind(),
           parsed_flags.reorder_across_fake_quant.default_value(),
           "Normally, FakeQuant nodes must be strict boundaries for graph "
           "transformations, in order to ensure that quantized inference has "
-          "the "
-          "exact same arithmetic behavior as quantized training --- which is "
-          "the "
-          "whole point of quantized training and of FakeQuant nodes in the "
-          "first "
-          "place. However, that entails subtle requirements on where exactly "
+          "the exact same arithmetic behavior as quantized training --- which "
+          "is the whole point of quantized training and of FakeQuant nodes in "
+          "the first place. "
+          "However, that entails subtle requirements on where exactly "
           "FakeQuant nodes must be placed in the graph. Some quantized graphs "
           "have FakeQuant nodes at unexpected locations, that prevent graph "
           "transformations that are necessary in order to generate inference "
           "code for these graphs. Such graphs should be fixed, but as a "
           "temporary work-around, setting this reorder_across_fake_quant flag "
-          "allows toco to perform necessary graph transformaitons on them, "
+          "allows TOCO to perform necessary graph transformaitons on them, "
           "at the cost of no longer faithfully matching inference and training "
           "arithmetic."),
       Flag("allow_custom_ops", parsed_flags.allow_custom_ops.bind(),
            parsed_flags.allow_custom_ops.default_value(),
-           "If true, allow TOCO to create TF Lite Custom operators for all the"
-           "unsupported Tensorflow ops."),
+           "If true, allow TOCO to create TF Lite Custom operators for all the "
+           "unsupported TensorFlow ops."),
       Flag(
           "drop_control_dependency",
           parsed_flags.drop_control_dependency.bind(),
diff --git a/tensorflow/contrib/lite/toco/toco_port.h b/tensorflow/contrib/lite/toco/toco_port.h
index 0572848cb5a998457cd669a2b0bce5fe8a0e15a2..4be3b5a0bf00ed204a1218545d9e66f7685a50d7 100644
--- a/tensorflow/contrib/lite/toco/toco_port.h
+++ b/tensorflow/contrib/lite/toco/toco_port.h
@@ -19,6 +19,7 @@ limitations under the License.
 // can build and use on google internal environments and on OSX.
 
 #include <string>
+#include "google/protobuf/text_format.h"
 #include "tensorflow/contrib/lite/toco/format_port.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/platform.h"
@@ -75,6 +76,26 @@ void CopyToBuffer(const ::Cord& src, char* dest);
 #endif  // PLATFORM_GOOGLE
 void CopyToBuffer(const string& src, char* dest);
 }  // namespace port
+
+inline bool ParseFromStringOverload(const std::string& in,
+                                    TFLITE_PROTO_NS::Message* proto) {
+  return TFLITE_PROTO_NS::TextFormat::ParseFromString(in, proto);
+}
+
+template <typename Proto>
+bool ParseFromStringEitherTextOrBinary(const std::string& input_file_contents,
+                                       Proto* proto) {
+  if (proto->ParseFromString(input_file_contents)) {
+    return true;
+  }
+
+  if (ParseFromStringOverload(input_file_contents, proto)) {
+    return true;
+  }
+
+  return false;
+}
+
 }  // namespace toco
 
 #endif  // TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_PORT_H_
diff --git a/tensorflow/contrib/lite/toco/toco_tooling.cc b/tensorflow/contrib/lite/toco/toco_tooling.cc
index f2753c84e92705e1d3d2482fb7c55cc6019d57f2..b715881774bf00f6cf2a50452a5b5c59c647ade6 100644
--- a/tensorflow/contrib/lite/toco/toco_tooling.cc
+++ b/tensorflow/contrib/lite/toco/toco_tooling.cc
@@ -52,6 +52,7 @@ void MakeGeneralGraphTransformationsSet(
     GraphTransformationsSet* transformations) {
   CHECK(transformations->empty());
   transformations->Add(new ConvertExpandDimsToReshape);
+  transformations->Add(new ConvertTrivialAddNToAdd);
   transformations->Add(new ConvertTrivialTransposeToReshape);
   transformations->Add(new ConvertReorderAxes);
   transformations->Add(new ResolveReshapeAttributes);
@@ -67,6 +68,7 @@ void MakeGeneralGraphTransformationsSet(
   transformations->Add(new ResolveTensorFlowMatMul);
   transformations->Add(new FuseBinaryIntoPrecedingAffine);
   transformations->Add(new FuseBinaryIntoFollowingAffine);
+  transformations->Add(new ReorderActivationFunctions);
   transformations->Add(new ResolveBatchNormalization);
   transformations->Add(new ResolveConstantBinaryOperator);
   transformations->Add(new ResolveConstantFill);
@@ -192,6 +194,7 @@ void Transform(const TocoFlags& toco_flags, Model* model) {
   }
 
   SetFinalDataTypeOnInputs(toco_flags, model);
+  UseArraysExtraInfo(model);
 
   // Remove unused ops before performing any other optimizations. This is to
   // stop optimizations from crossing the input/output boundaries. For example
@@ -231,6 +234,7 @@ void Transform(const TocoFlags& toco_flags, Model* model) {
   transformations.Add(new ResolveConstantConcatenation);
   RunGraphTransformations(model, "general graph transformations",
                           transformations);
+
   if (quantize_output) {
     RunGraphTransformations(model, "pre-quantization graph transformations",
                             {new HardcodeMinMax, new DropFakeQuant});
diff --git a/tensorflow/contrib/lite/toco/tooling_util.cc b/tensorflow/contrib/lite/toco/tooling_util.cc
index 69187bb14282d4d7d140994f320329766bce7a82..d0bf66c542bfde849018d6302e900eb45c1cfdd1 100644
--- a/tensorflow/contrib/lite/toco/tooling_util.cc
+++ b/tensorflow/contrib/lite/toco/tooling_util.cc
@@ -33,6 +33,24 @@ limitations under the License.
 
 namespace toco {
 
+// Find the longest common prefix of two strings.
+absl::string_view FindLongestCommonPrefix(absl::string_view a,
+                                          absl::string_view b) {
+  if (a.empty() || b.empty()) return absl::string_view();
+
+  const char* pa = a.data();
+  const char* pb = b.data();
+  size_t count = 0;
+  const size_t limit = std::min(a.size(), b.size());
+  while (count < limit && *pa == *pb) {
+    ++pa;
+    ++pb;
+    ++count;
+  }
+
+  return absl::string_view(a.data(), count);
+}
+
 string LogName(const Operator& op) {
   const string& opname = HelpfulOperatorTypeName(op);
   if (op.outputs.empty()) {
@@ -197,6 +215,7 @@ const char* OperatorTypeName(OperatorType type) {
   case OperatorType::k##c:              \
     return #c;
     HANDLE_OPERATORTYPENAME_CASE(Add)
+    HANDLE_OPERATORTYPENAME_CASE(AddN)
     HANDLE_OPERATORTYPENAME_CASE(AveragePool)
     HANDLE_OPERATORTYPENAME_CASE(BatchNormalization)
     HANDLE_OPERATORTYPENAME_CASE(Conv)
@@ -285,6 +304,19 @@ string HelpfulOperatorTypeName(const Operator& op) {
   return OperatorTypeName(op.type);
 }
 
+bool OperatorSupportsFusedActivation(OperatorType type) {
+  switch (type) {
+    case OperatorType::kConcatenation:
+    case OperatorType::kSlice:
+    case OperatorType::kSqueeze:
+    case OperatorType::kTensorFlowReshape:
+    case OperatorType::kTensorFlowSplit:
+      return false;
+    default:
+      return true;
+  }
+}
+
 void LogSummary(int log_level, const Model& model) {
   VLOG(log_level) << "Operators summary (" << model.operators.size()
                   << " operators):";
@@ -960,7 +992,9 @@ void CheckModelCounts(const Model& model) {
 void MakeArrayDims(int num_dims, int batch, int height, int width, int depth,
                    std::vector<int>* out_dims) {
   CHECK(out_dims->empty());
-  if (num_dims == 1) {
+  if (num_dims == 0) {
+    return;
+  } else if (num_dims == 1) {
     CHECK_EQ(batch, 1);
     *out_dims = {depth};
   } else if (num_dims == 2) {
@@ -992,13 +1026,13 @@ void CreateOrCheckRnnStateArray(const string& name, int size, Model* model) {
   if (array.has_shape()) {
     num_dims = array.shape().dimensions_count();
   }
-  std::vector<int> dims;
-  MakeArrayDims(num_dims, batch, 1, 1, size, &dims);
   CHECK(array.data_type == ArrayDataType::kFloat ||
         array.data_type == ArrayDataType::kNone);
   array.data_type = ArrayDataType::kFloat;
-  if (!array.has_shape()) {
+  if (!array.has_shape() && num_dims >= 0) {
     Shape* shape = array.mutable_shape();
+    std::vector<int> dims;
+    MakeArrayDims(num_dims, batch, 1, 1, size, &dims);
     *shape->mutable_dims() = dims;
   }
 }
@@ -1187,9 +1221,6 @@ void ResolveModelFlags(const ModelFlags& model_flags, Model* model) {
   }
   // Creation of the RNN state arrays
   for (const auto& rnn_state : model->flags.rnn_states()) {
-    if (!rnn_state.manually_create()) {
-      continue;
-    }
     CreateOrCheckRnnStateArray(rnn_state.state_array(), rnn_state.size(),
                                model);
   }
@@ -1203,6 +1234,9 @@ void ResolveModelFlags(const ModelFlags& model_flags, Model* model) {
   model->flags.set_allow_nonascii_arrays(model_flags.allow_nonascii_arrays());
   model->flags.set_allow_nonexistent_arrays(
       model_flags.allow_nonexistent_arrays());
+
+  CHECK(!model->flags.has_arrays_extra_info());
+  *model->flags.mutable_arrays_extra_info() = model_flags.arrays_extra_info();
 }
 
 void CheckIsReadyForQuantization(const Model& model) {
@@ -1314,13 +1348,14 @@ bool IsAllocatableTransientArray(const Model& model, const string& array_name) {
 }
 
 string AvailableArrayName(const Model& model, const string& name) {
-  if (!model.HasArray(name) && !model.optional_arrays.count(name)) {
+  if (!model.HasArray(name) && !model.IsOptionalArray(name)) {
     return name;
   }
   const int kNumSuffixesToTry = 1000;
   for (int i = 0; i < kNumSuffixesToTry; i++) {
     const string& name_with_suffix = toco::port::StringF("%s_%d", name, i);
-    if (!model.HasArray(name_with_suffix)) {
+    if (!model.HasArray(name_with_suffix) &&
+        !model.IsOptionalArray(name_with_suffix)) {
       return name_with_suffix;
     }
   }
@@ -1399,6 +1434,16 @@ bool EstimateArithmeticOpsCount(const Model& model, int64* result) {
         total += RequiredBufferSizeForShape(output_array.shape());
         break;
       }
+      case OperatorType::kAddN: {
+        const auto& output_array = model.GetArray(op->outputs[0]);
+        if (!output_array.has_shape()) {
+          return false;
+        }
+        // AddN cost is roughly the same cost as N-1 Adds.
+        const int num_adds = op->inputs.size() - 1;
+        total += num_adds * RequiredBufferSizeForShape(output_array.shape());
+        break;
+      }
       case OperatorType::kLogistic:
       case OperatorType::kSoftmax:
       case OperatorType::kTanh: {
@@ -1704,4 +1749,15 @@ ArrayDataType ConvertIODataTypeToArrayDataType(IODataType type) {
   }
 }
 
+void UseArraysExtraInfo(Model* model) {
+  for (const auto& entry : model->flags.arrays_extra_info().entries()) {
+    QCHECK(model->HasArray(entry.name()))
+        << "ArraysExtraInfo refers to non-existent array name: "
+        << entry.name();
+    auto& minmax = model->GetArray(entry.name()).GetOrCreateMinMax();
+    minmax.min = entry.min();
+    minmax.max = entry.max();
+  }
+}
+
 }  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/tooling_util.h b/tensorflow/contrib/lite/toco/tooling_util.h
index 5986d6364939e0f01b057ce3fb653b19fe8040cd..a7e77a02eb1f78bc2d4841d1bccdc9eac3e31a95 100644
--- a/tensorflow/contrib/lite/toco/tooling_util.h
+++ b/tensorflow/contrib/lite/toco/tooling_util.h
@@ -23,7 +23,7 @@ limitations under the License.
 #include <string>
 #include <vector>
 
-#include "google/protobuf/text_format.h"
+#include "absl/strings/string_view.h"
 #include "tensorflow/core/platform/logging.h"
 #if TOCO_SUPPORT_PORTABLE_PROTOS
 #include "third_party/protobuf/src/google/protobuf/text_format.h"
@@ -50,6 +50,8 @@ namespace toco {
 constexpr int kLogLevelModelChanged = 1;
 constexpr int kLogLevelModelUnchanged = 2;
 
+absl::string_view FindLongestCommonPrefix(absl::string_view a,
+                                          absl::string_view b);
 string LogName(const Operator& op);
 
 bool IsInputArray(const Model& model, const string& name);
@@ -80,29 +82,12 @@ std::vector<std::unique_ptr<Operator>>::iterator FindOp(Model& model,
 const char* OperatorTypeName(OperatorType type);
 string HelpfulOperatorTypeName(const Operator& op);
 
+bool OperatorSupportsFusedActivation(OperatorType type);
+
 void DumpGraphvizVideoFrame(const Model& model);
 void LogDump(int log_level, const string& message, const Model& model);
 void LogSummary(int log_level, const string& message, const Model& model);
 
-inline bool ParseFromStringOverload(const std::string& in,
-                                    TFLITE_PROTO_NS::Message* proto) {
-  return TFLITE_PROTO_NS::TextFormat::ParseFromString(in, proto);
-}
-
-template <typename Proto>
-bool ParseFromStringEitherTextOrBinary(const std::string& input_file_contents,
-                                       Proto* proto) {
-  if (proto->ParseFromString(input_file_contents)) {
-    return true;
-  }
-
-  if (ParseFromStringOverload(input_file_contents, proto)) {
-    return true;
-  }
-
-  return false;
-}
-
 // TODO(b/36075966): Clean up when dims superseded by array shape.
 void ExtendShape(Shape* shape, int new_shape_size);
 
@@ -298,6 +283,8 @@ void CheckFinalDataTypesSatisfied(const Model& model);
 
 ArrayDataType ConvertIODataTypeToArrayDataType(IODataType type);
 
+void UseArraysExtraInfo(Model* model);
+
 }  // namespace toco
 
 #endif  // TENSORFLOW_CONTRIB_LITE_TOCO_TOOLING_UTIL_H_
diff --git a/tensorflow/contrib/lite/tools/BUILD b/tensorflow/contrib/lite/tools/BUILD
index 20df905270b0692e2bc9b78fc020447108282d01..4d3b553b22788240805a1792de20f3b4d492749b 100644
--- a/tensorflow/contrib/lite/tools/BUILD
+++ b/tensorflow/contrib/lite/tools/BUILD
@@ -93,3 +93,32 @@ filegroup(
     ),
     visibility = ["//tensorflow:__subpackages__"],
 )
+
+cc_library(
+    name = "verifier",
+    srcs = ["verifier.cc"],
+    hdrs = ["verifier.h"],
+    deps = [
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite:schema_fbs_version",
+        "//tensorflow/contrib/lite:string_util",
+        "//tensorflow/contrib/lite/schema:schema_fbs",
+        "@com_google_absl//absl/base:core_headers",
+    ],
+)
+
+cc_test(
+    name = "verifier_test",
+    size = "small",
+    srcs = ["verifier_test.cc"],
+    deps = [
+        ":verifier",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite:schema_fbs_version",
+        "//tensorflow/contrib/lite:string_util",
+        "//tensorflow/contrib/lite/schema:schema_fbs",
+        "//tensorflow/contrib/lite/testing:util",
+        "@com_google_googletest//:gtest",
+        "@flatbuffers",
+    ],
+)
diff --git a/tensorflow/contrib/lite/tools/verifier.cc b/tensorflow/contrib/lite/tools/verifier.cc
new file mode 100644
index 0000000000000000000000000000000000000000..726e2aaa3162591593cd2abd6384eb55baf0aef4
--- /dev/null
+++ b/tensorflow/contrib/lite/tools/verifier.cc
@@ -0,0 +1,207 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/lite/tools/verifier.h"
+#include <climits>
+#include "tensorflow/contrib/lite/schema/schema_generated.h"
+#include "tensorflow/contrib/lite/string_util.h"
+#include "tensorflow/contrib/lite/version.h"
+
+namespace tflite {
+
+namespace {
+
+// Reports error message when the reporter is set.
+void ReportError(ErrorReporter* error_reporter, const char* format, ...) {
+  if (error_reporter) {
+    va_list args;
+    va_start(args, format);
+    error_reporter->Report(format, args);
+    va_end(args);
+  }
+}
+
+// Returns the int32_t value pointed by ptr.
+const uint32_t* GetIntPtr(const char* ptr) {
+  return reinterpret_cast<const uint32_t*>(ptr);
+}
+
+// Verifies flatbuffer format of the model contents and returns the in-memory
+// model.
+const Model* VerifyFlatbufferAndGetModel(const void* buf, size_t len) {
+  ::flatbuffers::Verifier verifier(static_cast<const uint8_t*>(buf), len);
+  if (VerifyModelBuffer(verifier)) {
+    return ::tflite::GetModel(buf);
+  } else {
+    return nullptr;
+  }
+}
+
+const uint32_t kMaxNumString = UINT_MAX / sizeof(int32_t) - 2;
+
+// Verifies string tensor has legit buffer contents that follow the schema
+// defined in lite/string_util.h
+bool VerifyStringTensorBuffer(const Buffer& buffer,
+                              ErrorReporter* error_reporter) {
+  uint32_t buffer_size = buffer.data()->size();
+  const char* buffer_ptr = reinterpret_cast<const char*>(buffer.data()->data());
+
+  uint32_t num_strings = *GetIntPtr(buffer_ptr);
+  if (num_strings > kMaxNumString) {
+    ReportError(error_reporter,
+                "String tensor has invalid num of string set: %d", num_strings);
+    return false;
+  }
+  uint32_t header_offsets =
+      static_cast<uint32_t>(num_strings + 2) * sizeof(int32_t);
+
+  if (buffer_size < header_offsets) {
+    ReportError(error_reporter,
+                "String tensor buffer requires at least %d bytes, but is "
+                "allocated with %d bytes",
+                header_offsets, buffer_size);
+    return false;
+  }
+
+  uint32_t prev_ptr = header_offsets;
+  uint32_t offset = sizeof(int32_t);
+
+  if (*GetIntPtr(buffer_ptr + offset) != header_offsets) {
+    ReportError(error_reporter,
+                "String tensor buffer initial offset must be: %d",
+                header_offsets);
+    return false;
+  }
+  offset += sizeof(int32_t);
+  for (int i = 1; i <= num_strings; i++, offset += sizeof(int32_t)) {
+    int string_offset = *GetIntPtr(buffer_ptr + offset);
+    if (string_offset < prev_ptr || string_offset > buffer_size) {
+      ReportError(error_reporter, "String tensor buffer is invalid: index %d",
+                  i);
+      return false;
+    }
+  }
+  if (*GetIntPtr(buffer_ptr + offset - sizeof(int32_t)) != buffer_size) {
+    ReportError(error_reporter, "String tensor buffer last offset must be %d",
+                buffer_size);
+    return false;
+  }
+  return true;
+}
+
+// Verifies numeric tensor has legit buffer.
+bool VerifyNumericTensorBuffer(const Tensor& tensor, const Buffer& buffer,
+                               ErrorReporter* error_reporter) {
+  uint64_t bytes_required = 1;
+  for (int dim : *tensor.shape()) {
+    bytes_required *= dim;
+    if (bytes_required > UINT_MAX) {
+      ReportError(error_reporter, "Tensor dimension overflow");
+      return false;
+    }
+  }
+  switch (tensor.type()) {
+    case TensorType_FLOAT32:
+      bytes_required *= sizeof(float);
+      break;
+    case TensorType_INT32:
+      bytes_required *= sizeof(int32_t);
+      break;
+    case TensorType_UINT8:
+      bytes_required *= sizeof(uint8_t);
+      break;
+    case TensorType_INT64:
+      bytes_required *= sizeof(int64_t);
+      break;
+    case TensorType_FLOAT16:
+      // FALLTHROUGH_INTENDED;
+    default:
+      ReportError(error_reporter, "Invalid tensor type: %d", tensor.type());
+      return false;
+  }
+  if (bytes_required > UINT_MAX) {
+    ReportError(error_reporter, "Tensor dimension overflow");
+    return false;
+  }
+
+  if (bytes_required != buffer.data()->size()) {
+    ReportError(
+        error_reporter,
+        "Tensor requires %d bytes, but is allocated with %d bytes buffer",
+        bytes_required, buffer.data()->size());
+    return false;
+  }
+  return true;
+
+  // TODO(yichengfan): verify quantized tensors.
+}
+
+// Verifies tensors have valid properties and legit buffer if set.
+bool VerifyTensors(const Model& model, ErrorReporter* error_reporter) {
+  if (!model.subgraphs()) {
+    return true;
+  }
+  for (const auto& subgraph : *model.subgraphs()) {
+    if (!subgraph->tensors()) {
+      return true;
+    }
+    for (const auto& tensor : *subgraph->tensors()) {
+      if (!tensor->buffer()) {
+        return true;
+      }
+      if (tensor->buffer() >= model.buffers()->size()) {
+        ReportError(error_reporter, "Invalid tensor buffer index: %d",
+                    tensor->buffer());
+        return false;
+      }
+      auto* buffer = model.buffers()->Get(tensor->buffer());
+      if (!buffer || !buffer->data()) {
+        ReportError(error_reporter, "Tensor buffer %d not set",
+                    tensor->buffer());
+        return false;
+      }
+
+      if (tensor->type() == TensorType_STRING) {
+        if (!VerifyStringTensorBuffer(*buffer, error_reporter)) {
+          return false;
+        }
+      } else {
+        if (!VerifyNumericTensorBuffer(*tensor, *buffer, error_reporter)) {
+          return false;
+        }
+      }
+    }
+  }
+  return true;
+}
+
+}  // namespace
+
+bool Verify(const void* buf, size_t len, ErrorReporter* error_reporter) {
+  const Model* model = VerifyFlatbufferAndGetModel(buf, len);
+  if (model == nullptr) {
+    ReportError(error_reporter, "Invalid flatbuffer format");
+    return false;
+  }
+  if (model->version() != TFLITE_SCHEMA_VERSION) {
+    ReportError(error_reporter, "Invalid model version %d", model->version());
+    return false;
+  }
+  if (!VerifyTensors(*model, error_reporter)) {
+    return false;
+  }
+  return true;
+}
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/tools/verifier.h b/tensorflow/contrib/lite/tools/verifier.h
new file mode 100644
index 0000000000000000000000000000000000000000..d2bf3c91d54225098c1f254c26971e8bb962f791
--- /dev/null
+++ b/tensorflow/contrib/lite/tools/verifier.h
@@ -0,0 +1,33 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_LITE_TOOLS_VERIFIER_H_
+#define TENSORFLOW_CONTRIB_LITE_TOOLS_VERIFIER_H_
+
+#include <stdio.h>
+
+#include "tensorflow/contrib/lite/error_reporter.h"
+
+namespace tflite {
+
+// Verifies the integrity of a Tensorflow Lite flatbuffer model file.
+// Currently, it verifies:
+// * The file is following a legit flatbuffer schema.
+// * The model is in supported version.
+bool Verify(const void* buf, size_t len, ErrorReporter* error_reporter);
+
+}  // namespace tflite
+
+#endif  // TENSORFLOW_CONTRIB_LITE_TOOLS_VERIFIER_H_
diff --git a/tensorflow/contrib/lite/tools/verifier_test.cc b/tensorflow/contrib/lite/tools/verifier_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..244d4f0396bb6733905606c486b0b93457b887a9
--- /dev/null
+++ b/tensorflow/contrib/lite/tools/verifier_test.cc
@@ -0,0 +1,236 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/tools/verifier.h"
+#include "flatbuffers/flatbuffers.h"
+#include "flatbuffers/util.h"
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/allocation.h"
+#include "tensorflow/contrib/lite/error_reporter.h"
+#include "tensorflow/contrib/lite/schema/schema_generated.h"
+#include "tensorflow/contrib/lite/testing/util.h"
+#include "tensorflow/contrib/lite/version.h"
+
+namespace tflite {
+
+using flatbuffers::FlatBufferBuilder;
+using flatbuffers::Offset;
+using flatbuffers::Vector;
+
+// Build single subgraph model.
+class TfLiteFlatbufferModelBuilder {
+ public:
+  TfLiteFlatbufferModelBuilder() {
+    buffers_.push_back(
+        CreateBuffer(builder_, builder_.CreateVector(std::vector<uint8_t>{})));
+  }
+
+  void AddTensor(const std::vector<int>& shape, tflite::TensorType type,
+                 const std::vector<uint8_t>& buffer, const char* name) {
+    int buffer_index = 0;
+    if (!buffer.empty()) {
+      buffer_index = buffers_.size();
+      buffers_.push_back(CreateBuffer(builder_, builder_.CreateVector(buffer)));
+    }
+    tensors_.push_back(CreateTensorDirect(builder_, &shape, type, buffer_index,
+                                          name, /*quantization=*/0));
+  }
+
+  void AddOperator(const std::vector<int32_t>& inputs,
+                   const std::vector<int32_t>& outputs,
+                   tflite::BuiltinOperator builtin_op, const char* custom_op) {
+    operator_codes_.push_back(
+        CreateOperatorCodeDirect(builder_, builtin_op, custom_op));
+    operators_.push_back(CreateOperator(
+        builder_, operator_codes_.size() - 1, builder_.CreateVector(inputs),
+        builder_.CreateVector(outputs), BuiltinOptions_NONE,
+        /*builtin_options=*/0,
+        /*custom_options=*/0, tflite::CustomOptionsFormat_FLEXBUFFERS));
+  }
+
+  void FinishModel(const std::vector<int32_t>& inputs,
+                   const std::vector<int32_t>& outputs) {
+    auto subgraph = std::vector<Offset<SubGraph>>({CreateSubGraph(
+        builder_, builder_.CreateVector(tensors_),
+        builder_.CreateVector(inputs), builder_.CreateVector(outputs),
+        builder_.CreateVector(operators_),
+        builder_.CreateString("test_subgraph"))});
+    auto result = CreateModel(
+        builder_, TFLITE_SCHEMA_VERSION, builder_.CreateVector(operator_codes_),
+        builder_.CreateVector(subgraph), builder_.CreateString("test_model"),
+        builder_.CreateVector(buffers_));
+    tflite::FinishModelBuffer(builder_, result);
+  }
+
+  bool Verify() {
+    return tflite::Verify(builder_.GetBufferPointer(), builder_.GetSize(),
+                          DefaultErrorReporter());
+  }
+
+ private:
+  FlatBufferBuilder builder_;
+  std::vector<Offset<Operator>> operators_;
+  std::vector<Offset<OperatorCode>> operator_codes_;
+  std::vector<Offset<Tensor>> tensors_;
+  std::vector<Offset<Buffer>> buffers_;
+};
+
+TEST(VerifyModel, TestEmptyModel) {
+  FlatBufferBuilder builder;
+  auto model = CreateModel(builder, /*version=*/TFLITE_SCHEMA_VERSION,
+                           /*operator_codes=*/0, /*subgraphs=*/0,
+                           /*description=*/0, /*buffers=*/0);
+  ::tflite::FinishModelBuffer(builder, model);
+
+  ASSERT_TRUE(Verify(builder.GetBufferPointer(), builder.GetSize(),
+                     DefaultErrorReporter()));
+}
+
+TEST(VerifyModel, TestSimpleModel) {
+  TfLiteFlatbufferModelBuilder builder;
+  builder.AddOperator({0, 1}, {2}, BuiltinOperator_CUSTOM, "test");
+  builder.AddTensor({2, 3}, TensorType_UINT8, {1, 2, 3, 4, 5, 6}, "input");
+  builder.AddTensor(
+      {2}, TensorType_STRING,
+      {2, 0, 0, 0, 16, 0, 0, 0, 17, 0, 0, 0, 19, 0, 0, 0, 'A', 'B', 'C'},
+      "data");
+  builder.AddTensor({2, 3}, TensorType_INT32, {}, "output");
+  builder.FinishModel({0, 1}, {2});
+  ASSERT_TRUE(builder.Verify());
+}
+
+TEST(VerifyModel, TestCorruptedData) {
+  string model = "123";
+  ASSERT_FALSE(Verify(model.data(), model.size(), /*error_reporter=*/nullptr));
+}
+
+TEST(VerifyModel, TestUnsupportedVersion) {
+  FlatBufferBuilder builder;
+  auto model = CreateModel(builder, /*version=*/1, /*operator_codes=*/0,
+                           /*subgraphs=*/0, /*description=*/0, /*buffers=*/0);
+  ::tflite::FinishModelBuffer(builder, model);
+  ASSERT_FALSE(Verify(builder.GetBufferPointer(), builder.GetSize(),
+                      DefaultErrorReporter()));
+}
+
+TEST(VerifyModel, TestRandomModificationIsNotAllowed) {
+  FlatBufferBuilder builder;
+  auto model = CreateModel(builder, /*version=*/TFLITE_SCHEMA_VERSION,
+                           /*operator_codes=*/0,
+                           /*subgraphs=*/0, /*description=*/0, /*buffers=*/0);
+  ::tflite::FinishModelBuffer(builder, model);
+
+  string model_content(reinterpret_cast<char*>(builder.GetBufferPointer()),
+                       builder.GetSize());
+  for (int i = 0; i < model_content.size(); i++) {
+    model_content[i] = (model_content[i] + 137) % 255;
+    EXPECT_FALSE(Verify(model_content.data(), model_content.size(),
+                        DefaultErrorReporter()))
+        << "Fail at position: " << i;
+  }
+}
+
+TEST(VerifyModel, TestIntTensorShapeIsGreaterThanBuffer) {
+  TfLiteFlatbufferModelBuilder builder;
+  builder.AddTensor({2, 3}, TensorType_UINT8, {1, 2, 3, 4}, "input");
+  builder.FinishModel({}, {});
+  ASSERT_FALSE(builder.Verify());
+}
+
+TEST(VerifyModel, TestIntTensorShapeIsSmallerThanBuffer) {
+  TfLiteFlatbufferModelBuilder builder;
+  builder.AddTensor({2, 1}, TensorType_UINT8, {1, 2, 3, 4}, "input");
+  builder.FinishModel({}, {});
+  ASSERT_FALSE(builder.Verify());
+}
+
+TEST(VerifyModel, TestIntTensorShapeOverflow) {
+  TfLiteFlatbufferModelBuilder builder;
+  builder.AddTensor({1024, 2048, 4096}, TensorType_UINT8, {1, 2, 3, 4},
+                    "input");
+  builder.FinishModel({}, {});
+  ASSERT_FALSE(builder.Verify());
+}
+
+TEST(VerifyModel, TensorBufferIsNotValid) {
+  FlatBufferBuilder builder;
+  std::vector<int> shape = {2, 3};
+  auto tensors = builder.CreateVector(std::vector<Offset<Tensor>>{
+      CreateTensorDirect(builder, &shape, TensorType_INT32, /*buffer=*/2,
+                         "input", /*quantization=*/0)});
+  auto subgraph = std::vector<Offset<SubGraph>>(
+      {CreateSubGraph(builder, tensors, /*inputs=*/0, /*outputs=*/0,
+                      /*operators=*/0, builder.CreateString("Main"))});
+
+  auto buffers = builder.CreateVector(std::vector<Offset<Buffer>>{
+      CreateBuffer(builder,
+                   builder.CreateVector(std::vector<uint8>{1, 2, 3, 4, 5, 6})),
+  });
+
+  auto model = CreateModel(builder, TFLITE_SCHEMA_VERSION, /*operator_codes=*/0,
+                           builder.CreateVector(subgraph),
+                           builder.CreateString("SmartReply"), buffers);
+
+  ::tflite::FinishModelBuffer(builder, model);
+  ASSERT_FALSE(Verify(builder.GetBufferPointer(), builder.GetSize(),
+                      DefaultErrorReporter()));
+}
+
+TEST(VerifyModel, StringTensorHasInvalidNumString) {
+  TfLiteFlatbufferModelBuilder builder;
+  builder.AddTensor(
+      {2}, TensorType_STRING,
+      {0x00, 0x00, 0x00, 0x20, 16, 0, 0, 0, 17, 0, 0, 0, 18, 0, 0, 0, 'A', 'B'},
+      "input");
+  builder.FinishModel({}, {});
+  ASSERT_FALSE(builder.Verify());
+}
+
+TEST(VerifyModel, StringTensorOffsetTooSmall) {
+  TfLiteFlatbufferModelBuilder builder;
+  builder.AddTensor(
+      {2}, TensorType_STRING,
+      {2, 0, 0, 0, 12, 0, 0, 0, 17, 0, 0, 0, 18, 0, 0, 0, 'A', 'B'}, "input");
+  builder.FinishModel({}, {});
+  ASSERT_FALSE(builder.Verify());
+}
+
+TEST(VerifyModel, StringTensorOffsetOutOfRange) {
+  TfLiteFlatbufferModelBuilder builder;
+  builder.AddTensor(
+      {2}, TensorType_STRING,
+      {2, 0, 0, 0, 16, 0, 0, 0, 17, 0, 0, 0, 22, 0, 0, 0, 'A', 'B'}, "input");
+  builder.FinishModel({}, {});
+  ASSERT_FALSE(builder.Verify());
+}
+
+TEST(VerifyModel, StringTensorIsLargerThanRequired) {
+  TfLiteFlatbufferModelBuilder builder;
+  builder.AddTensor(
+      {2}, TensorType_STRING,
+      {2, 0, 0, 0, 16, 0, 0, 0, 17, 0, 0, 0, 18, 0, 0, 0, 'A', 'B', 'C'},
+      "input");
+  builder.FinishModel({}, {});
+  ASSERT_FALSE(builder.Verify());
+}
+
+// TODO(yichengfan): make up malicious files to test with.
+
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/tools/visualize.py b/tensorflow/contrib/lite/tools/visualize.py
index d0d78e3afab7d89f216bb8ceb42e4429ca4f1759..f571dd59da0a3f4aff264b48fba3e41f75b50404 100644
--- a/tensorflow/contrib/lite/tools/visualize.py
+++ b/tensorflow/contrib/lite/tools/visualize.py
@@ -198,10 +198,13 @@ class TensorMapper(object):
 
 def GenerateGraph(subgraph_idx, g, opcode_mapper):
   """Produces the HTML required to have a d3 visualization of the dag."""
+
   def TensorName(idx):
-    return "t%d"%idx
+    return "t%d" % idx
+
   def OpName(idx):
-    return "o%d"%idx
+    return "o%d" % idx
+
   edges = []
   nodes = []
   first = {}
@@ -210,27 +213,35 @@ def GenerateGraph(subgraph_idx, g, opcode_mapper):
     for tensor_input_position, tensor_index in enumerate(op["inputs"]):
       if tensor_index not in first:
         first[tensor_index] = (
-            op_index*pixel_mult,
-            tensor_input_position*pixel_mult - pixel_mult/2)
-      edges.append(
-          {"source": TensorName(tensor_index), "target": OpName(op_index)})
+            op_index * pixel_mult,
+            tensor_input_position * pixel_mult - pixel_mult / 2)
+      edges.append({
+          "source": TensorName(tensor_index),
+          "target": OpName(op_index)
+      })
     for tensor_index in op["outputs"]:
-      edges.append(
-          {"target": TensorName(tensor_index), "source": OpName(op_index)})
-    nodes.append({"id": OpName(op_index),
-                  "name": opcode_mapper(op["opcode_index"]),
-                  "group": 2,
-                  "x": pixel_mult,
-                  "y": op_index * pixel_mult})
+      edges.append({
+          "target": TensorName(tensor_index),
+          "source": OpName(op_index)
+      })
+    nodes.append({
+        "id": OpName(op_index),
+        "name": opcode_mapper(op["opcode_index"]),
+        "group": 2,
+        "x": pixel_mult,
+        "y": op_index * pixel_mult
+    })
   for tensor_index, tensor in enumerate(g["tensors"]):
-    initial_y = (first[tensor_index] if tensor_index in first
-                 else len(g["operators"]))
-
-    nodes.append({"id": TensorName(tensor_index),
-                  "name": "%s (%d)" % (tensor["name"], tensor_index),
-                  "group": 1,
-                  "x": 2,
-                  "y": initial_y})
+    initial_y = (
+        first[tensor_index] if tensor_index in first else len(g["operators"]))
+
+    nodes.append({
+        "id": TensorName(tensor_index),
+        "name": "%s (%d)" % (tensor["name"], tensor_index),
+        "group": 1,
+        "x": 2,
+        "y": initial_y
+    })
   graph_str = json.dumps({"nodes": nodes, "edges": edges})
 
   html = _D3_HTML_TEMPLATE % (graph_str, subgraph_idx)
@@ -267,7 +278,7 @@ def GenerateTableHtml(items, keys_to_print, display_index=True):
     for h, mapper in keys_to_print:
       val = tensor[h] if h in tensor else None
       val = val if mapper is None else mapper(val)
-      html += "<td>%s</td>\n"%val
+      html += "<td>%s</td>\n" % val
 
     html += "</tr>\n"
   html += "</table>\n"
@@ -279,18 +290,19 @@ def CreateHtmlFile(tflite_input, html_output):
 
   # Convert the model into a JSON flatbuffer using flatc (build if doesn't
   # exist.
-  if  not os.path.exists(tflite_input):
+  if not os.path.exists(tflite_input):
     raise RuntimeError("Invalid filename %r" % tflite_input)
   if tflite_input.endswith(".tflite") or tflite_input.endswith(".bin"):
 
     # Run convert
-    cmd = (_BINARY + " -t "
-           "--strict-json --defaults-json -o /tmp {schema} -- {input}".format(
-               input=tflite_input, schema=_SCHEMA))
+    cmd = (
+        _BINARY + " -t "
+        "--strict-json --defaults-json -o /tmp {schema} -- {input}".format(
+            input=tflite_input, schema=_SCHEMA))
     print(cmd)
     os.system(cmd)
-    real_output = ("/tmp/"+ os.path.splitext(os.path.split(tflite_input)[-1])[0]
-                   + ".json")
+    real_output = ("/tmp/" + os.path.splitext(
+        os.path.split(tflite_input)[-1])[0] + ".json")
 
     data = json.load(open(real_output))
   elif tflite_input.endswith(".json"):
@@ -302,12 +314,13 @@ def CreateHtmlFile(tflite_input, html_output):
   html += "<h1>TensorFlow Lite Model</h2>"
 
   data["filename"] = tflite_input  # Avoid special case
-  toplevel_stuff = [("filename", None), ("version", None),
-                    ("description", None)]
+  toplevel_stuff = [("filename", None), ("version", None), ("description",
+                                                            None)]
 
   html += "<table>\n"
   for key, mapping in toplevel_stuff:
-    if not mapping: mapping = lambda x: x
+    if not mapping:
+      mapping = lambda x: x
     html += "<tr><th>%s</th><td>%s</td></tr>\n" % (key, mapping(data[key]))
   html += "</table>\n"
 
@@ -320,22 +333,22 @@ def CreateHtmlFile(tflite_input, html_output):
     html += "<div class='subgraph'>"
     tensor_mapper = TensorMapper(g)
     opcode_mapper = OpCodeMapper(data)
-    op_keys_to_display = [
-        ("inputs", tensor_mapper), ("outputs", tensor_mapper),
-        ("builtin_options", None), ("opcode_index", opcode_mapper)]
-    tensor_keys_to_display = [
-        ("name", None), ("type", None), ("shape", None), ("buffer", None),
-        ("quantization", None)]
+    op_keys_to_display = [("inputs", tensor_mapper), ("outputs", tensor_mapper),
+                          ("builtin_options", None), ("opcode_index",
+                                                      opcode_mapper)]
+    tensor_keys_to_display = [("name", None), ("type", None), ("shape", None),
+                              ("buffer", None), ("quantization", None)]
 
     html += "<h2>Subgraph %d</h2>\n" % subgraph_idx
 
     # Inputs and outputs.
     html += "<h3>Inputs/Outputs</h3>\n"
-    html += GenerateTableHtml([{"inputs": g["inputs"],
-                                "outputs": g["outputs"]}],
-                              [("inputs", tensor_mapper),
-                               ("outputs", tensor_mapper)],
-                              display_index=False)
+    html += GenerateTableHtml(
+        [{
+            "inputs": g["inputs"],
+            "outputs": g["outputs"]
+        }], [("inputs", tensor_mapper), ("outputs", tensor_mapper)],
+        display_index=False)
 
     # Print the tensors.
     html += "<h3>Tensors</h3>\n"
@@ -357,8 +370,7 @@ def CreateHtmlFile(tflite_input, html_output):
 
   # Operator codes
   html += "<h2>Operator Codes</h2>\n"
-  html += GenerateTableHtml(data["operator_codes"],
-                            operator_keys_to_display)
+  html += GenerateTableHtml(data["operator_codes"], operator_keys_to_display)
 
   html += "</body></html>\n"
 
@@ -370,10 +382,10 @@ def main(argv):
     tflite_input = argv[1]
     html_output = argv[2]
   except IndexError:
-    print ("Usage: %s <input tflite> <output html>" % (argv[0]))
+    print("Usage: %s <input tflite> <output html>" % (argv[0]))
   else:
     CreateHtmlFile(tflite_input, html_output)
 
+
 if __name__ == "__main__":
   main(sys.argv)
-
diff --git a/tensorflow/contrib/losses/python/losses/loss_ops.py b/tensorflow/contrib/losses/python/losses/loss_ops.py
index 7c523ad49265aaf32c8d5a8ae04d3e93262a1b55..8c3a8afe7a0f6f5ad9ceae566288ba60be73d339 100644
--- a/tensorflow/contrib/losses/python/losses/loss_ops.py
+++ b/tensorflow/contrib/losses/python/losses/loss_ops.py
@@ -30,20 +30,13 @@ from tensorflow.python.ops import nn_ops
 from tensorflow.python.util.deprecation import deprecated
 from tensorflow.python.util.deprecation import deprecated_args
 
-__all__ = ["absolute_difference",
-           "add_loss",
-           "cosine_distance",
-           "compute_weighted_loss",
-           "get_losses",
-           "get_regularization_losses",
-           "get_total_loss",
-           "hinge_loss",
-           "log_loss",
-           "mean_pairwise_squared_error",
-           "mean_squared_error",
-           "sigmoid_cross_entropy",
-           "softmax_cross_entropy",
-           "sparse_softmax_cross_entropy"]
+__all__ = [
+    "absolute_difference", "add_loss", "cosine_distance",
+    "compute_weighted_loss", "get_losses", "get_regularization_losses",
+    "get_total_loss", "hinge_loss", "log_loss", "mean_pairwise_squared_error",
+    "mean_squared_error", "sigmoid_cross_entropy", "softmax_cross_entropy",
+    "sparse_softmax_cross_entropy"
+]
 
 
 def _scale_losses(losses, weights):
@@ -66,8 +59,8 @@ def _scale_losses(losses, weights):
   # First, compute the sum of the losses over all elements:
   start_index = max(0, weights.get_shape().ndims)
   reduction_indices = list(range(start_index, losses.get_shape().ndims))
-  reduced_losses = math_ops.reduce_sum(losses,
-                                       reduction_indices=reduction_indices)
+  reduced_losses = math_ops.reduce_sum(
+      losses, reduction_indices=reduction_indices)
   reduced_losses = math_ops.multiply(reduced_losses, weights)
   return math_ops.reduce_sum(reduced_losses)
 
@@ -90,9 +83,10 @@ def _safe_div(numerator, denominator, name="value"):
   """
   return array_ops.where(
       math_ops.greater(denominator, 0),
-      math_ops.div(numerator, array_ops.where(
-          math_ops.equal(denominator, 0),
-          array_ops.ones_like(denominator), denominator)),
+      math_ops.div(numerator,
+                   array_ops.where(
+                       math_ops.equal(denominator, 0),
+                       array_ops.ones_like(denominator), denominator)),
       array_ops.zeros_like(numerator),
       name=name)
 
@@ -176,14 +170,15 @@ def _num_present(losses, weights, per_batch=False):
   """
   # If weights is a scalar, its easy to compute:
   if weights.get_shape().ndims == 0:
-    batch_size = array_ops.reshape(array_ops.slice(array_ops.shape(losses),
-                                                   [0], [1]), [])
-    num_per_batch = math_ops.div(math_ops.to_float(array_ops.size(losses)),
-                                 math_ops.to_float(batch_size))
-    num_per_batch = array_ops.where(math_ops.equal(weights, 0),
-                                    0.0, num_per_batch)
-    num_per_batch = math_ops.multiply(array_ops.ones(
-        array_ops.reshape(batch_size, [1])), num_per_batch)
+    batch_size = array_ops.reshape(
+        array_ops.slice(array_ops.shape(losses), [0], [1]), [])
+    num_per_batch = math_ops.div(
+        math_ops.to_float(array_ops.size(losses)),
+        math_ops.to_float(batch_size))
+    num_per_batch = array_ops.where(
+        math_ops.equal(weights, 0), 0.0, num_per_batch)
+    num_per_batch = math_ops.multiply(
+        array_ops.ones(array_ops.reshape(batch_size, [1])), num_per_batch)
     return num_per_batch if per_batch else math_ops.reduce_sum(num_per_batch)
 
   # First, count the number of nonzero weights:
@@ -194,8 +189,8 @@ def _num_present(losses, weights, per_batch=False):
         reduction_indices=reduction_indices)
 
   # Next, determine the number of elements that weights would broadcast to:
-  broadcast_dims = array_ops.slice(array_ops.shape(losses),
-                                   [weights.get_shape().ndims], [-1])
+  broadcast_dims = array_ops.slice(
+      array_ops.shape(losses), [weights.get_shape().ndims], [-1])
   num_to_broadcast = math_ops.to_float(math_ops.reduce_prod(broadcast_dims))
 
   num_per_batch = math_ops.multiply(num_nonzero_per_batch, num_to_broadcast)
@@ -303,8 +298,11 @@ def absolute_difference(predictions, labels=None, weights=1.0, scope=None):
 @deprecated("2016-12-30",
             "Use tf.losses.sigmoid_cross_entropy instead. Note that the order "
             "of the predictions and labels arguments has been changed.")
-def sigmoid_cross_entropy(
-    logits, multi_class_labels, weights=1.0, label_smoothing=0, scope=None):
+def sigmoid_cross_entropy(logits,
+                          multi_class_labels,
+                          weights=1.0,
+                          label_smoothing=0,
+                          scope=None):
   """Creates a cross-entropy loss using tf.nn.sigmoid_cross_entropy_with_logits.
 
   `weights` acts as a coefficient for the loss. If a scalar is provided,
@@ -340,20 +338,22 @@ def sigmoid_cross_entropy(
     multi_class_labels = math_ops.cast(multi_class_labels, logits.dtype)
 
     if label_smoothing > 0:
-      multi_class_labels = (multi_class_labels * (1 - label_smoothing) +
-                            0.5 * label_smoothing)
+      multi_class_labels = (
+          multi_class_labels * (1 - label_smoothing) + 0.5 * label_smoothing)
 
-    losses = nn.sigmoid_cross_entropy_with_logits(labels=multi_class_labels,
-                                                  logits=logits,
-                                                  name="xentropy")
+    losses = nn.sigmoid_cross_entropy_with_logits(
+        labels=multi_class_labels, logits=logits, name="xentropy")
     return compute_weighted_loss(losses, weights, scope=scope)
 
 
 @deprecated("2016-12-30",
             "Use tf.losses.softmax_cross_entropy instead. Note that the order "
             "of the logits and labels arguments has been changed.")
-def softmax_cross_entropy(
-    logits, onehot_labels, weights=1.0, label_smoothing=0, scope=None):
+def softmax_cross_entropy(logits,
+                          onehot_labels,
+                          weights=1.0,
+                          label_smoothing=0,
+                          scope=None):
   """Creates a cross-entropy loss using tf.nn.softmax_cross_entropy_with_logits.
 
   `weights` acts as a coefficient for the loss. If a scalar is provided,
@@ -393,9 +393,8 @@ def softmax_cross_entropy(
       smooth_negatives = label_smoothing / num_classes
       onehot_labels = onehot_labels * smooth_positives + smooth_negatives
 
-    losses = nn.softmax_cross_entropy_with_logits(labels=onehot_labels,
-                                                  logits=logits,
-                                                  name="xentropy")
+    losses = nn.softmax_cross_entropy_with_logits(
+        labels=onehot_labels, logits=logits, name="xentropy")
     return compute_weighted_loss(losses, weights, scope=scope)
 
 
@@ -429,9 +428,8 @@ def sparse_softmax_cross_entropy(logits, labels, weights=1.0, scope=None):
                       [logits, labels, weights]) as scope:
     labels = array_ops.reshape(labels, shape=[array_ops.shape(labels)[0]])
 
-    losses = nn.sparse_softmax_cross_entropy_with_logits(labels=labels,
-                                                         logits=logits,
-                                                         name="xentropy")
+    losses = nn.sparse_softmax_cross_entropy_with_logits(
+        labels=labels, logits=logits, name="xentropy")
     return compute_weighted_loss(losses, weights, scope=scope)
 
 
@@ -470,8 +468,7 @@ def log_loss(predictions, labels=None, weights=1.0, epsilon=1e-7, scope=None):
     predictions = math_ops.to_float(predictions)
     labels = math_ops.to_float(labels)
     losses = -math_ops.multiply(
-        labels,
-        math_ops.log(predictions + epsilon)) - math_ops.multiply(
+        labels, math_ops.log(predictions + epsilon)) - math_ops.multiply(
             (1 - labels), math_ops.log(1 - predictions + epsilon))
     return compute_weighted_loss(losses, weights, scope=scope)
 
@@ -490,7 +487,8 @@ def hinge_loss(logits, labels=None, scope=None):
     scope: The scope for the operations performed in computing the loss.
 
   Returns:
-    An unweighted `Tensor` of same shape as `logits` and `labels` representing the
+    An unweighted `Tensor` of same shape as `logits` and `labels` representing
+    the
       loss values across the batch.
 
   Raises:
@@ -544,8 +542,10 @@ def mean_squared_error(predictions, labels=None, weights=1.0, scope=None):
 @deprecated("2016-12-30",
             "Use tf.losses.mean_pairwise_squared_error instead. Note that the "
             "order of the predictions and labels arguments has been changed.")
-def mean_pairwise_squared_error(
-    predictions, labels=None, weights=1.0, scope=None):
+def mean_pairwise_squared_error(predictions,
+                                labels=None,
+                                weights=1.0,
+                                scope=None):
   """Adds a pairwise-errors-squared loss to the training procedure.
 
   Unlike `mean_squared_error`, which is a measure of the differences between
@@ -602,31 +602,34 @@ def mean_pairwise_squared_error(
     reduction_indices = list(range(1, diffs.get_shape().ndims))
 
     sum_squares_diff_per_batch = math_ops.reduce_sum(
-        math_ops.square(diffs),
-        reduction_indices=reduction_indices)
+        math_ops.square(diffs), reduction_indices=reduction_indices)
     num_present_per_batch = _num_present(diffs, weights, per_batch=True)
 
-    term1 = 2.0 * _safe_div(sum_squares_diff_per_batch,
-                            num_present_per_batch)
+    term1 = 2.0 * _safe_div(sum_squares_diff_per_batch, num_present_per_batch)
 
     sum_diff = math_ops.reduce_sum(diffs, reduction_indices=reduction_indices)
-    term2 = 2.0 * _safe_div(math_ops.square(sum_diff),
-                            math_ops.square(num_present_per_batch))
+    term2 = 2.0 * _safe_div(
+        math_ops.square(sum_diff), math_ops.square(num_present_per_batch))
 
     loss = _scale_losses(term1 - term2, weights)
 
-    mean_loss = array_ops.where(math_ops.reduce_sum(num_present_per_batch) > 0,
-                                loss,
-                                array_ops.zeros_like(loss),
-                                name="value")
+    mean_loss = array_ops.where(
+        math_ops.reduce_sum(num_present_per_batch) > 0,
+        loss,
+        array_ops.zeros_like(loss),
+        name="value")
     add_loss(mean_loss)
     return mean_loss
 
 
 @deprecated("2016-12-30", "Use tf.losses.cosine_distance instead.")
 @deprecated_args(None, "dim is deprecated, use axis instead", "dim")
-def cosine_distance(
-    predictions, labels=None, axis=None, weights=1.0, scope=None, dim=None):
+def cosine_distance(predictions,
+                    labels=None,
+                    axis=None,
+                    weights=1.0,
+                    scope=None,
+                    dim=None):
   """Adds a cosine-distance loss to the training procedure.
 
   Note that the function assumes that `predictions` and `labels` are already
@@ -662,5 +665,8 @@ def cosine_distance(
     labels = math_ops.to_float(labels)
 
     radial_diffs = math_ops.multiply(predictions, labels)
-    losses = 1 - math_ops.reduce_sum(radial_diffs, reduction_indices=[axis,])
+    losses = 1 - math_ops.reduce_sum(
+        radial_diffs, reduction_indices=[
+            axis,
+        ])
     return compute_weighted_loss(losses, weights, scope=scope)
diff --git a/tensorflow/contrib/losses/python/losses/loss_ops_test.py b/tensorflow/contrib/losses/python/losses/loss_ops_test.py
index 9d0f95e6f3e7fa9666a99e31578b38d52e0b6b4a..1417772e0496cb571488e5b30bd4f3fb1b591730 100644
--- a/tensorflow/contrib/losses/python/losses/loss_ops_test.py
+++ b/tensorflow/contrib/losses/python/losses/loss_ops_test.py
@@ -27,6 +27,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
@@ -274,6 +275,7 @@ class SoftmaxCrossEntropyLossTest(test.TestCase):
       self.assertAlmostEqual(np.average(weights) * 10.0, loss, 3)
 
 
+@test_util.with_c_api
 class SparseSoftmaxCrossEntropyLossTest(test.TestCase):
 
   def testNoneWeightRaisesValueError(self):
@@ -471,7 +473,11 @@ class SparseSoftmaxCrossEntropyLossTest(test.TestCase):
       labels = constant_op.constant([[0, 1], [2, 3]])
       weights = constant_op.constant([1.2, 3.4, 5.6, 7.8])
 
-      with self.assertRaises(errors_impl.InvalidArgumentError):
+      if ops._USE_C_API:
+        error_type = ValueError
+      else:
+        error_type = errors_impl.InvalidArgumentError
+      with self.assertRaises(error_type):
         loss_ops.sparse_softmax_cross_entropy(
             logits, labels, weights=weights).eval()
 
diff --git a/tensorflow/contrib/makefile/Makefile b/tensorflow/contrib/makefile/Makefile
index dd5770dc996b3efab8647a5e3ee4a069593c679b..81327407d44b4317b7aecb964a689a35aa35c163 100644
--- a/tensorflow/contrib/makefile/Makefile
+++ b/tensorflow/contrib/makefile/Makefile
@@ -377,10 +377,10 @@ $(MARCH_OPTION) \
 
 	ifeq ($(BUILD_FOR_TEGRA),1)
 		NVCC := $(JETPACK)/cuda/bin/nvcc
-		NVCCFLAGS := -x=cu -D__CUDACC__ -DNVCC -DNVIDIA_TEGRA -ccbin $(NDK_ROOT)/toolchains/$(TOOLCHAIN)/prebuilt/$(ANDROID_HOST_OS_ARCH)/bin/$(BIN_PREFIX)-g++ --std c++11 --expt-relaxed-constexpr -m64 -gencode arch=compute_53,\"code=sm_53\" -gencode arch=compute_62,\"code=sm_62\" -DEIGEN_AVOID_STL_ARRAY -DTENSORFLOW_USE_EIGEN_THREADPOOL -DLANG_CXX11 -DEIGEN_HAS_C99_MATH -DGOOGLE_CUDA=1 -DTF_EXTRA_CUDA_CAPABILITIES=5.3
+		NVCCFLAGS := -x=cu -D__CUDACC__ -DNVCC -DANDROID_TEGRA -ccbin $(NDK_ROOT)/toolchains/$(TOOLCHAIN)/prebuilt/$(ANDROID_HOST_OS_ARCH)/bin/$(BIN_PREFIX)-g++ --std c++11 --expt-relaxed-constexpr -m64 -gencode arch=compute_53,\"code=sm_53\" -gencode arch=compute_62,\"code=sm_62\" -DEIGEN_AVOID_STL_ARRAY -DTENSORFLOW_USE_EIGEN_THREADPOOL -DLANG_CXX11 -DEIGEN_HAS_C99_MATH -DGOOGLE_CUDA=1 -DTF_EXTRA_CUDA_CAPABILITIES=5.3
 		CXXFLAGS4NVCC =\
 -DIS_SLIM_BUILD \
--DNVIDIA_TEGRA \
+-DANDROID_TEGRA \
 -fno-exceptions \
 -DNDEBUG $(OPTFLAGS) \
 -march=armv8-a \
@@ -391,7 +391,7 @@ $(MARCH_OPTION) \
 		CXXFLAGS +=\
 -DGOOGLE_CUDA=1 \
 -D__ANDROID_TYPES_FULL__ \
--DNVIDIA_TEGRA \
+-DANDROID_TEGRA \
 -DEIGEN_AVOID_STL_ARRAY \
 -DEIGEN_HAS_C99_MATH \
 -DLANG_CXX11 -DTENSORFLOW_USE_EIGEN_THREADPOOL -DTF_EXTRA_CUDA_CAPABILITIES=5.3
@@ -407,7 +407,7 @@ $(MARCH_OPTION) \
 -I$(JETPACK)/cuda/extras/CUPTI/include
 
 
-		LIBS += \
+		CUDA_LIBS := \
 -ltfcuda \
 -lcudart_static \
 -lcudnn \
@@ -420,10 +420,10 @@ $(MARCH_OPTION) \
 -lculibos \
 -lcurand_static
 
-		OBJDIR := $(OBJDIR)Tegra/
-		LIBDIR := $(LIBDIR)Tegra/
-		BINDIR := $(BINDIR)Tegra/
-		DEPDIR := $(DEPDIR)Tegra/
+		OBJDIR := $(OBJDIR)android_arm64-v8a/
+		LIBDIR := $(LIBDIR)android_arm64-v8a/
+		BINDIR := $(BINDIR)android_arm64-v8a/
+		DEPDIR := $(DEPDIR)android_arm64-v8a/
 
 		TEGRA_LIBS := \
 -L$(JETPACK)/cuda/targets/aarch64-linux-androideabi/lib \
@@ -606,7 +606,8 @@ $(wildcard tensorflow/core/util/*/*.cc) \
 tensorflow/core/util/version_info.cc
 # Remove duplicates (for version_info.cc)
 CORE_CC_ALL_SRCS := $(sort $(CORE_CC_ALL_SRCS))
-CORE_CC_EXCLUDE_SRCS := \
+
+CORE_CC_EXCLUDE_SRCS_NON_GPU := \
 $(wildcard tensorflow/core/*/*test.cc) \
 $(wildcard tensorflow/core/*/*testutil*) \
 $(wildcard tensorflow/core/*/*testlib*) \
@@ -626,49 +627,31 @@ $(wildcard tensorflow/core/lib/jpeg/*) \
 $(wildcard tensorflow/core/lib/png/*) \
 $(wildcard tensorflow/core/util/events_writer.*) \
 $(wildcard tensorflow/core/util/reporter.*) \
-$(wildcard tensorflow/core/platform/default/cuda_libdevice_path.*) \
-$(wildcard tensorflow/core/platform/default/stream_executor.*) \
 $(wildcard tensorflow/core/platform/default/test_benchmark.*) \
-$(wildcard tensorflow/core/platform/cuda.h) \
-$(wildcard tensorflow/core/platform/cuda_libdevice_path.*) \
 $(wildcard tensorflow/core/platform/cloud/*) \
 $(wildcard tensorflow/core/platform/google/*) \
 $(wildcard tensorflow/core/platform/google/*/*) \
 $(wildcard tensorflow/core/platform/jpeg.*) \
 $(wildcard tensorflow/core/platform/png.*) \
 $(wildcard tensorflow/core/platform/s3/*) \
-$(wildcard tensorflow/core/platform/stream_executor.*) \
 $(wildcard tensorflow/core/platform/windows/*) \
-$(wildcard tensorflow/core/user_ops/*.cu.cc) \
-$(wildcard tensorflow/core/common_runtime/gpu/*) \
-$(wildcard tensorflow/core/common_runtime/gpu_device_factory.*) \
 $(wildcard tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.*) \
 $(wildcard tensorflow/core/grappler/inputs/file_input_yielder.*) \
-$(wildcard tensorflow/core/grappler/clusters/single_machine.*)
+$(wildcard tensorflow/core/grappler/clusters/single_machine.*) \
+tensorflow/core/util/cuda_kernel_helper_test.cu.cc
+
+CORE_CC_EXCLUDE_SRCS := \
+$(CORE_CC_EXCLUDE_SRCS_NON_GPU) \
+$(wildcard tensorflow/core/platform/stream_executor.*) \
+$(wildcard tensorflow/core/platform/default/cuda_libdevice_path.*) \
+$(wildcard tensorflow/core/platform/cuda.h) \
+$(wildcard tensorflow/core/platform/cuda_libdevice_path.*) \
+$(wildcard tensorflow/core/user_ops/*.cu.cc) \
+$(wildcard tensorflow/core/common_runtime/gpu/*) \
+$(wildcard tensorflow/core/common_runtime/gpu_device_factory.*)
 
 ifeq ($(BUILD_FOR_TEGRA),1)
-CORE_CC_ALL_SRCS := \
-$(wildcard tensorflow/core/*.cc) \
-$(wildcard tensorflow/core/common_runtime/*.cc) \
-$(wildcard tensorflow/core/common_runtime/gpu/*.cc) \
-$(wildcard tensorflow/core/framework/*.cc) \
-$(wildcard tensorflow/core/graph/*.cc) \
-$(wildcard tensorflow/core/platform/*.cc) \
-$(wildcard tensorflow/core/platform/*/*.cc) \
-$(wildcard tensorflow/core/platform/*/*/*.cc) \
-$(wildcard tensorflow/core/util/*.cc) \
-$(wildcard tensorflow/core/util/*/*.cc) \
-$(wildcard tensorflow/cc/training/*.cc) \
-$(wildcard tensorflow/stream_executor/*.cc) \
-$(wildcard tensorflow/stream_executor/*/*.cc) \
-$(wildcard tensorflow/core/grappler/optimizers/*.cc) \
-$(wildcard tensorflow/core/grappler/*.cc) \
-$(wildcard tensorflow/core/grappler/costs/*.cc) \
-$(wildcard tensorflow/core/grappler/clusters/*.cc) \
-$(wildcard tensorflow/core/grappler/utils/*.cc) \
-$(wildcard tensorflow/core/lib/core/*.cc) \
-$(wildcard tensorflow/core/lib/*/*.cc) \
-tensorflow/core/grappler/inputs/utils.cc \
+CORE_CC_ALL_SRCS := $(CORE_CC_ALL_SRCS) \
 tensorflow/core/kernels/concat_lib_gpu.cc \
 tensorflow/core/kernels/cuda_solvers.cc \
 tensorflow/core/kernels/cudnn_pooling_gpu.cc \
@@ -677,28 +660,14 @@ tensorflow/core/kernels/fractional_avg_pool_op.cc \
 tensorflow/core/kernels/fractional_max_pool_op.cc \
 tensorflow/core/kernels/fractional_pool_common.cc \
 tensorflow/core/kernels/pooling_ops_3d.cc \
-tensorflow/core/kernels/sparse_fill_empty_rows_op.cc
+tensorflow/core/kernels/sparse_fill_empty_rows_op.cc \
+tensorflow/core/kernels/list_kernels.cc \
+$(wildcard tensorflow/core/common_runtime/gpu/*.cc) \
+$(wildcard tensorflow/stream_executor/*.cc) \
+$(wildcard tensorflow/stream_executor/*/*.cc)
 
 CORE_CC_EXCLUDE_SRCS := \
-$(wildcard tensorflow/core/*/*test.cc) \
-$(wildcard tensorflow/core/*/*testutil*) \
-$(wildcard tensorflow/core/*/*testlib*) \
-$(wildcard tensorflow/core/*/*/*test.cc) \
-$(wildcard tensorflow/core/*/*/*testutil*) \
-$(wildcard tensorflow/core/framework/op_gen_lib.cc) \
-$(wildcard tensorflow/core/lib/gif/*) \
-$(wildcard tensorflow/core/lib/jpeg/*) \
-$(wildcard tensorflow/core/lib/png/*) \
-$(wildcard tensorflow/core/lib/db/*) \
-$(wildcard tensorflow/core/platform/jpeg.*) \
-$(wildcard tensorflow/core/platform/png.*) \
-$(wildcard tensorflow/core/platform/cloud/*) \
-$(wildcard tensorflow/core/platform/s3/*) \
-$(wildcard tensorflow/core/platform/windows/*) \
-$(wildcard tensorflow/core/*/*/*testlib*) \
-$(wildcard tensorflow/cc/training/*test.cc) \
-tensorflow/core/lib/io/record_reader.cc \
-tensorflow/core/util/cuda_kernel_helper_test.cu.cc
+$(CORE_CC_EXCLUDE_SRCS_NON_GPU)
 
 CUDA_CC_SRCS := $(wildcard tensorflow/core/kernels/*.cu.cc)
 CUDA_CC_OBJS := $(addprefix $(OBJDIR), $(CUDA_CC_SRCS:.cc=.o))
@@ -760,7 +729,7 @@ $(BENCHMARK_NAME): $(BENCHMARK_OBJS) $(LIB_PATH) $(CUDA_LIB_DEPS)
 	@mkdir -p $(dir $@)
 	$(CXX) $(CXXFLAGS) $(INCLUDES) \
 	-o $(BENCHMARK_NAME) $(BENCHMARK_OBJS) \
-	$(LIBFLAGS) $(TEGRA_LIBS) $(LIB_PATH) $(LDFLAGS) $(LIBS)
+	$(LIBFLAGS) $(TEGRA_LIBS) $(LIB_PATH) $(LDFLAGS) $(LIBS) $(CUDA_LIBS)
 
 # NVCC compilation rules for Tegra
 ifeq ($(BUILD_FOR_TEGRA),1)
diff --git a/tensorflow/contrib/makefile/build_all_android.sh b/tensorflow/contrib/makefile/build_all_android.sh
index 980a44a5952a098da8a00e666d37a6d1642f4095..f67c5161861d20ba15ba165491365cdfd0239047 100755
--- a/tensorflow/contrib/makefile/build_all_android.sh
+++ b/tensorflow/contrib/makefile/build_all_android.sh
@@ -18,7 +18,7 @@
 set -e
 
 usage() {
-  echo "Usage: NDK_ROOT=<path to ndk root> $(basename "$0") [-Es:t:Tx:a:X]"
+  echo "Usage: NDK_ROOT=<path to ndk root> $(basename "$0") [-Es:t:Tx:a]"
   echo "-E enable experimental hexnn ops"
   echo "-s [sub_makefiles] sub makefiles separated by white space"
   echo "-t [build_target] build target for Android makefile [default=all]"
diff --git a/tensorflow/contrib/makefile/build_all_ios.sh b/tensorflow/contrib/makefile/build_all_ios.sh
index a18df256f976c3c0ac4cefe1c884d951e63ef823..2d9979183975e6a17527b40ef5ee1795ced44a7b 100755
--- a/tensorflow/contrib/makefile/build_all_ios.sh
+++ b/tensorflow/contrib/makefile/build_all_ios.sh
@@ -96,7 +96,7 @@ if [[ "${ONLY_MAKE_TENSORFLOW}" != "true" ]]; then
 
     if [[ -z "${BUILD_ARCH}" ]]; then
         # Compile protobuf for the target iOS device architectures.
-        tensorflow/contrib/makefile/compile_ios_protobuf.sh -a ${DEFAULT_ARCH}
+        tensorflow/contrib/makefile/compile_ios_protobuf.sh
     else
         # Compile protobuf for the target iOS device architectures.
         tensorflow/contrib/makefile/compile_ios_protobuf.sh -a ${BUILD_ARCH}
diff --git a/tensorflow/contrib/makefile/samples/build_and_run_inception_hexagon.sh b/tensorflow/contrib/makefile/samples/build_and_run_inception_hexagon.sh
index 861bb885c7031b996b48dbc50887cfce55c638f3..203ff4f890a3b0ed32caa1406508b100dd47bcad 100755
--- a/tensorflow/contrib/makefile/samples/build_and_run_inception_hexagon.sh
+++ b/tensorflow/contrib/makefile/samples/build_and_run_inception_hexagon.sh
@@ -76,6 +76,8 @@ GEN_LIBS_DIR="${GEN_DIR}/libs"
 GEN_DOWNLOAD_DIR="${GEN_DIR}/downloads"
 URL_BASE="https://storage.googleapis.com/download.tensorflow.org"
 
+ARCH="armeabi-v7a"
+
 source "${SCRIPT_DIR}/../build_helper.subr"
 
 rm -rf "${GEN_DIR}"
@@ -219,7 +221,7 @@ if [[ "${BUILD_ONLY}" != "true" ]]; then
     adb push "${GEN_LIBS_DIR}/libhexagon_nn_skel.so" "/vendor/lib/rfsa/adsp"
 
     adb push -p \
-        "${TF_ROOT_DIR}/tensorflow/contrib/makefile/gen/bin/hexagon_graph_execution" \
+        "${TF_ROOT_DIR}/tensorflow/contrib/makefile/gen/bin/android_${ARCH}/hexagon_graph_execution" \
         "/data/local/tmp/"
     adb wait-for-device
     adb shell chmod "${ANDROID_EXEC_FILE_MODE}" \
diff --git a/tensorflow/contrib/makefile/sub_makefiles/android/Makefile.in b/tensorflow/contrib/makefile/sub_makefiles/android/Makefile.in
index d9277ed60cb456208572ca1ad8df530648faef82..3081084ee76e41de801f49a67c1fec07f4ff03b9 100644
--- a/tensorflow/contrib/makefile/sub_makefiles/android/Makefile.in
+++ b/tensorflow/contrib/makefile/sub_makefiles/android/Makefile.in
@@ -54,7 +54,7 @@ $(INFERENCE_SO_PATH): $(LIB_OBJS) $(INFERENCE_OBJS) $(CUDA_LIB_DEPS)
 	-o $@ $(INFERENCE_OBJS) $(LIB_OBJS) $(TEGRA_LIBS) \
 	$(LIBFLAGS) $(LDFLAGS) \
 	-shared -Wl,-soname,$(INFERENCE_SO_NAME) \
-	$(LIBS)
+	$(LIBS) $(CUDA_LIBS)
 
 $(INFERENCE_SO_NAME): $(INFERENCE_SO_PATH)
 
diff --git a/tensorflow/contrib/makefile/tf_op_files.txt b/tensorflow/contrib/makefile/tf_op_files.txt
index 5f275663986f9d480659880ab601eeb5c41037be..5a812af4e95fe7a05b9c2634b0cc1d860fb7f619 100644
--- a/tensorflow/contrib/makefile/tf_op_files.txt
+++ b/tensorflow/contrib/makefile/tf_op_files.txt
@@ -91,6 +91,7 @@ tensorflow/core/kernels/reduction_ops_max.cc
 tensorflow/core/kernels/reduction_ops_common.cc
 tensorflow/core/kernels/reduction_ops_any.cc
 tensorflow/core/kernels/reduction_ops_all.cc
+tensorflow/core/kernels/roll_op.cc
 tensorflow/core/kernels/queue_ops.cc
 tensorflow/core/kernels/queue_base.cc
 tensorflow/core/kernels/pooling_ops_common.cc
@@ -270,6 +271,7 @@ tensorflow/core/ops/parsing_ops.cc
 tensorflow/core/ops/no_op.cc
 tensorflow/core/ops/nn_ops.cc
 tensorflow/core/ops/nn_grad.cc
+tensorflow/core/ops/manip_ops.cc
 tensorflow/core/ops/math_ops.cc
 tensorflow/core/ops/math_grad.cc
 tensorflow/core/ops/logging_ops.cc
@@ -291,3 +293,4 @@ tensorflow/core/kernels/batchtospace_op.cc
 tensorflow/core/kernels/warn_about_ints.cc
 tensorflow/core/kernels/segment_reduction_ops.cc
 tensorflow/core/kernels/batch_util.cc
+tensorflow/core/ops/audio_ops.cc
diff --git a/tensorflow/contrib/memory_stats/kernels/memory_stats_ops.cc b/tensorflow/contrib/memory_stats/kernels/memory_stats_ops.cc
index 39c0d5af45b4a81fa4dde0b5deac14a3af372cbb..974fb537499c5ea4591a0a128f53d2dea67b9e57 100644
--- a/tensorflow/contrib/memory_stats/kernels/memory_stats_ops.cc
+++ b/tensorflow/contrib/memory_stats/kernels/memory_stats_ops.cc
@@ -80,9 +80,9 @@ REGISTER_KERNEL_BUILDER(Name("BytesLimit").Device(DEVICE_GPU).HostMemory("out"),
                         BytesLimitOp);
 
 #ifdef TENSORFLOW_USE_SYCL
-REGISTER_KERNEL_BUILDER(Name("BytesLimit").Device(DEVICE_SYCL).HostMemory("out"),
-                        BytesLimitOp);
-#endif // TENSORFLOW_USE_SYCL
+REGISTER_KERNEL_BUILDER(
+    Name("BytesLimit").Device(DEVICE_SYCL).HostMemory("out"), BytesLimitOp);
+#endif  // TENSORFLOW_USE_SYCL
 
 // Op that measures the peak memory in bytes.
 class MaxBytesInUseOp : public MemoryStatsOp {
@@ -107,6 +107,6 @@ REGISTER_KERNEL_BUILDER(
 REGISTER_KERNEL_BUILDER(
     Name("MaxBytesInUse").Device(DEVICE_SYCL).HostMemory("out"),
     MaxBytesInUseOp);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/meta_graph_transform/meta_graph_transform.py b/tensorflow/contrib/meta_graph_transform/meta_graph_transform.py
index 2932ae1c8df32cd936cff932b061571c513fda79..ff88b4fa841673fc52b9f6fdc5ca43d30c44bbfd 100644
--- a/tensorflow/contrib/meta_graph_transform/meta_graph_transform.py
+++ b/tensorflow/contrib/meta_graph_transform/meta_graph_transform.py
@@ -171,7 +171,14 @@ def _clean_save_and_restore(graph_def, op, removed_op_names):
   shape_op_value_tensor.tensor_shape.dim[0].size = len(shapes)
   op.attr['dtypes'].list.type[:] = dtypes
 
+  if not name_op.attr['_output_shapes'].list.shape:
+    name_op.attr['_output_shapes'].list.shape.add()
+    name_op.attr['_output_shapes'].list.shape[0].dim.add()
   name_op.attr['_output_shapes'].list.shape[0].dim[0].size = len(names)
+
+  if not shape_op.attr['_output_shapes'].list.shape:
+    shape_op.attr['_output_shapes'].list.shape.add()
+    shape_op.attr['_output_shapes'].list.shape[0].dim.add()
   shape_op.attr['_output_shapes'].list.shape[0].dim[0].size = len(shapes)
 
 
diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops.py b/tensorflow/contrib/metrics/python/ops/metric_ops.py
index c3de1c4c62f04c7ef3d85f36662805c0c0ec4b4c..55946c128b1a46b8368aedd9f857c1902c4c4586 100644
--- a/tensorflow/contrib/metrics/python/ops/metric_ops.py
+++ b/tensorflow/contrib/metrics/python/ops/metric_ops.py
@@ -339,9 +339,9 @@ def streaming_mean_tensor(values,
       name=name)
 
 
-@deprecated(
-    None, 'Please switch to tf.metrics.accuracy. Note that the order of the '
-    'labels and predictions arguments has been switched.')
+@deprecated(None,
+            'Please switch to tf.metrics.accuracy. Note that the order of the '
+            'labels and predictions arguments has been switched.')
 def streaming_accuracy(predictions,
                        labels,
                        weights=None,
@@ -936,8 +936,9 @@ def streaming_curve_points(labels=None,
     if curve != 'ROC' and curve != 'PR':
       raise ValueError('curve must be either ROC or PR, %s unknown' % (curve))
     kepsilon = _EPSILON  # to account for floating point imprecisions
-    thresholds = [(i + 1) * 1.0 / (num_thresholds - 1)
-                  for i in range(num_thresholds - 2)]
+    thresholds = [
+        (i + 1) * 1.0 / (num_thresholds - 1) for i in range(num_thresholds - 2)
+    ]
     thresholds = [0.0 - kepsilon] + thresholds + [1.0 + kepsilon]
 
     values, update_ops = _streaming_confusion_matrix_at_thresholds(
@@ -973,9 +974,8 @@ def streaming_curve_points(labels=None,
     return points, update_op
 
 
-@deprecated(
-    None, 'Please switch to tf.metrics.auc. Note that the order of the '
-    'labels and predictions arguments has been switched.')
+@deprecated(None, 'Please switch to tf.metrics.auc. Note that the order of the '
+            'labels and predictions arguments has been switched.')
 def streaming_auc(predictions,
                   labels,
                   weights=None,
@@ -1105,8 +1105,7 @@ def _compute_dynamic_auc(labels, predictions, curve='ROC'):
       # For conformance, set precision to 1 when the number of positive
       # classifications is 0.
       y_axis_values = array_ops.where(
-          math_ops.greater(splits, 0),
-          math_ops.truediv(true_positives, splits),
+          math_ops.greater(splits, 0), math_ops.truediv(true_positives, splits),
           array_ops.ones_like(true_positives, dtype=dtypes.float64))
 
     # Calculate trapezoid areas.
@@ -1119,9 +1118,8 @@ def _compute_dynamic_auc(labels, predictions, curve='ROC'):
   # exception seems excessive) so we return 0, otherwise we finish computing.
   return control_flow_ops.cond(
       math_ops.logical_or(
-          math_ops.equal(total_positive, 0),
-          math_ops.equal(total_positive, size)
-      ),
+          math_ops.equal(total_positive, 0), math_ops.equal(
+              total_positive, size)),
       true_fn=lambda: array_ops.constant(0, dtypes.float64),
       false_fn=continue_computing_dynamic_auc)
 
@@ -1185,10 +1183,10 @@ def streaming_dynamic_auc(labels,
             array_ops.ones_like(labels, dtypes.int64),
             message='labels must be 0 or 1, at least one is >1')
     ]):
-      preds_accum, update_preds = streaming_concat(predictions,
-                                                   name='concat_preds')
-      labels_accum, update_labels = streaming_concat(labels,
-                                                     name='concat_labels')
+      preds_accum, update_preds = streaming_concat(
+          predictions, name='concat_preds')
+      labels_accum, update_labels = streaming_concat(
+          labels, name='concat_labels')
       update_op = control_flow_ops.group(update_labels, update_preds)
       auc = _compute_dynamic_auc(labels_accum, preds_accum, curve=curve)
       if updates_collections:
@@ -1571,9 +1569,9 @@ def streaming_precision_at_thresholds(predictions,
       name=name)
 
 
-@deprecated(
-    None, 'Please switch to tf.metrics.recall_at_thresholds. Note that the '
-    'order of the labels and predictions arguments has been switched.')
+@deprecated(None,
+            'Please switch to tf.metrics.recall_at_thresholds. Note that the '
+            'order of the labels and predictions arguments has been switched.')
 def streaming_recall_at_thresholds(predictions,
                                    labels,
                                    thresholds,
@@ -3299,8 +3297,13 @@ def count(values,
     return count_, update_op
 
 
-def cohen_kappa(labels, predictions_idx, num_classes, weights=None,
-                metrics_collections=None, updates_collections=None, name=None):
+def cohen_kappa(labels,
+                predictions_idx,
+                num_classes,
+                weights=None,
+                metrics_collections=None,
+                updates_collections=None,
+                name=None):
   """Calculates Cohen's kappa.
 
   [Cohen's kappa](https://en.wikipedia.org/wiki/Cohen's_kappa) is a statistic
@@ -3367,14 +3370,15 @@ def cohen_kappa(labels, predictions_idx, num_classes, weights=None,
       labels = array_ops.squeeze(labels, axis=[-1])
     predictions_idx, labels, weights = (
         metrics_impl._remove_squeezable_dimensions(  # pylint: disable=protected-access
-            predictions=predictions_idx, labels=labels, weights=weights))
+            predictions=predictions_idx,
+            labels=labels,
+            weights=weights))
     predictions_idx.get_shape().assert_is_compatible_with(labels.get_shape())
 
-    stat_dtype = (dtypes.int64
-                  if weights is None or weights.dtype.is_integer
-                  else dtypes.float32)
-    po = metrics_impl.metric_variable(
-        (num_classes,), stat_dtype, name='po')
+    stat_dtype = (
+        dtypes.int64
+        if weights is None or weights.dtype.is_integer else dtypes.float32)
+    po = metrics_impl.metric_variable((num_classes,), stat_dtype, name='po')
     pe_row = metrics_impl.metric_variable(
         (num_classes,), stat_dtype, name='pe_row')
     pe_col = metrics_impl.metric_variable(
@@ -3382,9 +3386,12 @@ def cohen_kappa(labels, predictions_idx, num_classes, weights=None,
 
     # Table of the counts of agreement:
     counts_in_table = confusion_matrix.confusion_matrix(
-      labels, predictions_idx,
-      num_classes=num_classes, weights=weights,
-      dtype=stat_dtype, name="counts_in_table")
+        labels,
+        predictions_idx,
+        num_classes=num_classes,
+        weights=weights,
+        dtype=stat_dtype,
+        name='counts_in_table')
 
     po_t = array_ops.diag_part(counts_in_table)
     pe_row_t = math_ops.reduce_sum(counts_in_table, axis=0)
@@ -3404,12 +3411,14 @@ def cohen_kappa(labels, predictions_idx, num_classes, weights=None,
                                math_ops.to_double(total))
       # kappa = (po - pe) / (N - pe)
       k = metrics_impl._safe_scalar_div(  # pylint: disable=protected-access
-          po_sum - pe_sum, total - pe_sum, name=name)
+          po_sum - pe_sum,
+          total - pe_sum,
+          name=name)
       return k
 
     kappa = _calculate_k(po, pe_row, pe_col, name='value')
-    update_op = _calculate_k(update_po, update_pe_row, update_pe_col,
-                             name='update_op')
+    update_op = _calculate_k(
+        update_po, update_pe_row, update_pe_col, name='update_op')
 
     if metrics_collections:
       ops.add_to_collections(metrics_collections, kappa)
diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops_test.py b/tensorflow/contrib/metrics/python/ops/metric_ops_test.py
index 89aa29f711e3b0114a5d776b258f77214cb349bc..e067f08babd9a900e876545d427c91e5ff808f04 100644
--- a/tensorflow/contrib/metrics/python/ops/metric_ops_test.py
+++ b/tensorflow/contrib/metrics/python/ops/metric_ops_test.py
@@ -46,8 +46,7 @@ def _enqueue_vector(sess, queue, values, shape=None):
     shape = (1, len(values))
   dtype = queue.dtypes[0]
   sess.run(
-      queue.enqueue(constant_op.constant(
-          values, dtype=dtype, shape=shape)))
+      queue.enqueue(constant_op.constant(values, dtype=dtype, shape=shape)))
 
 
 def _binary_2d_label_to_sparse_value(labels):
@@ -79,8 +78,8 @@ def _binary_2d_label_to_sparse_value(labels):
     batch += 1
   shape = [len(labels), len(labels[0])]
   return sparse_tensor.SparseTensorValue(
-      np.array(indices, np.int64),
-      np.array(values, np.int64), np.array(shape, np.int64))
+      np.array(indices, np.int64), np.array(values, np.int64),
+      np.array(shape, np.int64))
 
 
 def _binary_2d_label_to_sparse(labels):
@@ -125,8 +124,8 @@ def _binary_3d_label_to_sparse_value(labels):
           assert label == 0
   shape = [len(labels), len(labels[0]), len(labels[0][0])]
   return sparse_tensor.SparseTensorValue(
-      np.array(indices, np.int64),
-      np.array(values, np.int64), np.array(shape, np.int64))
+      np.array(indices, np.int64), np.array(values, np.int64),
+      np.array(shape, np.int64))
 
 
 def _binary_3d_label_to_sparse(labels):
@@ -669,20 +668,18 @@ class StreamingTruePositivesTest(test.TestCase):
     for expand_predictions in [True, False]:
       for expand_labels in [True, False]:
         for dtype in (dtypes_lib.bool, dtypes_lib.int32, dtypes_lib.float32):
-          predictions = math_ops.cast(constant_op.constant(
-              ((1, 0, 1, 0),
-               (0, 1, 1, 1),
-               (0, 0, 0, 0))), dtype=dtype)
+          predictions = math_ops.cast(
+              constant_op.constant(((1, 0, 1, 0), (0, 1, 1, 1), (0, 0, 0, 0))),
+              dtype=dtype)
           if expand_predictions:
             predictions = array_ops.expand_dims(predictions, 2)
-          labels = math_ops.cast(constant_op.constant(
-              ((0, 1, 1, 0),
-               (1, 0, 0, 0),
-               (0, 0, 0, 0))), dtype=dtype)
+          labels = math_ops.cast(
+              constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0))),
+              dtype=dtype)
           if expand_labels:
             labels = array_ops.expand_dims(labels, 2)
-          tp, tp_update_op = metrics.streaming_true_positives(predictions,
-                                                              labels)
+          tp, tp_update_op = metrics.streaming_true_positives(
+              predictions, labels)
 
           with self.test_session() as sess:
             sess.run(variables.local_variables_initializer())
@@ -692,14 +689,12 @@ class StreamingTruePositivesTest(test.TestCase):
 
   def testWeighted(self):
     for dtype in (dtypes_lib.bool, dtypes_lib.int32, dtypes_lib.float32):
-      predictions = math_ops.cast(constant_op.constant(
-          ((1, 0, 1, 0),
-           (0, 1, 1, 1),
-           (0, 0, 0, 0))), dtype=dtype)
-      labels = math_ops.cast(constant_op.constant(
-          ((0, 1, 1, 0),
-           (1, 0, 0, 0),
-           (0, 0, 0, 0))), dtype=dtype)
+      predictions = math_ops.cast(
+          constant_op.constant(((1, 0, 1, 0), (0, 1, 1, 1), (0, 0, 0, 0))),
+          dtype=dtype)
+      labels = math_ops.cast(
+          constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0))),
+          dtype=dtype)
       tp, tp_update_op = metrics.streaming_true_positives(
           predictions, labels, weights=37.0)
 
@@ -717,28 +712,25 @@ class StreamingFalseNegativesTest(test.TestCase):
     ops.reset_default_graph()
 
   def testVars(self):
-    metrics.streaming_false_negatives((0, 1, 0),
-                                      (0, 1, 1))
+    metrics.streaming_false_negatives((0, 1, 0), (0, 1, 1))
     _assert_metric_variables(self, ('false_negatives/count:0',))
 
   def testUnweighted(self):
     for expand_predictions in [True, False]:
       for expand_labels in [True, False]:
         for dtype in (dtypes_lib.bool, dtypes_lib.int32, dtypes_lib.float32):
-          predictions = math_ops.cast(constant_op.constant(
-              ((1, 0, 1, 0),
-               (0, 1, 1, 1),
-               (0, 0, 0, 0))), dtype=dtype)
+          predictions = math_ops.cast(
+              constant_op.constant(((1, 0, 1, 0), (0, 1, 1, 1), (0, 0, 0, 0))),
+              dtype=dtype)
           if expand_predictions:
             predictions = array_ops.expand_dims(predictions, 2)
-          labels = math_ops.cast(constant_op.constant(
-              ((0, 1, 1, 0),
-               (1, 0, 0, 0),
-               (0, 0, 0, 0))), dtype=dtype)
+          labels = math_ops.cast(
+              constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0))),
+              dtype=dtype)
           if expand_labels:
             labels = array_ops.expand_dims(labels, 2)
-          fn, fn_update_op = metrics.streaming_false_negatives(predictions,
-                                                               labels)
+          fn, fn_update_op = metrics.streaming_false_negatives(
+              predictions, labels)
 
           with self.test_session() as sess:
             sess.run(variables.local_variables_initializer())
@@ -748,14 +740,12 @@ class StreamingFalseNegativesTest(test.TestCase):
 
   def testWeighted(self):
     for dtype in (dtypes_lib.bool, dtypes_lib.int32, dtypes_lib.float32):
-      predictions = math_ops.cast(constant_op.constant(
-          ((1, 0, 1, 0),
-           (0, 1, 1, 1),
-           (0, 0, 0, 0))), dtype=dtype)
-      labels = math_ops.cast(constant_op.constant(
-          ((0, 1, 1, 0),
-           (1, 0, 0, 0),
-           (0, 0, 0, 0))), dtype=dtype)
+      predictions = math_ops.cast(
+          constant_op.constant(((1, 0, 1, 0), (0, 1, 1, 1), (0, 0, 0, 0))),
+          dtype=dtype)
+      labels = math_ops.cast(
+          constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0))),
+          dtype=dtype)
       fn, fn_update_op = metrics.streaming_false_negatives(
           predictions, labels, weights=((3.0,), (5.0,), (7.0,)))
 
@@ -773,28 +763,25 @@ class StreamingFalsePositivesTest(test.TestCase):
     ops.reset_default_graph()
 
   def testVars(self):
-    metrics.streaming_false_positives((0, 1, 0),
-                                      (0, 1, 1))
+    metrics.streaming_false_positives((0, 1, 0), (0, 1, 1))
     _assert_metric_variables(self, ('false_positives/count:0',))
 
   def testUnweighted(self):
     for expand_predictions in [True, False]:
       for expand_labels in [True, False]:
         for dtype in (dtypes_lib.bool, dtypes_lib.int32, dtypes_lib.float32):
-          predictions = math_ops.cast(constant_op.constant(
-              ((1, 0, 1, 0),
-               (0, 1, 1, 1),
-               (0, 0, 0, 0))), dtype=dtype)
+          predictions = math_ops.cast(
+              constant_op.constant(((1, 0, 1, 0), (0, 1, 1, 1), (0, 0, 0, 0))),
+              dtype=dtype)
           if expand_predictions:
             predictions = array_ops.expand_dims(predictions, 2)
-          labels = math_ops.cast(constant_op.constant(
-              ((0, 1, 1, 0),
-               (1, 0, 0, 0),
-               (0, 0, 0, 0))), dtype=dtype)
+          labels = math_ops.cast(
+              constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0))),
+              dtype=dtype)
           if expand_labels:
             labels = array_ops.expand_dims(labels, 2)
-          fp, fp_update_op = metrics.streaming_false_positives(predictions,
-                                                               labels)
+          fp, fp_update_op = metrics.streaming_false_positives(
+              predictions, labels)
 
           with self.test_session() as sess:
             sess.run(variables.local_variables_initializer())
@@ -804,20 +791,17 @@ class StreamingFalsePositivesTest(test.TestCase):
 
   def testWeighted(self):
     for dtype in (dtypes_lib.bool, dtypes_lib.int32, dtypes_lib.float32):
-      predictions = math_ops.cast(constant_op.constant(
-          ((1, 0, 1, 0),
-           (0, 1, 1, 1),
-           (0, 0, 0, 0))), dtype=dtype)
-      labels = math_ops.cast(constant_op.constant(
-          ((0, 1, 1, 0),
-           (1, 0, 0, 0),
-           (0, 0, 0, 0))), dtype=dtype)
+      predictions = math_ops.cast(
+          constant_op.constant(((1, 0, 1, 0), (0, 1, 1, 1), (0, 0, 0, 0))),
+          dtype=dtype)
+      labels = math_ops.cast(
+          constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0))),
+          dtype=dtype)
       fp, fp_update_op = metrics.streaming_false_positives(
           predictions,
           labels,
-          weights=((1.0, 2.0, 3.0, 5.0),
-                   (7.0, 11.0, 13.0, 17.0),
-                   (19.0, 23.0, 29.0, 31.0)))
+          weights=((1.0, 2.0, 3.0, 5.0), (7.0, 11.0, 13.0, 17.0), (19.0, 23.0,
+                                                                   29.0, 31.0)))
 
       with self.test_session() as sess:
         sess.run(variables.local_variables_initializer())
@@ -833,28 +817,25 @@ class StreamingTrueNegativesTest(test.TestCase):
     ops.reset_default_graph()
 
   def testVars(self):
-    metrics.streaming_true_negatives((0, 1, 0),
-                                     (0, 1, 1))
+    metrics.streaming_true_negatives((0, 1, 0), (0, 1, 1))
     _assert_metric_variables(self, ('true_negatives/count:0',))
 
   def testUnweighted(self):
     for expand_predictions in [True, False]:
       for expand_labels in [True, False]:
         for dtype in (dtypes_lib.bool, dtypes_lib.int32, dtypes_lib.float32):
-          predictions = math_ops.cast(constant_op.constant(
-              ((1, 0, 1, 0),
-               (0, 1, 1, 1),
-               (0, 0, 0, 0))), dtype=dtype)
+          predictions = math_ops.cast(
+              constant_op.constant(((1, 0, 1, 0), (0, 1, 1, 1), (0, 0, 0, 0))),
+              dtype=dtype)
           if expand_predictions:
             predictions = array_ops.expand_dims(predictions, 2)
-          labels = math_ops.cast(constant_op.constant(
-              ((0, 1, 1, 0),
-               (1, 0, 0, 0),
-               (0, 0, 0, 0))), dtype=dtype)
+          labels = math_ops.cast(
+              constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0))),
+              dtype=dtype)
           if expand_labels:
             labels = array_ops.expand_dims(labels, 2)
-          tn, tn_update_op = metrics.streaming_true_negatives(predictions,
-                                                              labels)
+          tn, tn_update_op = metrics.streaming_true_negatives(
+              predictions, labels)
 
           with self.test_session() as sess:
             sess.run(variables.local_variables_initializer())
@@ -864,14 +845,12 @@ class StreamingTrueNegativesTest(test.TestCase):
 
   def testWeighted(self):
     for dtype in (dtypes_lib.bool, dtypes_lib.int32, dtypes_lib.float32):
-      predictions = math_ops.cast(constant_op.constant(
-          ((1, 0, 1, 0),
-           (0, 1, 1, 1),
-           (0, 0, 0, 0))), dtype=dtype)
-      labels = math_ops.cast(constant_op.constant(
-          ((0, 1, 1, 0),
-           (1, 0, 0, 0),
-           (0, 0, 0, 0))), dtype=dtype)
+      predictions = math_ops.cast(
+          constant_op.constant(((1, 0, 1, 0), (0, 1, 1, 1), (0, 0, 0, 0))),
+          dtype=dtype)
+      labels = math_ops.cast(
+          constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0))),
+          dtype=dtype)
       tn, tn_update_op = metrics.streaming_true_negatives(
           predictions, labels, weights=((0.0, 2.0, 3.0, 5.0),))
 
@@ -894,12 +873,9 @@ class StreamingTruePositivesAtThresholdsTest(test.TestCase):
     _assert_metric_variables(self, ('true_positives:0',))
 
   def testUnweighted(self):
-    predictions = constant_op.constant(((0.9, 0.2, 0.8, 0.1),
-                                        (0.2, 0.9, 0.7, 0.6),
-                                        (0.1, 0.2, 0.4, 0.3)))
-    labels = constant_op.constant(((0, 1, 1, 0),
-                                   (1, 0, 0, 0),
-                                   (0, 0, 0, 0)))
+    predictions = constant_op.constant(
+        ((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6), (0.1, 0.2, 0.4, 0.3)))
+    labels = constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0)))
     tp, tp_update_op = metrics.streaming_true_positives_at_thresholds(
         predictions, labels, thresholds=(0.15, 0.5, 0.85))
 
@@ -910,12 +886,9 @@ class StreamingTruePositivesAtThresholdsTest(test.TestCase):
       self.assertAllEqual((3, 1, 0), tp.eval())
 
   def testWeighted(self):
-    predictions = constant_op.constant(((0.9, 0.2, 0.8, 0.1),
-                                        (0.2, 0.9, 0.7, 0.6),
-                                        (0.1, 0.2, 0.4, 0.3)))
-    labels = constant_op.constant(((0, 1, 1, 0),
-                                   (1, 0, 0, 0),
-                                   (0, 0, 0, 0)))
+    predictions = constant_op.constant(
+        ((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6), (0.1, 0.2, 0.4, 0.3)))
+    labels = constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0)))
     tp, tp_update_op = metrics.streaming_true_positives_at_thresholds(
         predictions, labels, weights=37.0, thresholds=(0.15, 0.5, 0.85))
 
@@ -937,16 +910,14 @@ class StreamingFalseNegativesAtThresholdsTest(test.TestCase):
         (0.0, 1.0, 0.0), (0, 1, 1), thresholds=(
             0.15,
             0.5,
-            0.85,))
+            0.85,
+        ))
     _assert_metric_variables(self, ('false_negatives:0',))
 
   def testUnweighted(self):
-    predictions = constant_op.constant(((0.9, 0.2, 0.8, 0.1),
-                                        (0.2, 0.9, 0.7, 0.6),
-                                        (0.1, 0.2, 0.4, 0.3)))
-    labels = constant_op.constant(((0, 1, 1, 0),
-                                   (1, 0, 0, 0),
-                                   (0, 0, 0, 0)))
+    predictions = constant_op.constant(
+        ((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6), (0.1, 0.2, 0.4, 0.3)))
+    labels = constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0)))
     fn, fn_update_op = metrics.streaming_false_negatives_at_thresholds(
         predictions, labels, thresholds=(0.15, 0.5, 0.85))
 
@@ -957,12 +928,9 @@ class StreamingFalseNegativesAtThresholdsTest(test.TestCase):
       self.assertAllEqual((0, 2, 3), fn.eval())
 
   def testWeighted(self):
-    predictions = constant_op.constant(((0.9, 0.2, 0.8, 0.1),
-                                        (0.2, 0.9, 0.7, 0.6),
-                                        (0.1, 0.2, 0.4, 0.3)))
-    labels = constant_op.constant(((0, 1, 1, 0),
-                                   (1, 0, 0, 0),
-                                   (0, 0, 0, 0)))
+    predictions = constant_op.constant(
+        ((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6), (0.1, 0.2, 0.4, 0.3)))
+    labels = constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0)))
     fn, fn_update_op = metrics.streaming_false_negatives_at_thresholds(
         predictions,
         labels,
@@ -988,12 +956,9 @@ class StreamingFalsePositivesAtThresholdsTest(test.TestCase):
     _assert_metric_variables(self, ('false_positives:0',))
 
   def testUnweighted(self):
-    predictions = constant_op.constant(((0.9, 0.2, 0.8, 0.1),
-                                        (0.2, 0.9, 0.7, 0.6),
-                                        (0.1, 0.2, 0.4, 0.3)))
-    labels = constant_op.constant(((0, 1, 1, 0),
-                                   (1, 0, 0, 0),
-                                   (0, 0, 0, 0)))
+    predictions = constant_op.constant(
+        ((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6), (0.1, 0.2, 0.4, 0.3)))
+    labels = constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0)))
     fp, fp_update_op = metrics.streaming_false_positives_at_thresholds(
         predictions, labels, thresholds=(0.15, 0.5, 0.85))
 
@@ -1004,18 +969,14 @@ class StreamingFalsePositivesAtThresholdsTest(test.TestCase):
       self.assertAllEqual((7, 4, 2), fp.eval())
 
   def testWeighted(self):
-    predictions = constant_op.constant(((0.9, 0.2, 0.8, 0.1),
-                                        (0.2, 0.9, 0.7, 0.6),
-                                        (0.1, 0.2, 0.4, 0.3)))
-    labels = constant_op.constant(((0, 1, 1, 0),
-                                   (1, 0, 0, 0),
-                                   (0, 0, 0, 0)))
+    predictions = constant_op.constant(
+        ((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6), (0.1, 0.2, 0.4, 0.3)))
+    labels = constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0)))
     fp, fp_update_op = metrics.streaming_false_positives_at_thresholds(
         predictions,
         labels,
-        weights=((1.0, 2.0, 3.0, 5.0),
-                 (7.0, 11.0, 13.0, 17.0),
-                 (19.0, 23.0, 29.0, 31.0)),
+        weights=((1.0, 2.0, 3.0, 5.0), (7.0, 11.0, 13.0, 17.0), (19.0, 23.0,
+                                                                 29.0, 31.0)),
         thresholds=(0.15, 0.5, 0.85))
 
     with self.test_session() as sess:
@@ -1037,12 +998,9 @@ class StreamingTrueNegativesAtThresholdsTest(test.TestCase):
     _assert_metric_variables(self, ('true_negatives:0',))
 
   def testUnweighted(self):
-    predictions = constant_op.constant(((0.9, 0.2, 0.8, 0.1),
-                                        (0.2, 0.9, 0.7, 0.6),
-                                        (0.1, 0.2, 0.4, 0.3)))
-    labels = constant_op.constant(((0, 1, 1, 0),
-                                   (1, 0, 0, 0),
-                                   (0, 0, 0, 0)))
+    predictions = constant_op.constant(
+        ((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6), (0.1, 0.2, 0.4, 0.3)))
+    labels = constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0)))
     tn, tn_update_op = metrics.streaming_true_negatives_at_thresholds(
         predictions, labels, thresholds=(0.15, 0.5, 0.85))
 
@@ -1053,12 +1011,9 @@ class StreamingTrueNegativesAtThresholdsTest(test.TestCase):
       self.assertAllEqual((2, 5, 7), tn.eval())
 
   def testWeighted(self):
-    predictions = constant_op.constant(((0.9, 0.2, 0.8, 0.1),
-                                        (0.2, 0.9, 0.7, 0.6),
-                                        (0.1, 0.2, 0.4, 0.3)))
-    labels = constant_op.constant(((0, 1, 1, 0),
-                                   (1, 0, 0, 0),
-                                   (0, 0, 0, 0)))
+    predictions = constant_op.constant(
+        ((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6), (0.1, 0.2, 0.4, 0.3)))
+    labels = constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0)))
     tn, tn_update_op = metrics.streaming_true_negatives_at_thresholds(
         predictions,
         labels,
@@ -1393,8 +1348,7 @@ class StreamingFPRTest(test.TestCase):
         (10, 3), maxval=1, dtype=dtypes_lib.int64, seed=1)
     labels = random_ops.random_uniform(
         (10, 3), maxval=2, dtype=dtypes_lib.int64, seed=2)
-    fpr, update_op = metrics.streaming_false_positive_rate(
-        predictions, labels)
+    fpr, update_op = metrics.streaming_false_positive_rate(predictions, labels)
 
     with self.test_session() as sess:
       sess.run(variables.local_variables_initializer())
@@ -1413,8 +1367,7 @@ class StreamingFPRTest(test.TestCase):
 
     predictions = constant_op.constant(np_inputs)
     labels = constant_op.constant(np_inputs)
-    fpr, update_op = metrics.streaming_false_positive_rate(
-        predictions, labels)
+    fpr, update_op = metrics.streaming_false_positive_rate(predictions, labels)
 
     with self.test_session() as sess:
       sess.run(variables.local_variables_initializer())
@@ -1424,8 +1377,7 @@ class StreamingFPRTest(test.TestCase):
   def testSomeCorrect(self):
     predictions = constant_op.constant([1, 0, 1, 0], shape=(1, 4))
     labels = constant_op.constant([0, 1, 1, 0], shape=(1, 4))
-    fpr, update_op = metrics.streaming_false_positive_rate(
-        predictions, labels)
+    fpr, update_op = metrics.streaming_false_positive_rate(predictions, labels)
 
     with self.test_session() as sess:
       sess.run(variables.local_variables_initializer())
@@ -1467,8 +1419,7 @@ class StreamingFPRTest(test.TestCase):
 
     predictions = constant_op.constant(np_inputs)
     labels = constant_op.constant(1 - np_inputs)
-    fpr, update_op = metrics.streaming_false_positive_rate(
-        predictions, labels)
+    fpr, update_op = metrics.streaming_false_positive_rate(predictions, labels)
 
     with self.test_session() as sess:
       sess.run(variables.local_variables_initializer())
@@ -1478,8 +1429,7 @@ class StreamingFPRTest(test.TestCase):
   def testZeroFalsePositivesAndTrueNegativesGivesZeroFPR(self):
     predictions = array_ops.ones((1, 4))
     labels = array_ops.ones((1, 4))
-    fpr, update_op = metrics.streaming_false_positive_rate(
-        predictions, labels)
+    fpr, update_op = metrics.streaming_false_positive_rate(predictions, labels)
 
     with self.test_session() as sess:
       sess.run(variables.local_variables_initializer())
@@ -1521,8 +1471,7 @@ class StreamingFNRTest(test.TestCase):
         (10, 3), maxval=1, dtype=dtypes_lib.int64, seed=1)
     labels = random_ops.random_uniform(
         (10, 3), maxval=2, dtype=dtypes_lib.int64, seed=2)
-    fnr, update_op = metrics.streaming_false_negative_rate(
-        predictions, labels)
+    fnr, update_op = metrics.streaming_false_negative_rate(predictions, labels)
 
     with self.test_session() as sess:
       sess.run(variables.local_variables_initializer())
@@ -1541,8 +1490,7 @@ class StreamingFNRTest(test.TestCase):
 
     predictions = constant_op.constant(np_inputs)
     labels = constant_op.constant(np_inputs)
-    fnr, update_op = metrics.streaming_false_negative_rate(
-        predictions, labels)
+    fnr, update_op = metrics.streaming_false_negative_rate(predictions, labels)
 
     with self.test_session() as sess:
       sess.run(variables.local_variables_initializer())
@@ -1552,8 +1500,7 @@ class StreamingFNRTest(test.TestCase):
   def testSomeCorrect(self):
     predictions = constant_op.constant([1, 0, 1, 0], shape=(1, 4))
     labels = constant_op.constant([0, 1, 1, 0], shape=(1, 4))
-    fnr, update_op = metrics.streaming_false_negative_rate(
-        predictions, labels)
+    fnr, update_op = metrics.streaming_false_negative_rate(predictions, labels)
 
     with self.test_session() as sess:
       sess.run(variables.local_variables_initializer())
@@ -1595,8 +1542,7 @@ class StreamingFNRTest(test.TestCase):
 
     predictions = constant_op.constant(np_inputs)
     labels = constant_op.constant(1 - np_inputs)
-    fnr, update_op = metrics.streaming_false_negative_rate(
-        predictions, labels)
+    fnr, update_op = metrics.streaming_false_negative_rate(predictions, labels)
 
     with self.test_session() as sess:
       sess.run(variables.local_variables_initializer())
@@ -1606,8 +1552,7 @@ class StreamingFNRTest(test.TestCase):
   def testZeroFalseNegativesAndTruePositivesGivesZeroFNR(self):
     predictions = array_ops.zeros((1, 4))
     labels = array_ops.zeros((1, 4))
-    fnr, update_op = metrics.streaming_false_negative_rate(
-        predictions, labels)
+    fnr, update_op = metrics.streaming_false_negative_rate(predictions, labels)
 
     with self.test_session() as sess:
       sess.run(variables.local_variables_initializer())
@@ -1944,16 +1889,17 @@ class StreamingAUCTest(test.TestCase):
         enqueue_ops[i].append(x_queue.enqueue(x_batches[i, :]))
       return x_queue.dequeue()
 
-    for weights in (None, np.ones(num_samples), np.random.exponential(
-        scale=1.0, size=num_samples)):
+    for weights in (None, np.ones(num_samples),
+                    np.random.exponential(scale=1.0, size=num_samples)):
       expected_auc = _np_auc(predictions, labels, weights)
 
       with self.test_session() as sess:
         enqueue_ops = [[] for i in range(num_batches)]
         tf_predictions = _enqueue_as_batches(predictions, enqueue_ops)
         tf_labels = _enqueue_as_batches(labels, enqueue_ops)
-        tf_weights = (_enqueue_as_batches(weights, enqueue_ops) if
-                      weights is not None else None)
+        tf_weights = (
+            _enqueue_as_batches(weights, enqueue_ops)
+            if weights is not None else None)
 
         for i in range(num_batches):
           sess.run(enqueue_ops[i])
@@ -1985,17 +1931,18 @@ class StreamingDynamicAUCTest(test.TestCase):
   def testUnknownCurve(self):
     with self.assertRaisesRegexp(
         ValueError, 'curve must be either ROC or PR, TEST_CURVE unknown'):
-      metrics.streaming_dynamic_auc(labels=array_ops.ones((10, 1)),
-                                    predictions=array_ops.ones((10, 1)),
-                                    curve='TEST_CURVE')
+      metrics.streaming_dynamic_auc(
+          labels=array_ops.ones((10, 1)),
+          predictions=array_ops.ones((10, 1)),
+          curve='TEST_CURVE')
 
   def testVars(self):
     metrics.streaming_dynamic_auc(
         labels=array_ops.ones((10, 1)), predictions=array_ops.ones((10, 1)))
-    _assert_metric_variables(self, ['dynamic_auc/concat_labels/array:0',
-                                    'dynamic_auc/concat_labels/size:0',
-                                    'dynamic_auc/concat_preds/array:0',
-                                    'dynamic_auc/concat_preds/size:0'])
+    _assert_metric_variables(self, [
+        'dynamic_auc/concat_labels/array:0', 'dynamic_auc/concat_labels/size:0',
+        'dynamic_auc/concat_preds/array:0', 'dynamic_auc/concat_preds/size:0'
+    ])
 
   def testMetricsCollection(self):
     my_collection_name = '__metrics__'
@@ -2049,8 +1996,8 @@ class StreamingDynamicAUCTest(test.TestCase):
 
   def testNonZeroOnePredictions(self):
     with self.test_session() as sess:
-      predictions = constant_op.constant([2.5, -2.5, 2.5, -2.5],
-                                         dtype=dtypes_lib.float32)
+      predictions = constant_op.constant(
+          [2.5, -2.5, 2.5, -2.5], dtype=dtypes_lib.float32)
       labels = constant_op.constant([1, 0, 1, 0])
       auc, update_op = metrics.streaming_dynamic_auc(labels, predictions)
       sess.run(variables.local_variables_initializer())
@@ -2122,9 +2069,10 @@ class StreamingDynamicAUCTest(test.TestCase):
     num_batches = 100
     labels = np.array([])
     predictions = np.array([])
-    tf_labels = variables.Variable(array_ops.ones(batch_size, dtypes_lib.int32),
-                                   collections=[ops.GraphKeys.LOCAL_VARIABLES],
-                                   dtype=dtypes_lib.int32)
+    tf_labels = variables.Variable(
+        array_ops.ones(batch_size, dtypes_lib.int32),
+        collections=[ops.GraphKeys.LOCAL_VARIABLES],
+        dtype=dtypes_lib.int32)
     tf_predictions = variables.Variable(
         array_ops.ones(batch_size),
         collections=[ops.GraphKeys.LOCAL_VARIABLES],
@@ -2195,8 +2143,7 @@ class StreamingPrecisionRecallAtEqualThresholdsTest(test.TestCase):
       gotten_result: A PrecisionRecallData object.
     """
     gotten_dict = {k: t.eval() for k, t in gotten_result._asdict().items()}
-    self.assertItemsEqual(
-        list(expected_dict.keys()), list(gotten_dict.keys()))
+    self.assertItemsEqual(list(expected_dict.keys()), list(gotten_dict.keys()))
 
     for key, expected_values in expected_dict.items():
       self.assertAllClose(expected_values, gotten_dict[key])
@@ -2261,60 +2208,65 @@ class StreamingPrecisionRecallAtEqualThresholdsTest(test.TestCase):
         sess.run(update_op)
 
       # Then verify idempotency.
-      initial_result = {k: value.eval().tolist() for k, value in
-                        result._asdict().items()}
+      initial_result = {
+          k: value.eval().tolist()
+          for k, value in result._asdict().items()
+      }
       for _ in range(3):
         self._testResultsEqual(initial_result, result)
 
   def testAllTruePositives(self):
-    self._testCase([[1]], [[True]], {
-        'tp': [1, 1, 1],
-        'fp': [0, 0, 0],
-        'tn': [0, 0, 0],
-        'fn': [0, 0, 0],
-        'precision': [1.0, 1.0, 1.0],
-        'recall': [1.0, 1.0, 1.0],
-        'thresholds': [0.0, 0.5, 1.0],
-    })
+    self._testCase(
+        [[1]], [[True]], {
+            'tp': [1, 1, 1],
+            'fp': [0, 0, 0],
+            'tn': [0, 0, 0],
+            'fn': [0, 0, 0],
+            'precision': [1.0, 1.0, 1.0],
+            'recall': [1.0, 1.0, 1.0],
+            'thresholds': [0.0, 0.5, 1.0],
+        })
 
   def testAllTrueNegatives(self):
-    self._testCase([[0]], [[False]], {
-        'tp': [0, 0, 0],
-        'fp': [1, 0, 0],
-        'tn': [0, 1, 1],
-        'fn': [0, 0, 0],
-        'precision': [0.0, 0.0, 0.0],
-        'recall': [0.0, 0.0, 0.0],
-        'thresholds': [0.0, 0.5, 1.0],
-    })
+    self._testCase(
+        [[0]], [[False]], {
+            'tp': [0, 0, 0],
+            'fp': [1, 0, 0],
+            'tn': [0, 1, 1],
+            'fn': [0, 0, 0],
+            'precision': [0.0, 0.0, 0.0],
+            'recall': [0.0, 0.0, 0.0],
+            'thresholds': [0.0, 0.5, 1.0],
+        })
 
   def testAllFalsePositives(self):
-    self._testCase([[1]], [[False]], {
-        'tp': [0, 0, 0],
-        'fp': [1, 1, 1],
-        'tn': [0, 0, 0],
-        'fn': [0, 0, 0],
-        'precision': [0.0, 0.0, 0.0],
-        'recall': [0.0, 0.0, 0.0],
-        'thresholds': [0.0, 0.5, 1.0],
-    })
+    self._testCase(
+        [[1]], [[False]], {
+            'tp': [0, 0, 0],
+            'fp': [1, 1, 1],
+            'tn': [0, 0, 0],
+            'fn': [0, 0, 0],
+            'precision': [0.0, 0.0, 0.0],
+            'recall': [0.0, 0.0, 0.0],
+            'thresholds': [0.0, 0.5, 1.0],
+        })
 
   def testAllFalseNegatives(self):
-    self._testCase([[0]], [[True]], {
-        'tp': [1, 0, 0],
-        'fp': [0, 0, 0],
-        'tn': [0, 0, 0],
-        'fn': [0, 1, 1],
-        'precision': [1.0, 0.0, 0.0],
-        'recall': [1.0, 0.0, 0.0],
-        'thresholds': [0.0, 0.5, 1.0],
-    })
+    self._testCase(
+        [[0]], [[True]], {
+            'tp': [1, 0, 0],
+            'fp': [0, 0, 0],
+            'tn': [0, 0, 0],
+            'fn': [0, 1, 1],
+            'precision': [1.0, 0.0, 0.0],
+            'recall': [1.0, 0.0, 0.0],
+            'thresholds': [0.0, 0.5, 1.0],
+        })
 
   def testManyValues(self):
     self._testCase(
         [[0.2, 0.3, 0.4, 0.6, 0.7, 0.8]],
-        [[True, False, False, True, True, True]],
-        {
+        [[True, False, False, True, True, True]], {
             'tp': [4, 3, 0],
             'fp': [2, 0, 0],
             'tn': [0, 2, 2],
@@ -2327,8 +2279,7 @@ class StreamingPrecisionRecallAtEqualThresholdsTest(test.TestCase):
   def testManyValuesWithWeights(self):
     self._testCase(
         [[0.2, 0.3, 0.4, 0.6, 0.7, 0.8]],
-        [[True, False, False, True, True, True]],
-        {
+        [[True, False, False, True, True, True]], {
             'tp': [1.5, 1.5, 0.0],
             'fp': [2.5, 0.0, 0.0],
             'tn': [0.0, 2.5, 2.5],
@@ -2644,11 +2595,10 @@ class StreamingPrecisionRecallThresholdsTest(test.TestCase):
     labels = random_ops.random_uniform(
         (10, 3), maxval=2, dtype=dtypes_lib.int64, seed=2)
     thresholds = [0, 0.5, 1.0]
-    prec, prec_op = metrics.streaming_precision_at_thresholds(predictions,
-                                                              labels,
-                                                              thresholds)
-    rec, rec_op = metrics.streaming_recall_at_thresholds(predictions, labels,
-                                                         thresholds)
+    prec, prec_op = metrics.streaming_precision_at_thresholds(
+        predictions, labels, thresholds)
+    rec, rec_op = metrics.streaming_recall_at_thresholds(
+        predictions, labels, thresholds)
 
     with self.test_session() as sess:
       sess.run(variables.local_variables_initializer())
@@ -2672,11 +2622,10 @@ class StreamingPrecisionRecallThresholdsTest(test.TestCase):
       predictions = constant_op.constant(inputs, dtype=dtypes_lib.float32)
       labels = constant_op.constant(inputs)
       thresholds = [0.5]
-      prec, prec_op = metrics.streaming_precision_at_thresholds(predictions,
-                                                                labels,
-                                                                thresholds)
-      rec, rec_op = metrics.streaming_recall_at_thresholds(predictions, labels,
-                                                           thresholds)
+      prec, prec_op = metrics.streaming_precision_at_thresholds(
+          predictions, labels, thresholds)
+      rec, rec_op = metrics.streaming_recall_at_thresholds(
+          predictions, labels, thresholds)
 
       sess.run(variables.local_variables_initializer())
       sess.run([prec_op, rec_op])
@@ -2690,11 +2639,10 @@ class StreamingPrecisionRecallThresholdsTest(test.TestCase):
           [1, 0, 1, 0], shape=(1, 4), dtype=dtypes_lib.float32)
       labels = constant_op.constant([0, 1, 1, 0], shape=(1, 4))
       thresholds = [0.5]
-      prec, prec_op = metrics.streaming_precision_at_thresholds(predictions,
-                                                                labels,
-                                                                thresholds)
-      rec, rec_op = metrics.streaming_recall_at_thresholds(predictions, labels,
-                                                           thresholds)
+      prec, prec_op = metrics.streaming_precision_at_thresholds(
+          predictions, labels, thresholds)
+      rec, rec_op = metrics.streaming_recall_at_thresholds(
+          predictions, labels, thresholds)
 
       sess.run(variables.local_variables_initializer())
       sess.run([prec_op, rec_op])
@@ -2709,11 +2657,10 @@ class StreamingPrecisionRecallThresholdsTest(test.TestCase):
       predictions = constant_op.constant(inputs, dtype=dtypes_lib.float32)
       labels = constant_op.constant(1 - inputs, dtype=dtypes_lib.float32)
       thresholds = [0.5]
-      prec, prec_op = metrics.streaming_precision_at_thresholds(predictions,
-                                                                labels,
-                                                                thresholds)
-      rec, rec_op = metrics.streaming_recall_at_thresholds(predictions, labels,
-                                                           thresholds)
+      prec, prec_op = metrics.streaming_precision_at_thresholds(
+          predictions, labels, thresholds)
+      rec, rec_op = metrics.streaming_recall_at_thresholds(
+          predictions, labels, thresholds)
 
       sess.run(variables.local_variables_initializer())
       sess.run([prec_op, rec_op])
@@ -2779,11 +2726,10 @@ class StreamingPrecisionRecallThresholdsTest(test.TestCase):
           [1, 0, 1, 0], shape=(1, 4), dtype=dtypes_lib.float32)
       labels = constant_op.constant([0, 1, 1, 1], shape=(1, 4))
       thresholds = [-1.0, 2.0]  # lower/higher than any values
-      prec, prec_op = metrics.streaming_precision_at_thresholds(predictions,
-                                                                labels,
-                                                                thresholds)
-      rec, rec_op = metrics.streaming_recall_at_thresholds(predictions, labels,
-                                                           thresholds)
+      prec, prec_op = metrics.streaming_precision_at_thresholds(
+          predictions, labels, thresholds)
+      rec, rec_op = metrics.streaming_recall_at_thresholds(
+          predictions, labels, thresholds)
 
       prec_low = prec[0]
       prec_high = prec[1]
@@ -2803,11 +2749,10 @@ class StreamingPrecisionRecallThresholdsTest(test.TestCase):
       predictions = array_ops.zeros([4], dtype=dtypes_lib.float32)
       labels = array_ops.zeros([4])
       thresholds = [0.5]
-      prec, prec_op = metrics.streaming_precision_at_thresholds(predictions,
-                                                                labels,
-                                                                thresholds)
-      rec, rec_op = metrics.streaming_recall_at_thresholds(predictions, labels,
-                                                           thresholds)
+      prec, prec_op = metrics.streaming_precision_at_thresholds(
+          predictions, labels, thresholds)
+      rec, rec_op = metrics.streaming_recall_at_thresholds(
+          predictions, labels, thresholds)
 
       sess.run(variables.local_variables_initializer())
       sess.run([prec_op, rec_op])
@@ -2872,12 +2817,10 @@ class StreamingPrecisionRecallThresholdsTest(test.TestCase):
       tf_predictions = predictions_queue.dequeue()
       tf_labels = labels_queue.dequeue()
 
-      prec, prec_op = metrics.streaming_precision_at_thresholds(tf_predictions,
-                                                                tf_labels,
-                                                                thresholds)
-      rec, rec_op = metrics.streaming_recall_at_thresholds(tf_predictions,
-                                                           tf_labels,
-                                                           thresholds)
+      prec, prec_op = metrics.streaming_precision_at_thresholds(
+          tf_predictions, tf_labels, thresholds)
+      rec, rec_op = metrics.streaming_recall_at_thresholds(
+          tf_predictions, tf_labels, thresholds)
 
       sess.run(variables.local_variables_initializer())
       for _ in range(int(num_samples / batch_size)):
@@ -2921,8 +2864,7 @@ class StreamingFPRThresholdsTest(test.TestCase):
         labels=array_ops.ones((10, 1)),
         thresholds=[0, 0.5, 1.0],
         updates_collections=[my_collection_name])
-    self.assertListEqual(
-        ops.get_collection(my_collection_name), [update_op])
+    self.assertListEqual(ops.get_collection(my_collection_name), [update_op])
 
   def testValueTensorIsIdempotent(self):
     predictions = random_ops.random_uniform(
@@ -3271,8 +3213,7 @@ class StreamingFNRThresholdsTest(test.TestCase):
         labels=array_ops.ones((10, 1)),
         thresholds=[0, 0.5, 1.0],
         updates_collections=[my_collection_name])
-    self.assertListEqual(
-        ops.get_collection(my_collection_name), [update_op])
+    self.assertListEqual(ops.get_collection(my_collection_name), [update_op])
 
   def testValueTensorIsIdempotent(self):
     predictions = random_ops.random_uniform(
@@ -3492,8 +3433,7 @@ class StreamingRecallAtKTest(test.TestCase):
   def testVars(self):
     metrics.streaming_recall_at_k(
         predictions=array_ops.ones((self._batch_size, self._num_classes)),
-        labels=array_ops.ones(
-            (self._batch_size,), dtype=dtypes_lib.int32),
+        labels=array_ops.ones((self._batch_size,), dtype=dtypes_lib.int32),
         k=1)
     _assert_metric_variables(self,
                              ('recall_at_1/count:0', 'recall_at_1/total:0'))
@@ -3502,8 +3442,7 @@ class StreamingRecallAtKTest(test.TestCase):
     my_collection_name = '__metrics__'
     mean, _ = metrics.streaming_recall_at_k(
         predictions=array_ops.ones((self._batch_size, self._num_classes)),
-        labels=array_ops.ones(
-            (self._batch_size,), dtype=dtypes_lib.int32),
+        labels=array_ops.ones((self._batch_size,), dtype=dtypes_lib.int32),
         k=1,
         metrics_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [mean])
@@ -3512,8 +3451,7 @@ class StreamingRecallAtKTest(test.TestCase):
     my_collection_name = '__updates__'
     _, update_op = metrics.streaming_recall_at_k(
         predictions=array_ops.ones((self._batch_size, self._num_classes)),
-        labels=array_ops.ones(
-            (self._batch_size,), dtype=dtypes_lib.int32),
+        labels=array_ops.ones((self._batch_size,), dtype=dtypes_lib.int32),
         k=1,
         updates_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [update_op])
@@ -3715,9 +3653,17 @@ class StreamingSparsePrecisionTest(test.TestCase):
       # top_k_predictions has rank < 2.
       top_k_predictions = [9, 4, 6, 2, 0]
       sp_labels = sparse_tensor.SparseTensorValue(
-          indices=np.array([[0,], [1,], [2,]], np.int64),
+          indices=np.array([[
+              0,
+          ], [
+              1,
+          ], [
+              2,
+          ]], np.int64),
           values=np.array([2, 7, 8], np.int64),
-          dense_shape=np.array([10,], np.int64))
+          dense_shape=np.array([
+              10,
+          ], np.int64))
 
       with self.assertRaises(ValueError):
         precision, _ = metrics.streaming_sparse_precision_at_top_k(
@@ -3774,8 +3720,9 @@ class StreamingSparsePrecisionTest(test.TestCase):
     # average of the 2 examples.
     labels = np.array([labels_ex1, labels_ex2], dtype=np.int64)
     predictions = (predictions_ex1, predictions_ex2)
-    streaming_precision = [(ex1 + ex2) / 2
-                           for ex1, ex2 in zip(precision_ex1, precision_ex2)]
+    streaming_precision = [
+        (ex1 + ex2) / 2 for ex1, ex2 in zip(precision_ex1, precision_ex2)
+    ]
     streaming_average_precision = [
         (ex1 + ex2) / 2
         for ex1, ex2 in zip(avg_precision_ex1, avg_precision_ex2)
@@ -3835,29 +3782,29 @@ class StreamingSparsePrecisionTest(test.TestCase):
           (predictions_top_k_ex1[:k],), labels, expected=avg_precision_ex1[i])
 
   def test_average_precision_at_top_k_static_shape_check(self):
-    predictions_top_k = array_ops.placeholder(shape=(2, None),
-                                              dtype=dtypes_lib.int64)
+    predictions_top_k = array_ops.placeholder(
+        shape=(2, None), dtype=dtypes_lib.int64)
     labels = np.array(((1,), (2,)), dtype=np.int64)
     # Fails due to non-static predictions_idx shape.
     with self.assertRaises(ValueError):
-      metric_ops.streaming_sparse_average_precision_at_top_k(predictions_top_k,
-                                                             labels)
+      metric_ops.streaming_sparse_average_precision_at_top_k(
+          predictions_top_k, labels)
 
     predictions_top_k = (2, 1)
     # Fails since rank of predictions_idx is less than one.
     with self.assertRaises(ValueError):
-      metric_ops.streaming_sparse_average_precision_at_top_k(predictions_top_k,
-                                                             labels)
+      metric_ops.streaming_sparse_average_precision_at_top_k(
+          predictions_top_k, labels)
     predictions_top_k = ((2,), (1,))
     # Valid static shape.
-    metric_ops.streaming_sparse_average_precision_at_top_k(predictions_top_k,
-                                                           labels)
+    metric_ops.streaming_sparse_average_precision_at_top_k(
+        predictions_top_k, labels)
 
   def test_one_label_at_k1_nan(self):
     predictions = [[0.1, 0.3, 0.2, 0.4], [0.1, 0.2, 0.3, 0.4]]
     top_k_predictions = [[3], [3]]
-    sparse_labels = _binary_2d_label_to_sparse_value(
-        [[0, 0, 0, 1], [0, 0, 1, 0]])
+    sparse_labels = _binary_2d_label_to_sparse_value([[0, 0, 0, 1],
+                                                      [0, 0, 1, 0]])
     dense_labels = np.array([[3], [2]], dtype=np.int64)
 
     for labels in (sparse_labels, dense_labels):
@@ -3871,8 +3818,8 @@ class StreamingSparsePrecisionTest(test.TestCase):
   def test_one_label_at_k1(self):
     predictions = [[0.1, 0.3, 0.2, 0.4], [0.1, 0.2, 0.3, 0.4]]
     top_k_predictions = [[3], [3]]
-    sparse_labels = _binary_2d_label_to_sparse_value(
-        [[0, 0, 0, 1], [0, 0, 1, 0]])
+    sparse_labels = _binary_2d_label_to_sparse_value([[0, 0, 0, 1],
+                                                      [0, 0, 1, 0]])
     dense_labels = np.array([[3], [2]], dtype=np.int64)
 
     for labels in (sparse_labels, dense_labels):
@@ -3971,8 +3918,8 @@ class StreamingSparsePrecisionTest(test.TestCase):
         [5, 7, 2, 9, 6],
     ]
     sp_labels = sparse_tensor.SparseTensorValue(
-        indices=[[0, 0], [0, 1], [0, 2], [0, 3], [1, 0], [1, 1], [1, 2],
-                 [1, 3]],
+        indices=[[0, 0], [0, 1], [0, 2], [0, 3], [1, 0], [1, 1], [1, 2], [1,
+                                                                          3]],
         # values -1 and 10 are outside the [0, n_classes) range and are ignored.
         values=np.array([2, 7, -1, 8, 1, 2, 5, 10], np.int64),
         dense_shape=[2, 4])
@@ -4324,8 +4271,8 @@ class StreamingSparseRecallTest(test.TestCase):
   def test_one_label_at_k1_nan(self):
     predictions = [[0.1, 0.3, 0.2, 0.4], [0.1, 0.2, 0.3, 0.4]]
     top_k_predictions = [[3], [3]]
-    sparse_labels = _binary_2d_label_to_sparse_value(
-        [[0, 0, 0, 1], [0, 0, 1, 0]])
+    sparse_labels = _binary_2d_label_to_sparse_value([[0, 0, 0, 1],
+                                                      [0, 0, 1, 0]])
     dense_labels = np.array([[3], [2]], dtype=np.int64)
 
     # Classes 0,1 have 0 labels, 0 predictions, classes -1 and 4 are out of
@@ -4340,8 +4287,8 @@ class StreamingSparseRecallTest(test.TestCase):
   def test_one_label_at_k1_no_predictions(self):
     predictions = [[0.1, 0.3, 0.2, 0.4], [0.1, 0.2, 0.3, 0.4]]
     top_k_predictions = [[3], [3]]
-    sparse_labels = _binary_2d_label_to_sparse_value(
-        [[0, 0, 0, 1], [0, 0, 1, 0]])
+    sparse_labels = _binary_2d_label_to_sparse_value([[0, 0, 0, 1],
+                                                      [0, 0, 1, 0]])
     dense_labels = np.array([[3], [2]], dtype=np.int64)
 
     for labels in (sparse_labels, dense_labels):
@@ -4354,8 +4301,8 @@ class StreamingSparseRecallTest(test.TestCase):
   def test_one_label_at_k1(self):
     predictions = [[0.1, 0.3, 0.2, 0.4], [0.1, 0.2, 0.3, 0.4]]
     top_k_predictions = [[3], [3]]
-    sparse_labels = _binary_2d_label_to_sparse_value(
-        [[0, 0, 0, 1], [0, 0, 1, 0]])
+    sparse_labels = _binary_2d_label_to_sparse_value([[0, 0, 0, 1],
+                                                      [0, 0, 1, 0]])
     dense_labels = np.array([[3], [2]], dtype=np.int64)
 
     for labels in (sparse_labels, dense_labels):
@@ -4374,8 +4321,8 @@ class StreamingSparseRecallTest(test.TestCase):
   def test_one_label_at_k1_weighted(self):
     predictions = [[0.1, 0.3, 0.2, 0.4], [0.1, 0.2, 0.3, 0.4]]
     top_k_predictions = [[3], [3]]
-    sparse_labels = _binary_2d_label_to_sparse_value(
-        [[0, 0, 0, 1], [0, 0, 1, 0]])
+    sparse_labels = _binary_2d_label_to_sparse_value([[0, 0, 0, 1],
+                                                      [0, 0, 1, 0]])
     dense_labels = np.array([[3], [2]], dtype=np.int64)
 
     for labels in (sparse_labels, dense_labels):
@@ -4647,8 +4594,8 @@ class StreamingSparseRecallTest(test.TestCase):
         [5, 7, 2, 9, 6],
     ]
     sp_labels = sparse_tensor.SparseTensorValue(
-        indices=[[0, 0], [0, 1], [0, 2], [0, 3], [1, 0], [1, 1], [1, 2],
-                 [1, 3]],
+        indices=[[0, 0], [0, 1], [0, 2], [0, 3], [1, 0], [1, 1], [1, 2], [1,
+                                                                          3]],
         # values -1 and 10 are outside the [0, n_classes) range.
         values=np.array([2, 7, -1, 8, 1, 2, 5, 10], np.int64),
         dense_shape=[2, 4])
@@ -4661,10 +4608,7 @@ class StreamingSparseRecallTest(test.TestCase):
         expected=2.0 / 2,
         class_id=2)
     self._test_sparse_recall_at_top_k(
-        sp_labels,
-        top_k_predictions,
-        expected=2.0 / 2,
-        class_id=2)
+        sp_labels, top_k_predictions, expected=2.0 / 2, class_id=2)
 
     # Class 5: 1 label, incorrect.
     self._test_streaming_sparse_recall_at_k(
@@ -4674,10 +4618,7 @@ class StreamingSparseRecallTest(test.TestCase):
         expected=1.0 / 1,
         class_id=5)
     self._test_sparse_recall_at_top_k(
-        sp_labels,
-        top_k_predictions,
-        expected=1.0 / 1,
-        class_id=5)
+        sp_labels, top_k_predictions, expected=1.0 / 1, class_id=5)
 
     # Class 7: 1 label, incorrect.
     self._test_streaming_sparse_recall_at_k(
@@ -4687,10 +4628,7 @@ class StreamingSparseRecallTest(test.TestCase):
         expected=0.0 / 1,
         class_id=7)
     self._test_sparse_recall_at_top_k(
-        sp_labels,
-        top_k_predictions,
-        expected=0.0 / 1,
-        class_id=7)
+        sp_labels, top_k_predictions, expected=0.0 / 1, class_id=7)
 
     # All classes: 8 labels, 3 correct.
     self._test_streaming_sparse_recall_at_k(
@@ -4740,10 +4678,8 @@ class StreamingSparseRecallTest(test.TestCase):
         [9, 4, 6, 2, 0],
     ]]
     sparse_labels = _binary_3d_label_to_sparse_value(
-        [[[0, 0, 1, 0, 0, 0, 0, 1, 1, 0],
-          [0, 1, 1, 0, 0, 1, 0, 0, 0, 0]],
-         [[0, 1, 1, 0, 0, 1, 0, 0, 0, 0],
-          [0, 0, 1, 0, 0, 0, 0, 1, 1, 0]]])
+        [[[0, 0, 1, 0, 0, 0, 0, 1, 1, 0], [0, 1, 1, 0, 0, 1, 0, 0, 0, 0]],
+         [[0, 1, 1, 0, 0, 1, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 1, 1, 0]]])
     dense_labels = np.array(
         [[[2, 7, 8], [1, 2, 5]], [
             [1, 2, 5],
@@ -4771,10 +4707,8 @@ class StreamingSparseRecallTest(test.TestCase):
         [9, 4, 6, 2, 0],
     ]]
     labels = _binary_3d_label_to_sparse_value(
-        [[[0, 0, 1, 0, 0, 0, 0, 1, 1, 0],
-          [0, 1, 1, 0, 0, 1, 0, 0, 0, 0]],
-         [[0, 1, 1, 0, 0, 1, 0, 1, 0, 0],
-          [0, 0, 1, 0, 0, 0, 0, 0, 1, 0]]])
+        [[[0, 0, 1, 0, 0, 0, 0, 1, 1, 0], [0, 1, 1, 0, 0, 1, 0, 0, 0, 0]],
+         [[0, 1, 1, 0, 0, 1, 0, 1, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 1, 0]]])
 
     # Class 2: 4 labels, all correct.
     self._test_streaming_sparse_recall_at_k(
@@ -4813,10 +4747,8 @@ class StreamingSparseRecallTest(test.TestCase):
         [9, 4, 6, 2, 0],
     ]]
     labels = _binary_3d_label_to_sparse_value(
-        [[[0, 0, 1, 0, 0, 0, 0, 1, 1, 0],
-          [0, 1, 1, 0, 0, 1, 0, 0, 0, 0]],
-         [[0, 1, 1, 0, 0, 1, 0, 1, 0, 0],
-          [0, 0, 1, 0, 0, 0, 0, 0, 1, 0]]])
+        [[[0, 0, 1, 0, 0, 0, 0, 1, 1, 0], [0, 1, 1, 0, 0, 1, 0, 0, 0, 0]],
+         [[0, 1, 1, 0, 0, 1, 0, 1, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 1, 0]]])
 
     for class_id in xrange(10):
       self._test_streaming_sparse_recall_at_k(
@@ -4867,10 +4799,8 @@ class StreamingSparseRecallTest(test.TestCase):
         [9, 4, 6, 2, 0],
     ]]
     labels = _binary_3d_label_to_sparse_value(
-        [[[0, 0, 1, 0, 0, 0, 0, 1, 1, 0],
-          [0, 1, 1, 0, 0, 1, 0, 0, 0, 0]],
-         [[0, 1, 1, 0, 0, 1, 0, 1, 0, 0],
-          [0, 0, 1, 0, 0, 0, 0, 0, 1, 0]]])
+        [[[0, 0, 1, 0, 0, 0, 0, 1, 1, 0], [0, 1, 1, 0, 0, 1, 0, 0, 0, 0]],
+         [[0, 1, 1, 0, 0, 1, 0, 1, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 1, 0]]])
 
     # Class 2: 2 labels, both correct.
     self._test_streaming_sparse_recall_at_k(
@@ -4963,10 +4893,8 @@ class StreamingSparseRecallTest(test.TestCase):
         weights=[[0, 1], [0, 1]])
 
   def test_sparse_tensor_value(self):
-    predictions = [[0.1, 0.3, 0.2, 0.4],
-                   [0.1, 0.2, 0.3, 0.4]]
-    labels = [[0, 0, 1, 0],
-              [0, 0, 0, 1]]
+    predictions = [[0.1, 0.3, 0.2, 0.4], [0.1, 0.2, 0.3, 0.4]]
+    labels = [[0, 0, 1, 0], [0, 0, 0, 1]]
     expected_recall = 0.5
     with self.test_session():
       _, recall = metrics.streaming_sparse_recall_at_k(
@@ -5009,8 +4937,8 @@ class StreamingMeanAbsoluteErrorTest(test.TestCase):
   def testValueTensorIsIdempotent(self):
     predictions = random_ops.random_normal((10, 3), seed=1)
     labels = random_ops.random_normal((10, 3), seed=2)
-    error, update_op = metrics.streaming_mean_absolute_error(predictions,
-                                                             labels)
+    error, update_op = metrics.streaming_mean_absolute_error(
+        predictions, labels)
 
     with self.test_session() as sess:
       sess.run(variables.local_variables_initializer())
@@ -5031,8 +4959,8 @@ class StreamingMeanAbsoluteErrorTest(test.TestCase):
         [1, 3, 2, 3], shape=(1, 4), dtype=dtypes_lib.float32)
     weights = constant_op.constant([0, 1, 0, 1], shape=(1, 4))
 
-    error, update_op = metrics.streaming_mean_absolute_error(predictions,
-                                                             labels, weights)
+    error, update_op = metrics.streaming_mean_absolute_error(
+        predictions, labels, weights)
 
     with self.test_session() as sess:
       sess.run(variables.local_variables_initializer())
@@ -5075,8 +5003,8 @@ class StreamingMeanRelativeErrorTest(test.TestCase):
     predictions = random_ops.random_normal((10, 3), seed=1)
     labels = random_ops.random_normal((10, 3), seed=2)
     normalizer = random_ops.random_normal((10, 3), seed=3)
-    error, update_op = metrics.streaming_mean_relative_error(predictions,
-                                                             labels, normalizer)
+    error, update_op = metrics.streaming_mean_relative_error(
+        predictions, labels, normalizer)
 
     with self.test_session() as sess:
       sess.run(variables.local_variables_initializer())
@@ -5200,8 +5128,8 @@ class StreamingMeanSquaredErrorTest(test.TestCase):
         [1, 3, 2, 3], shape=(1, 4), dtype=dtypes_lib.float32)
     weights = constant_op.constant([0, 1, 0, 1], shape=(1, 4))
 
-    error, update_op = metrics.streaming_mean_squared_error(predictions, labels,
-                                                            weights)
+    error, update_op = metrics.streaming_mean_squared_error(
+        predictions, labels, weights)
 
     with self.test_session() as sess:
       sess.run(variables.local_variables_initializer())
@@ -5224,8 +5152,8 @@ class StreamingMeanSquaredErrorTest(test.TestCase):
       _enqueue_vector(sess, labels_queue, [2, 4, 6])
       labels = labels_queue.dequeue()
 
-      error, update_op = metrics.streaming_mean_squared_error(predictions,
-                                                              labels)
+      error, update_op = metrics.streaming_mean_squared_error(
+          predictions, labels)
 
       sess.run(variables.local_variables_initializer())
       sess.run(update_op)
@@ -5292,10 +5220,10 @@ class StreamingMeanSquaredErrorTest(test.TestCase):
       _enqueue_vector(sess, labels_queue, [2, 4, 6])
       labels = labels_queue.dequeue()
 
-      mae, ma_update_op = metrics.streaming_mean_absolute_error(predictions,
-                                                                labels)
-      mse, ms_update_op = metrics.streaming_mean_squared_error(predictions,
-                                                               labels)
+      mae, ma_update_op = metrics.streaming_mean_absolute_error(
+          predictions, labels)
+      mse, ms_update_op = metrics.streaming_mean_squared_error(
+          predictions, labels)
 
       sess.run(variables.local_variables_initializer())
       sess.run([ma_update_op, ms_update_op])
@@ -5336,8 +5264,8 @@ class StreamingRootMeanSquaredErrorTest(test.TestCase):
   def testValueTensorIsIdempotent(self):
     predictions = random_ops.random_normal((10, 3), seed=1)
     labels = random_ops.random_normal((10, 3), seed=2)
-    error, update_op = metrics.streaming_root_mean_squared_error(predictions,
-                                                                 labels)
+    error, update_op = metrics.streaming_root_mean_squared_error(
+        predictions, labels)
 
     with self.test_session() as sess:
       sess.run(variables.local_variables_initializer())
@@ -5357,8 +5285,8 @@ class StreamingRootMeanSquaredErrorTest(test.TestCase):
           0.0, shape=(1, 3), dtype=dtypes_lib.float32)
       labels = constant_op.constant(0.0, shape=(1, 3), dtype=dtypes_lib.float32)
 
-      rmse, update_op = metrics.streaming_root_mean_squared_error(predictions,
-                                                                  labels)
+      rmse, update_op = metrics.streaming_root_mean_squared_error(
+          predictions, labels)
 
       sess.run(variables.local_variables_initializer())
       self.assertEqual(0, sess.run(update_op))
@@ -5372,8 +5300,8 @@ class StreamingRootMeanSquaredErrorTest(test.TestCase):
       labels = constant_op.constant(
           [1, 3, 2], shape=(1, 3), dtype=dtypes_lib.float32)
 
-      rmse, update_op = metrics.streaming_root_mean_squared_error(predictions,
-                                                                  labels)
+      rmse, update_op = metrics.streaming_root_mean_squared_error(
+          predictions, labels)
 
       sess.run(variables.local_variables_initializer())
       self.assertAlmostEqual(math.sqrt(6), update_op.eval(), 5)
@@ -5387,9 +5315,8 @@ class StreamingRootMeanSquaredErrorTest(test.TestCase):
           [1, 3, 2, 3], shape=(1, 4), dtype=dtypes_lib.float32)
       weights = constant_op.constant([0, 1, 0, 1], shape=(1, 4))
 
-      rmse, update_op = metrics.streaming_root_mean_squared_error(predictions,
-                                                                  labels,
-                                                                  weights)
+      rmse, update_op = metrics.streaming_root_mean_squared_error(
+          predictions, labels, weights)
 
       sess.run(variables.local_variables_initializer())
       self.assertAlmostEqual(math.sqrt(13), sess.run(update_op))
@@ -5404,8 +5331,8 @@ class StreamingCovarianceTest(test.TestCase):
 
   def testVars(self):
     metrics.streaming_covariance(
-        predictions=math_ops.to_float(math_ops.range(10)) + array_ops.ones(
-            [10, 10]),
+        predictions=math_ops.to_float(math_ops.range(10)) +
+        array_ops.ones([10, 10]),
         labels=math_ops.to_float(math_ops.range(10)) + array_ops.ones([10, 10]))
     _assert_metric_variables(self, (
         'covariance/comoment:0',
@@ -5417,8 +5344,8 @@ class StreamingCovarianceTest(test.TestCase):
   def testMetricsCollection(self):
     my_collection_name = '__metrics__'
     cov, _ = metrics.streaming_covariance(
-        predictions=math_ops.to_float(math_ops.range(10)) + array_ops.ones(
-            [10, 10]),
+        predictions=math_ops.to_float(math_ops.range(10)) +
+        array_ops.ones([10, 10]),
         labels=math_ops.to_float(math_ops.range(10)) + array_ops.ones([10, 10]),
         metrics_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [cov])
@@ -5426,8 +5353,8 @@ class StreamingCovarianceTest(test.TestCase):
   def testUpdatesCollection(self):
     my_collection_name = '__updates__'
     _, update_op = metrics.streaming_covariance(
-        predictions=math_ops.to_float(math_ops.range(10)) + array_ops.ones(
-            [10, 10]),
+        predictions=math_ops.to_float(math_ops.range(10)) +
+        array_ops.ones([10, 10]),
         labels=math_ops.to_float(math_ops.range(10)) + array_ops.ones([10, 10]),
         updates_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [update_op])
@@ -5487,9 +5414,8 @@ class StreamingCovarianceTest(test.TestCase):
       cov, update_op = metrics.streaming_covariance(
           predictions, labels, weights=weights)
 
-      expected_cov = np.cov([2, 4, 6, 8],
-                            [1, 3, 2, 7],
-                            fweights=[0, 1, 3, 1])[0, 1]
+      expected_cov = np.cov(
+          [2, 4, 6, 8], [1, 3, 2, 7], fweights=[0, 1, 3, 1])[0, 1]
       sess.run(variables.local_variables_initializer())
       self.assertAlmostEqual(expected_cov, sess.run(update_op))
       self.assertAlmostEqual(expected_cov, cov.eval())
@@ -5514,17 +5440,18 @@ class StreamingCovarianceTest(test.TestCase):
             predictions_t: predictions[stride * i:stride * (i + 1)],
             labels_t: labels[stride * i:stride * (i + 1)]
         }
-        self.assertEqual(np.isnan(prev_expected_cov),
-                         np.isnan(sess.run(cov, feed_dict=feed_dict)))
+        self.assertEqual(
+            np.isnan(prev_expected_cov),
+            np.isnan(sess.run(cov, feed_dict=feed_dict)))
         if not np.isnan(prev_expected_cov):
-          self.assertAlmostEqual(
-              prev_expected_cov, sess.run(cov, feed_dict=feed_dict), 5)
+          self.assertAlmostEqual(prev_expected_cov,
+                                 sess.run(cov, feed_dict=feed_dict), 5)
         expected_cov = np.cov(predictions[:stride * (i + 1)],
                               labels[:stride * (i + 1)])[0, 1]
-        self.assertAlmostEqual(
-            expected_cov, sess.run(update_op, feed_dict=feed_dict), 5)
-        self.assertAlmostEqual(
-            expected_cov, sess.run(cov, feed_dict=feed_dict), 5)
+        self.assertAlmostEqual(expected_cov,
+                               sess.run(update_op, feed_dict=feed_dict), 5)
+        self.assertAlmostEqual(expected_cov, sess.run(cov, feed_dict=feed_dict),
+                               5)
         prev_expected_cov = expected_cov
 
   def testMultiUpdateWithErrorAndWeights(self):
@@ -5552,18 +5479,20 @@ class StreamingCovarianceTest(test.TestCase):
             labels_t: labels[stride * i:stride * (i + 1)],
             weights_t: weights[stride * i:stride * (i + 1)]
         }
-        self.assertEqual(np.isnan(prev_expected_cov),
-                         np.isnan(sess.run(cov, feed_dict=feed_dict)))
+        self.assertEqual(
+            np.isnan(prev_expected_cov),
+            np.isnan(sess.run(cov, feed_dict=feed_dict)))
         if not np.isnan(prev_expected_cov):
-          self.assertAlmostEqual(
-              prev_expected_cov, sess.run(cov, feed_dict=feed_dict), 5)
-        expected_cov = np.cov(predictions[:stride * (i + 1)],
-                              labels[:stride * (i + 1)],
-                              fweights=weights[:stride * (i + 1)])[0, 1]
-        self.assertAlmostEqual(
-            expected_cov, sess.run(update_op, feed_dict=feed_dict), 5)
-        self.assertAlmostEqual(
-            expected_cov, sess.run(cov, feed_dict=feed_dict), 5)
+          self.assertAlmostEqual(prev_expected_cov,
+                                 sess.run(cov, feed_dict=feed_dict), 5)
+        expected_cov = np.cov(
+            predictions[:stride * (i + 1)],
+            labels[:stride * (i + 1)],
+            fweights=weights[:stride * (i + 1)])[0, 1]
+        self.assertAlmostEqual(expected_cov,
+                               sess.run(update_op, feed_dict=feed_dict), 5)
+        self.assertAlmostEqual(expected_cov, sess.run(cov, feed_dict=feed_dict),
+                               5)
         prev_expected_cov = expected_cov
 
 
@@ -5574,8 +5503,8 @@ class StreamingPearsonRTest(test.TestCase):
 
   def testVars(self):
     metrics.streaming_pearson_correlation(
-        predictions=math_ops.to_float(math_ops.range(10)) + array_ops.ones(
-            [10, 10]),
+        predictions=math_ops.to_float(math_ops.range(10)) +
+        array_ops.ones([10, 10]),
         labels=math_ops.to_float(math_ops.range(10)) + array_ops.ones([10, 10]))
     _assert_metric_variables(self, (
         'pearson_r/covariance/comoment:0',
@@ -5595,8 +5524,8 @@ class StreamingPearsonRTest(test.TestCase):
   def testMetricsCollection(self):
     my_collection_name = '__metrics__'
     pearson_r, _ = metrics.streaming_pearson_correlation(
-        predictions=math_ops.to_float(math_ops.range(10)) + array_ops.ones(
-            [10, 10]),
+        predictions=math_ops.to_float(math_ops.range(10)) +
+        array_ops.ones([10, 10]),
         labels=math_ops.to_float(math_ops.range(10)) + array_ops.ones([10, 10]),
         metrics_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [pearson_r])
@@ -5604,8 +5533,8 @@ class StreamingPearsonRTest(test.TestCase):
   def testUpdatesCollection(self):
     my_collection_name = '__updates__'
     _, update_op = metrics.streaming_pearson_correlation(
-        predictions=math_ops.to_float(math_ops.range(10)) + array_ops.ones(
-            [10, 10]),
+        predictions=math_ops.to_float(math_ops.range(10)) +
+        array_ops.ones([10, 10]),
         labels=math_ops.to_float(math_ops.range(10)) + array_ops.ones([10, 10]),
         updates_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [update_op])
@@ -5613,8 +5542,8 @@ class StreamingPearsonRTest(test.TestCase):
   def testValueTensorIsIdempotent(self):
     labels = random_ops.random_normal((10, 3), seed=2)
     predictions = labels * 0.5 + random_ops.random_normal((10, 3), seed=1) * 0.5
-    pearson_r, update_op = metrics.streaming_pearson_correlation(predictions,
-                                                                 labels)
+    pearson_r, update_op = metrics.streaming_pearson_correlation(
+        predictions, labels)
 
     with self.test_session() as sess:
       sess.run(variables.local_variables_initializer())
@@ -5633,8 +5562,8 @@ class StreamingPearsonRTest(test.TestCase):
       predictions = math_ops.to_float(math_ops.range(10))
       labels = math_ops.to_float(math_ops.range(10))
 
-      pearson_r, update_op = metrics.streaming_pearson_correlation(predictions,
-                                                                   labels)
+      pearson_r, update_op = metrics.streaming_pearson_correlation(
+          predictions, labels)
 
       expected_r = np.corrcoef(np.arange(10), np.arange(10))[0, 1]
       sess.run(variables.local_variables_initializer())
@@ -5648,8 +5577,8 @@ class StreamingPearsonRTest(test.TestCase):
       labels = constant_op.constant(
           [1, 3, 2], shape=(1, 3), dtype=dtypes_lib.float32)
 
-      pearson_r, update_op = metrics.streaming_pearson_correlation(predictions,
-                                                                   labels)
+      pearson_r, update_op = metrics.streaming_pearson_correlation(
+          predictions, labels)
 
       expected_r = np.corrcoef([2, 4, 6], [1, 3, 2])[0, 1]
       sess.run(variables.local_variables_initializer())
@@ -5698,17 +5627,18 @@ class StreamingPearsonRTest(test.TestCase):
             predictions_t: predictions[stride * i:stride * (i + 1)],
             labels_t: labels[stride * i:stride * (i + 1)]
         }
-        self.assertEqual(np.isnan(prev_expected_r),
-                         np.isnan(sess.run(pearson_r, feed_dict=feed_dict)))
+        self.assertEqual(
+            np.isnan(prev_expected_r),
+            np.isnan(sess.run(pearson_r, feed_dict=feed_dict)))
         if not np.isnan(prev_expected_r):
-          self.assertAlmostEqual(
-              prev_expected_r, sess.run(pearson_r, feed_dict=feed_dict), 5)
+          self.assertAlmostEqual(prev_expected_r,
+                                 sess.run(pearson_r, feed_dict=feed_dict), 5)
         expected_r = np.corrcoef(predictions[:stride * (i + 1)],
                                  labels[:stride * (i + 1)])[0, 1]
-        self.assertAlmostEqual(
-            expected_r, sess.run(update_op, feed_dict=feed_dict), 5)
-        self.assertAlmostEqual(
-            expected_r, sess.run(pearson_r, feed_dict=feed_dict), 5)
+        self.assertAlmostEqual(expected_r,
+                               sess.run(update_op, feed_dict=feed_dict), 5)
+        self.assertAlmostEqual(expected_r,
+                               sess.run(pearson_r, feed_dict=feed_dict), 5)
         prev_expected_r = expected_r
 
   def testMultiUpdateWithErrorAndWeights(self):
@@ -5736,19 +5666,21 @@ class StreamingPearsonRTest(test.TestCase):
             labels_t: labels[stride * i:stride * (i + 1)],
             weights_t: weights[stride * i:stride * (i + 1)]
         }
-        self.assertEqual(np.isnan(prev_expected_r),
-                         np.isnan(sess.run(pearson_r, feed_dict=feed_dict)))
+        self.assertEqual(
+            np.isnan(prev_expected_r),
+            np.isnan(sess.run(pearson_r, feed_dict=feed_dict)))
         if not np.isnan(prev_expected_r):
-          self.assertAlmostEqual(
-              prev_expected_r, sess.run(pearson_r, feed_dict=feed_dict), 5)
-        cmat = np.cov(predictions[:stride * (i + 1)],
-                      labels[:stride * (i + 1)],
-                      fweights=weights[:stride * (i + 1)])
+          self.assertAlmostEqual(prev_expected_r,
+                                 sess.run(pearson_r, feed_dict=feed_dict), 5)
+        cmat = np.cov(
+            predictions[:stride * (i + 1)],
+            labels[:stride * (i + 1)],
+            fweights=weights[:stride * (i + 1)])
         expected_r = cmat[0, 1] / np.sqrt(cmat[0, 0] * cmat[1, 1])
-        self.assertAlmostEqual(
-            expected_r, sess.run(update_op, feed_dict=feed_dict), 5)
-        self.assertAlmostEqual(
-            expected_r, sess.run(pearson_r, feed_dict=feed_dict), 5)
+        self.assertAlmostEqual(expected_r,
+                               sess.run(update_op, feed_dict=feed_dict), 5)
+        self.assertAlmostEqual(expected_r,
+                               sess.run(pearson_r, feed_dict=feed_dict), 5)
         prev_expected_r = expected_r
 
   def testMultiUpdateWithErrorAndSingletonBatches(self):
@@ -5758,7 +5690,7 @@ class StreamingPearsonRTest(test.TestCase):
       predictions = np.random.randn(n)
       labels = 0.5 * predictions + np.random.randn(n)
       stride = 10
-      weights = (np.arange(n).reshape(n//stride, stride) % stride == 0)
+      weights = (np.arange(n).reshape(n // stride, stride) % stride == 0)
       for row in weights:
         np.random.shuffle(row)
       # Now, weights is one-hot by row - one item per batch has non-zero weight.
@@ -5778,19 +5710,20 @@ class StreamingPearsonRTest(test.TestCase):
             labels_t: labels[stride * i:stride * (i + 1)],
             weights_t: weights[stride * i:stride * (i + 1)]
         }
-        cmat = np.cov(predictions[:stride * (i + 1)],
-                      labels[:stride * (i + 1)],
-                      fweights=weights[:stride * (i + 1)])
+        cmat = np.cov(
+            predictions[:stride * (i + 1)],
+            labels[:stride * (i + 1)],
+            fweights=weights[:stride * (i + 1)])
         expected_r = cmat[0, 1] / np.sqrt(cmat[0, 0] * cmat[1, 1])
         actual_r = sess.run(update_op, feed_dict=feed_dict)
         self.assertEqual(np.isnan(expected_r), np.isnan(actual_r))
-        self.assertEqual(np.isnan(expected_r),
-                         np.isnan(sess.run(pearson_r, feed_dict=feed_dict)))
+        self.assertEqual(
+            np.isnan(expected_r),
+            np.isnan(sess.run(pearson_r, feed_dict=feed_dict)))
         if not np.isnan(expected_r):
-          self.assertAlmostEqual(
-              expected_r, actual_r, 5)
-          self.assertAlmostEqual(
-              expected_r, sess.run(pearson_r, feed_dict=feed_dict), 5)
+          self.assertAlmostEqual(expected_r, actual_r, 5)
+          self.assertAlmostEqual(expected_r,
+                                 sess.run(pearson_r, feed_dict=feed_dict), 5)
 
 
 class StreamingMeanCosineDistanceTest(test.TestCase):
@@ -6191,20 +6124,14 @@ class StreamingMeanIOUTest(test.TestCase):
       self.assertAlmostEqual(desired_output, miou.eval())
 
   def testUpdateOpEvalIsAccumulatedConfusionMatrix(self):
-    predictions = array_ops.concat(
-        [
-            constant_op.constant(
-                0, shape=[5]), constant_op.constant(
-                    1, shape=[5])
-        ],
-        0)
-    labels = array_ops.concat(
-        [
-            constant_op.constant(
-                0, shape=[3]), constant_op.constant(
-                    1, shape=[7])
-        ],
-        0)
+    predictions = array_ops.concat([
+        constant_op.constant(0, shape=[5]),
+        constant_op.constant(1, shape=[5])
+    ], 0)
+    labels = array_ops.concat([
+        constant_op.constant(0, shape=[3]),
+        constant_op.constant(1, shape=[7])
+    ], 0)
     num_classes = 2
     with self.test_session() as sess:
       miou, update_op = metrics.streaming_mean_iou(predictions, labels,
@@ -6238,29 +6165,20 @@ class StreamingMeanIOUTest(test.TestCase):
       self.assertEqual(0., miou.eval())
 
   def testResultsWithSomeMissing(self):
-    predictions = array_ops.concat(
-        [
-            constant_op.constant(
-                0, shape=[5]), constant_op.constant(
-                    1, shape=[5])
-        ],
-        0)
-    labels = array_ops.concat(
-        [
-            constant_op.constant(
-                0, shape=[3]), constant_op.constant(
-                    1, shape=[7])
-        ],
-        0)
+    predictions = array_ops.concat([
+        constant_op.constant(0, shape=[5]),
+        constant_op.constant(1, shape=[5])
+    ], 0)
+    labels = array_ops.concat([
+        constant_op.constant(0, shape=[3]),
+        constant_op.constant(1, shape=[7])
+    ], 0)
     num_classes = 2
-    weights = array_ops.concat(
-        [
-            constant_op.constant(
-                0, shape=[1]), constant_op.constant(
-                    1, shape=[8]), constant_op.constant(
-                        0, shape=[1])
-        ],
-        0)
+    weights = array_ops.concat([
+        constant_op.constant(0, shape=[1]),
+        constant_op.constant(1, shape=[8]),
+        constant_op.constant(0, shape=[1])
+    ], 0)
     with self.test_session() as sess:
       miou, update_op = metrics.streaming_mean_iou(
           predictions, labels, num_classes, weights=weights)
@@ -6270,56 +6188,45 @@ class StreamingMeanIOUTest(test.TestCase):
       self.assertAlmostEqual(desired_miou, miou.eval())
 
   def testMissingClassInLabels(self):
-    labels = constant_op.constant([
-      [[0, 0, 1, 1, 0, 0],
-       [1, 0, 0, 0, 0, 1]],
-      [[1, 1, 1, 1, 1, 1],
-       [0, 0, 0, 0, 0, 0]]])
-    predictions = constant_op.constant([
-      [[0, 0, 2, 1, 1, 0],
-       [0, 1, 2, 2, 0, 1]],
-      [[0, 0, 2, 1, 1, 1],
-       [1, 1, 2, 0, 0, 0]]])
+    labels = constant_op.constant([[[0, 0, 1, 1, 0, 0], [1, 0, 0, 0, 0, 1]],
+                                   [[1, 1, 1, 1, 1, 1], [0, 0, 0, 0, 0, 0]]])
+    predictions = constant_op.constant(
+        [[[0, 0, 2, 1, 1, 0], [0, 1, 2, 2, 0, 1]], [[0, 0, 2, 1, 1, 1],
+                                                    [1, 1, 2, 0, 0, 0]]])
     num_classes = 3
     with self.test_session() as sess:
-      miou, update_op = metrics.streaming_mean_iou(
-          predictions, labels, num_classes)
+      miou, update_op = metrics.streaming_mean_iou(predictions, labels,
+                                                   num_classes)
       sess.run(variables.local_variables_initializer())
       self.assertAllEqual([[7, 4, 3], [3, 5, 2], [0, 0, 0]], update_op.eval())
-      self.assertAlmostEqual(
-        1 / 3 * (7 / (7 + 3 + 7) + 5 / (5 + 4 + 5) + 0 / (0 + 5 + 0)),
-        miou.eval())
+      self.assertAlmostEqual(1 / 3 * (7 / (7 + 3 + 7) + 5 / (5 + 4 + 5) + 0 /
+                                      (0 + 5 + 0)), miou.eval())
 
   def testMissingClassOverallSmall(self):
     labels = constant_op.constant([0])
     predictions = constant_op.constant([0])
     num_classes = 2
     with self.test_session() as sess:
-      miou, update_op = metrics.streaming_mean_iou(
-          predictions, labels, num_classes)
+      miou, update_op = metrics.streaming_mean_iou(predictions, labels,
+                                                   num_classes)
       sess.run(variables.local_variables_initializer())
       self.assertAllEqual([[1, 0], [0, 0]], update_op.eval())
       self.assertAlmostEqual(1, miou.eval())
 
   def testMissingClassOverallLarge(self):
-    labels = constant_op.constant([
-      [[0, 0, 1, 1, 0, 0],
-       [1, 0, 0, 0, 0, 1]],
-      [[1, 1, 1, 1, 1, 1],
-       [0, 0, 0, 0, 0, 0]]])
-    predictions = constant_op.constant([
-      [[0, 0, 1, 1, 0, 0],
-       [1, 1, 0, 0, 1, 1]],
-      [[0, 0, 0, 1, 1, 1],
-       [1, 1, 1, 0, 0, 0]]])
+    labels = constant_op.constant([[[0, 0, 1, 1, 0, 0], [1, 0, 0, 0, 0, 1]],
+                                   [[1, 1, 1, 1, 1, 1], [0, 0, 0, 0, 0, 0]]])
+    predictions = constant_op.constant(
+        [[[0, 0, 1, 1, 0, 0], [1, 1, 0, 0, 1, 1]], [[0, 0, 0, 1, 1, 1],
+                                                    [1, 1, 1, 0, 0, 0]]])
     num_classes = 3
     with self.test_session() as sess:
-      miou, update_op = metrics.streaming_mean_iou(
-          predictions, labels, num_classes)
+      miou, update_op = metrics.streaming_mean_iou(predictions, labels,
+                                                   num_classes)
       sess.run(variables.local_variables_initializer())
       self.assertAllEqual([[9, 5, 0], [3, 7, 0], [0, 0, 0]], update_op.eval())
-      self.assertAlmostEqual(
-        1 / 2 * (9 / (9 + 3 + 5) + 7 / (7 + 5 + 3)), miou.eval())
+      self.assertAlmostEqual(1 / 2 * (9 / (9 + 3 + 5) + 7 / (7 + 5 + 3)),
+                             miou.eval())
 
 
 class StreamingConcatTest(test.TestCase):
@@ -6683,7 +6590,8 @@ class CohenKappaTest(test.TestCase):
     _assert_metric_variables(self, (
         'cohen_kappa/po:0',
         'cohen_kappa/pe_row:0',
-        'cohen_kappa/pe_col:0',))
+        'cohen_kappa/pe_col:0',
+    ))
 
   def testMetricsCollection(self):
     my_collection_name = '__metrics__'
@@ -6705,9 +6613,9 @@ class CohenKappaTest(test.TestCase):
 
   def testValueTensorIsIdempotent(self):
     predictions = random_ops.random_uniform(
-      (10, 1), maxval=3, dtype=dtypes_lib.int64, seed=1)
+        (10, 1), maxval=3, dtype=dtypes_lib.int64, seed=1)
     labels = random_ops.random_uniform(
-      (10, 1), maxval=3, dtype=dtypes_lib.int64, seed=2)
+        (10, 1), maxval=3, dtype=dtypes_lib.int64, seed=2)
     kappa, update_op = metrics.cohen_kappa(labels, predictions, 3)
 
     with self.test_session() as sess:
@@ -6723,10 +6631,7 @@ class CohenKappaTest(test.TestCase):
         self.assertAlmostEqual(initial_kappa, kappa.eval(), 5)
 
   def testBasic(self):
-    confusion_matrix = np.array([
-      [9, 3, 1],
-      [4, 8, 2],
-      [2, 1, 6]])
+    confusion_matrix = np.array([[9, 3, 1], [4, 8, 2], [2, 1, 6]])
     # overall total = 36
     # po = [9, 8, 6], sum(po) = 23
     # pe_row = [15, 12, 9], pe_col = [13, 14, 9], so pe = [5.42, 4.67, 2.25]
@@ -6738,8 +6643,10 @@ class CohenKappaTest(test.TestCase):
     labels, predictions = self._confusion_matrix_to_samples(confusion_matrix)
 
     dtypes = [dtypes_lib.int16, dtypes_lib.int32, dtypes_lib.int64]
-    shapes = [(len(labels,)),  # 1-dim
-              (len(labels), 1)]  # 2-dim
+    shapes = [
+        (len(labels,)),  # 1-dim
+        (len(labels), 1)
+    ]  # 2-dim
     weights = [None, np.ones_like(labels)]
 
     for dtype in dtypes:
@@ -6795,10 +6702,7 @@ class CohenKappaTest(test.TestCase):
       self.assertAlmostEqual(expect, kappa.eval(), 5)
 
   def testWeighted(self):
-    confusion_matrix = np.array([
-      [9, 3, 1],
-      [4, 8, 2],
-      [2, 1, 6]])
+    confusion_matrix = np.array([[9, 3, 1], [4, 8, 2], [2, 1, 6]])
     labels, predictions = self._confusion_matrix_to_samples(confusion_matrix)
     num_samples = np.sum(confusion_matrix, dtype=np.int32)
     weights = (np.arange(0, num_samples) % 5) / 5.0
@@ -6809,31 +6713,26 @@ class CohenKappaTest(test.TestCase):
     with self.test_session() as sess:
       predictions = constant_op.constant(predictions, dtype=dtypes_lib.float32)
       labels = constant_op.constant(labels)
-      kappa, update_op = metrics.cohen_kappa(labels, predictions, 4,
-                                             weights=weights)
+      kappa, update_op = metrics.cohen_kappa(
+          labels, predictions, 4, weights=weights)
 
       sess.run(variables.local_variables_initializer())
       self.assertAlmostEqual(expect, sess.run(update_op), 5)
       self.assertAlmostEqual(expect, kappa.eval(), 5)
 
   def testWithMultipleUpdates(self):
-    confusion_matrix = np.array([
-      [90, 30, 10, 20],
-      [40, 80, 20, 30],
-      [20, 10, 60, 35],
-      [15, 25, 30, 25]])
+    confusion_matrix = np.array([[90, 30, 10, 20], [40, 80, 20, 30],
+                                 [20, 10, 60, 35], [15, 25, 30, 25]])
     labels, predictions = self._confusion_matrix_to_samples(confusion_matrix)
     num_samples = np.sum(confusion_matrix, dtype=np.int32)
     weights = (np.arange(0, num_samples) % 5) / 5.0
     num_classes = confusion_matrix.shape[0]
 
     batch_size = num_samples // 10
-    predictions_t = array_ops.placeholder(dtypes_lib.float32,
-                                          shape=(batch_size,))
-    labels_t = array_ops.placeholder(dtypes_lib.int32,
-                                     shape=(batch_size,))
-    weights_t = array_ops.placeholder(dtypes_lib.float32,
-                                      shape=(batch_size,))
+    predictions_t = array_ops.placeholder(
+        dtypes_lib.float32, shape=(batch_size,))
+    labels_t = array_ops.placeholder(dtypes_lib.int32, shape=(batch_size,))
+    weights_t = array_ops.placeholder(dtypes_lib.float32, shape=(batch_size,))
     kappa, update_op = metrics.cohen_kappa(
         labels_t, predictions_t, num_classes, weights=weights_t)
     with self.test_session() as sess:
@@ -6841,10 +6740,13 @@ class CohenKappaTest(test.TestCase):
 
       for idx in range(0, num_samples, batch_size):
         batch_start, batch_end = idx, idx + batch_size
-        sess.run(update_op,
-                 feed_dict={labels_t: labels[batch_start:batch_end],
-                            predictions_t: predictions[batch_start:batch_end],
-                            weights_t: weights[batch_start:batch_end]})
+        sess.run(
+            update_op,
+            feed_dict={
+                labels_t: labels[batch_start:batch_end],
+                predictions_t: predictions[batch_start:batch_end],
+                weights_t: weights[batch_start:batch_end]
+            })
       # Calculated by v0.19: sklearn.metrics.cohen_kappa_score(
       #                          labels_np, predictions_np, sample_weight=weights_np)
       expect = 0.289965397924
@@ -6862,7 +6764,8 @@ class CohenKappaTest(test.TestCase):
     with self.assertRaises(ValueError):
       metrics.cohen_kappa(invalid_labels, predictions, 3)
 
-    invalid_predictions = array_ops.placeholder(dtypes_lib.float32, shape=(4, 2))
+    invalid_predictions = array_ops.placeholder(
+        dtypes_lib.float32, shape=(4, 2))
     labels = array_ops.placeholder(dtypes_lib.int32, shape=(4, 1))
     with self.assertRaises(ValueError):
       metrics.cohen_kappa(labels, invalid_predictions, 3)
diff --git a/tensorflow/contrib/model_pruning/examples/cifar10/cifar10_input.py b/tensorflow/contrib/model_pruning/examples/cifar10/cifar10_input.py
index d07fece4bc668612d517e8dcaab1a35451a0238e..6a3b535eb447dd80f8e39d1d005f8f1d4f503549 100644
--- a/tensorflow/contrib/model_pruning/examples/cifar10/cifar10_input.py
+++ b/tensorflow/contrib/model_pruning/examples/cifar10/cifar10_input.py
@@ -58,6 +58,7 @@ def read_cifar10(filename_queue):
 
   class CIFAR10Record(object):
     pass
+
   result = CIFAR10Record()
 
   # Dimensions of the images in the CIFAR-10 dataset.
@@ -147,8 +148,9 @@ def distorted_inputs(data_dir, batch_size):
     images: Images. 4D tensor of [batch_size, IMAGE_SIZE, IMAGE_SIZE, 3] size.
     labels: Labels. 1D tensor of [batch_size] size.
   """
-  filenames = [os.path.join(data_dir, 'data_batch_%d.bin' % i)
-               for i in xrange(1, 6)]
+  filenames = [
+      os.path.join(data_dir, 'data_batch_%d.bin' % i) for i in xrange(1, 6)
+  ]
   for f in filenames:
     if not tf.gfile.Exists(f):
       raise ValueError('Failed to find file: ' + f)
@@ -174,10 +176,9 @@ def distorted_inputs(data_dir, batch_size):
 
   # Because these operations are not commutative, consider randomizing
   # the order their operation.
-  distorted_image = tf.image.random_brightness(distorted_image,
-                                               max_delta=63)
-  distorted_image = tf.image.random_contrast(distorted_image,
-                                             lower=0.2, upper=1.8)
+  distorted_image = tf.image.random_brightness(distorted_image, max_delta=63)
+  distorted_image = tf.image.random_contrast(
+      distorted_image, lower=0.2, upper=1.8)
 
   # Subtract off the mean and divide by the variance of the pixels.
   float_image = tf.image.per_image_standardization(distorted_image)
@@ -188,15 +189,18 @@ def distorted_inputs(data_dir, batch_size):
 
   # Ensure that the random shuffling has good mixing properties.
   min_fraction_of_examples_in_queue = 0.4
-  min_queue_examples = int(NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN *
-                           min_fraction_of_examples_in_queue)
-  print ('Filling queue with %d CIFAR images before starting to train. '
-         'This will take a few minutes.' % min_queue_examples)
+  min_queue_examples = int(
+      NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN * min_fraction_of_examples_in_queue)
+  print('Filling queue with %d CIFAR images before starting to train. '
+        'This will take a few minutes.' % min_queue_examples)
 
   # Generate a batch of images and labels by building up a queue of examples.
-  return _generate_image_and_label_batch(float_image, read_input.label,
-                                         min_queue_examples, batch_size,
-                                         shuffle=True)
+  return _generate_image_and_label_batch(
+      float_image,
+      read_input.label,
+      min_queue_examples,
+      batch_size,
+      shuffle=True)
 
 
 def inputs(eval_data, data_dir, batch_size):
@@ -212,8 +216,9 @@ def inputs(eval_data, data_dir, batch_size):
     labels: Labels. 1D tensor of [batch_size] size.
   """
   if not eval_data:
-    filenames = [os.path.join(data_dir, 'data_batch_%d.bin' % i)
-                 for i in xrange(1, 6)]
+    filenames = [
+        os.path.join(data_dir, 'data_batch_%d.bin' % i) for i in xrange(1, 6)
+    ]
     num_examples_per_epoch = NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN
   else:
     filenames = [os.path.join(data_dir, 'test_batch.bin')]
@@ -235,8 +240,8 @@ def inputs(eval_data, data_dir, batch_size):
 
   # Image processing for evaluation.
   # Crop the central [height, width] of the image.
-  resized_image = tf.image.resize_image_with_crop_or_pad(reshaped_image,
-                                                         width, height)
+  resized_image = tf.image.resize_image_with_crop_or_pad(
+      reshaped_image, width, height)
 
   # Subtract off the mean and divide by the variance of the pixels.
   float_image = tf.image.per_image_standardization(resized_image)
@@ -247,10 +252,13 @@ def inputs(eval_data, data_dir, batch_size):
 
   # Ensure that the random shuffling has good mixing properties.
   min_fraction_of_examples_in_queue = 0.4
-  min_queue_examples = int(num_examples_per_epoch *
-                           min_fraction_of_examples_in_queue)
+  min_queue_examples = int(
+      num_examples_per_epoch * min_fraction_of_examples_in_queue)
 
   # Generate a batch of images and labels by building up a queue of examples.
-  return _generate_image_and_label_batch(float_image, read_input.label,
-                                         min_queue_examples, batch_size,
-                                         shuffle=False)
+  return _generate_image_and_label_batch(
+      float_image,
+      read_input.label,
+      min_queue_examples,
+      batch_size,
+      shuffle=False)
diff --git a/tensorflow/contrib/model_pruning/examples/cifar10/cifar10_pruning.py b/tensorflow/contrib/model_pruning/examples/cifar10/cifar10_pruning.py
index 0d1de869f6ef91791a235cfe545b3b3a9b734e72..660f0168b10aa1e5b320cb476b051918804d2bde 100644
--- a/tensorflow/contrib/model_pruning/examples/cifar10/cifar10_pruning.py
+++ b/tensorflow/contrib/model_pruning/examples/cifar10/cifar10_pruning.py
@@ -48,16 +48,16 @@ from tensorflow.contrib.model_pruning.python import pruning
 # Global constants describing the CIFAR-10 data set.
 IMAGE_SIZE = cifar10_input.IMAGE_SIZE
 NUM_CLASSES = cifar10_input.NUM_CLASSES
-NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN = cifar10_input.NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN
+NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN = cifar10_input.NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN  # pylint: disable=line-too-long
 NUM_EXAMPLES_PER_EPOCH_FOR_EVAL = cifar10_input.NUM_EXAMPLES_PER_EPOCH_FOR_EVAL
 BATCH_SIZE = 128
 DATA_DIR = '/tmp/cifar10_data'
 
 # Constants describing the training process.
-MOVING_AVERAGE_DECAY = 0.9999     # The decay to use for the moving average.
-NUM_EPOCHS_PER_DECAY = 350.0      # Epochs after which learning rate decays.
+MOVING_AVERAGE_DECAY = 0.9999  # The decay to use for the moving average.
+NUM_EPOCHS_PER_DECAY = 350.0  # Epochs after which learning rate decays.
 LEARNING_RATE_DECAY_FACTOR = 0.1  # Learning rate decay factor.
-INITIAL_LEARNING_RATE = 0.1       # Initial learning rate.
+INITIAL_LEARNING_RATE = 0.1  # Initial learning rate.
 
 # If a model is trained with multiple GPUs, prefix all Op names with tower_name
 # to differentiate the operations. Note that this prefix is removed from the
@@ -82,8 +82,7 @@ def _activation_summary(x):
   # session. This helps the clarity of presentation on tensorboard.
   tensor_name = re.sub('%s_[0-9]*/' % TOWER_NAME, '', x.op.name)
   tf.summary.histogram(tensor_name + '/activations', x)
-  tf.summary.scalar(tensor_name + '/sparsity',
-                                       tf.nn.zero_fraction(x))
+  tf.summary.scalar(tensor_name + '/sparsity', tf.nn.zero_fraction(x))
 
 
 def _variable_on_cpu(name, shape, initializer):
@@ -120,10 +119,9 @@ def _variable_with_weight_decay(name, shape, stddev, wd):
     Variable Tensor
   """
   dtype = tf.float32
-  var = _variable_on_cpu(
-      name,
-      shape,
-      tf.truncated_normal_initializer(stddev=stddev, dtype=dtype))
+  var = _variable_on_cpu(name, shape,
+                         tf.truncated_normal_initializer(
+                             stddev=stddev, dtype=dtype))
   if wd is not None:
     weight_decay = tf.multiply(tf.nn.l2_loss(var), wd, name='weight_loss')
     tf.add_to_collection('losses', weight_decay)
@@ -188,10 +186,8 @@ def inference(images):
   # Note that the masks are applied only to the weight tensors
   # conv1
   with tf.variable_scope('conv1') as scope:
-    kernel = _variable_with_weight_decay('weights',
-                                         shape=[5, 5, 3, 64],
-                                         stddev=5e-2,
-                                         wd=0.0)
+    kernel = _variable_with_weight_decay(
+        'weights', shape=[5, 5, 3, 64], stddev=5e-2, wd=0.0)
 
     conv = tf.nn.conv2d(
         images, pruning.apply_mask(kernel, scope), [1, 1, 1, 1], padding='SAME')
@@ -201,18 +197,20 @@ def inference(images):
     _activation_summary(conv1)
 
   # pool1
-  pool1 = tf.nn.max_pool(conv1, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1],
-                         padding='SAME', name='pool1')
+  pool1 = tf.nn.max_pool(
+      conv1,
+      ksize=[1, 3, 3, 1],
+      strides=[1, 2, 2, 1],
+      padding='SAME',
+      name='pool1')
   # norm1
-  norm1 = tf.nn.lrn(pool1, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75,
-                    name='norm1')
+  norm1 = tf.nn.lrn(
+      pool1, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75, name='norm1')
 
   # conv2
   with tf.variable_scope('conv2') as scope:
-    kernel = _variable_with_weight_decay('weights',
-                                         shape=[5, 5, 64, 64],
-                                         stddev=5e-2,
-                                         wd=0.0)
+    kernel = _variable_with_weight_decay(
+        'weights', shape=[5, 5, 64, 64], stddev=5e-2, wd=0.0)
     conv = tf.nn.conv2d(
         norm1, pruning.apply_mask(kernel, scope), [1, 1, 1, 1], padding='SAME')
     biases = _variable_on_cpu('biases', [64], tf.constant_initializer(0.1))
@@ -221,19 +219,23 @@ def inference(images):
     _activation_summary(conv2)
 
   # norm2
-  norm2 = tf.nn.lrn(conv2, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75,
-                    name='norm2')
+  norm2 = tf.nn.lrn(
+      conv2, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75, name='norm2')
   # pool2
-  pool2 = tf.nn.max_pool(norm2, ksize=[1, 3, 3, 1],
-                         strides=[1, 2, 2, 1], padding='SAME', name='pool2')
+  pool2 = tf.nn.max_pool(
+      norm2,
+      ksize=[1, 3, 3, 1],
+      strides=[1, 2, 2, 1],
+      padding='SAME',
+      name='pool2')
 
   # local3
   with tf.variable_scope('local3') as scope:
     # Move everything into depth so we can perform a single matrix multiply.
     reshape = tf.reshape(pool2, [BATCH_SIZE, -1])
     dim = reshape.get_shape()[1].value
-    weights = _variable_with_weight_decay('weights', shape=[dim, 384],
-                                          stddev=0.04, wd=0.004)
+    weights = _variable_with_weight_decay(
+        'weights', shape=[dim, 384], stddev=0.04, wd=0.004)
     biases = _variable_on_cpu('biases', [384], tf.constant_initializer(0.1))
     local3 = tf.nn.relu(
         tf.matmul(reshape, pruning.apply_mask(weights, scope)) + biases,
@@ -242,8 +244,8 @@ def inference(images):
 
   # local4
   with tf.variable_scope('local4') as scope:
-    weights = _variable_with_weight_decay('weights', shape=[384, 192],
-                                          stddev=0.04, wd=0.004)
+    weights = _variable_with_weight_decay(
+        'weights', shape=[384, 192], stddev=0.04, wd=0.004)
     biases = _variable_on_cpu('biases', [192], tf.constant_initializer(0.1))
     local4 = tf.nn.relu(
         tf.matmul(local3, pruning.apply_mask(weights, scope)) + biases,
@@ -255,8 +257,8 @@ def inference(images):
   # tf.nn.sparse_softmax_cross_entropy_with_logits accepts the unscaled logits
   # and performs the softmax internally for efficiency.
   with tf.variable_scope('softmax_linear') as scope:
-    weights = _variable_with_weight_decay('weights', [192, NUM_CLASSES],
-                                          stddev=1/192.0, wd=0.0)
+    weights = _variable_with_weight_decay(
+        'weights', [192, NUM_CLASSES], stddev=1 / 192.0, wd=0.0)
     biases = _variable_on_cpu('biases', [NUM_CLASSES],
                               tf.constant_initializer(0.0))
     softmax_linear = tf.add(
@@ -337,11 +339,12 @@ def train(total_loss, global_step):
   decay_steps = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY)
 
   # Decay the learning rate exponentially based on the number of steps.
-  lr = tf.train.exponential_decay(INITIAL_LEARNING_RATE,
-                                  global_step,
-                                  decay_steps,
-                                  LEARNING_RATE_DECAY_FACTOR,
-                                  staircase=True)
+  lr = tf.train.exponential_decay(
+      INITIAL_LEARNING_RATE,
+      global_step,
+      decay_steps,
+      LEARNING_RATE_DECAY_FACTOR,
+      staircase=True)
   tf.summary.scalar('learning_rate', lr)
 
   # Generate moving averages of all losses and associated summaries.
@@ -365,8 +368,8 @@ def train(total_loss, global_step):
       tf.summary.histogram(var.op.name + '/gradients', grad)
 
   # Track the moving averages of all trainable variables.
-  variable_averages = tf.train.ExponentialMovingAverage(
-      MOVING_AVERAGE_DECAY, global_step)
+  variable_averages = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY,
+                                                        global_step)
   variables_averages_op = variable_averages.apply(tf.trainable_variables())
 
   with tf.control_dependencies([apply_gradient_op, variables_averages_op]):
@@ -383,10 +386,13 @@ def maybe_download_and_extract():
   filename = DATA_URL.split('/')[-1]
   filepath = os.path.join(dest_directory, filename)
   if not os.path.exists(filepath):
+
     def _progress(count, block_size, total_size):
-      sys.stdout.write('\r>> Downloading %s %.1f%%' % (filename,
-          float(count * block_size) / float(total_size) * 100.0))
+      sys.stdout.write('\r>> Downloading %s %.1f%%' %
+                       (filename,
+                        float(count * block_size) / float(total_size) * 100.0))
       sys.stdout.flush()
+
     filepath, _ = urllib.request.urlretrieve(DATA_URL, filepath, _progress)
     print()
     statinfo = os.stat(filepath)
diff --git a/tensorflow/contrib/mpi/mpi_rendezvous_mgr.cc b/tensorflow/contrib/mpi/mpi_rendezvous_mgr.cc
index 8d14a3ef0404e727c47ad2ab39a69838fe1588aa..6a7f5efecdb4062874a09df227d139ad20d59f3f 100644
--- a/tensorflow/contrib/mpi/mpi_rendezvous_mgr.cc
+++ b/tensorflow/contrib/mpi/mpi_rendezvous_mgr.cc
@@ -24,11 +24,11 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
-#include "tensorflow/core/distributed_runtime/tensor_coding.h"
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_util.h"
 #include "tensorflow/core/distributed_runtime/session_mgr.h"
+#include "tensorflow/core/distributed_runtime/tensor_coding.h"
 
 namespace tensorflow {
 
@@ -62,7 +62,6 @@ BaseRemoteRendezvous* MPIRendezvousMgr::Create(int64 step_id,
 void MPIRemoteRendezvous::RecvFromRemoteAsync(
     const Rendezvous::ParsedKey& parsed, const Rendezvous::Args& recv_args,
     DoneCallback done) {
-
   Status s = Status::OK();
   MPIRequestTensorCall* rendezvous_call = new MPIRequestTensorCall();
 
@@ -103,37 +102,37 @@ void MPIRemoteRendezvous::RecvFromRemoteAsync(
   // Create the function which is called when the Tensor is send by remote
   const int64 temp1 = step_id_;
   rendezvous_call->recv_call_ =
-      [this, parsed, recv_args, done, dst, temp1, rendezvous_call](
-          MPIRecvTensorResponse mpi_response) {
-    Status s;
-    Device* dst_device;
-    if (s.ok()) {
-      s = env_->device_mgr->LookupDevice(parsed.dst_device, &dst_device);
-      CHECK(s.ok()) << "Device lookup failed";
-    }
-
-    VLOG(3) << "MPI Received tensor " << parsed.FullKey()
-            << " @ step: " << temp1
-            << " single-send: " << mpi_response.singlesend();
-
-    Tensor val;
-    if (mpi_response.singlesend()) {
-      dst_device->MakeTensorFromProto(mpi_response.response().tensor(),
-                                      recv_args.alloc_attrs, &val);
-    } else {
-      TensorResponse tr;
-      tr.InitAlloc(dst_device, recv_args.alloc_attrs);
-      tr.InitPartial(mpi_response.response());
-      const size_t nBytes = tr.tensor().TotalBytes();
-      void* data = const_cast<void*>(DMAHelper::base(&tr.tensor()));
-      MPI_Status status;
-      MPI_CHECK(MPI_Recv(data, static_cast<int>(nBytes), MPI_BYTE, dst,
-                         TAG_SENDTENSOR2, MPI_COMM_WORLD, &status));
-      val = std::move(tr.tensor());
-    }
-
-    done(s, Args(), recv_args, val, mpi_response.response().is_dead());
-  };
+      [this, parsed, recv_args, done, dst, temp1,
+       rendezvous_call](MPIRecvTensorResponse mpi_response) {
+        Status s;
+        Device* dst_device;
+        if (s.ok()) {
+          s = env_->device_mgr->LookupDevice(parsed.dst_device, &dst_device);
+          CHECK(s.ok()) << "Device lookup failed";
+        }
+
+        VLOG(3) << "MPI Received tensor " << parsed.FullKey()
+                << " @ step: " << temp1
+                << " single-send: " << mpi_response.singlesend();
+
+        Tensor val;
+        if (mpi_response.singlesend()) {
+          dst_device->MakeTensorFromProto(mpi_response.response().tensor(),
+                                          recv_args.alloc_attrs, &val);
+        } else {
+          TensorResponse tr;
+          tr.InitAlloc(dst_device, recv_args.alloc_attrs);
+          tr.InitPartial(mpi_response.response());
+          const size_t nBytes = tr.tensor().TotalBytes();
+          void* data = const_cast<void*>(DMAHelper::base(&tr.tensor()));
+          MPI_Status status;
+          MPI_CHECK(MPI_Recv(data, static_cast<int>(nBytes), MPI_BYTE, dst,
+                             TAG_SENDTENSOR2, MPI_COMM_WORLD, &status));
+          val = std::move(tr.tensor());
+        }
+
+        done(s, Args(), recv_args, val, mpi_response.response().is_dead());
+      };
 
   MPIRendezvousMgr* mgr =
       reinterpret_cast<MPIRendezvousMgr*>(this->rendezvous_mgr_);
@@ -152,16 +151,18 @@ MPIRemoteRendezvous::~MPIRemoteRendezvous() {}
 void MPIRendezvousMgr::AddRequest(RecvTensorRequest request,
                                   const int mpi_dst) {
   TF_CHECK_OK(recv_tensor_recent_request_ids_.TrackUnique(
-      req.request_id(), "RecvTensor (MPIRendezvousMgr)", req));
+      request.request_id(), "RecvTensor (MPIRendezvousMgr)", request));
   const int64 step_id = request.step_id();
   const std::string& key = request.rendezvous_key();
   Rendezvous::ParsedKey parsed;
   TF_CHECK_OK(Rendezvous::ParseKey(key, &parsed));
 
   MPIRecvTensorCallBack send_cb = [this, mpi_dst, parsed](
-      const Status& status, const Rendezvous::Args& send_args,
-      const Rendezvous::Args& recv_args, const Tensor& val, bool is_dead,
-      MPISendTensorCall* mpi_send_call) {
+                                      const Status& status,
+                                      const Rendezvous::Args& send_args,
+                                      const Rendezvous::Args& recv_args,
+                                      const Tensor& val, bool is_dead,
+                                      MPISendTensorCall* mpi_send_call) {
     // TODO(jbedorf) this should be a loop over max size
     CHECK(mpi_send_call->mRes_.ByteSize() < INT_MAX)
         << "Buffer too large for single transfer";
@@ -194,74 +195,78 @@ void MPIRendezvousMgr::AddRequest(RecvTensorRequest request,
   };
 
   // Wrapper around the read callback to place the callback on our queue
-  Rendezvous::DoneCallback done_cb = [this, parsed, step_id, send_cb](
-      const Status& status, const Rendezvous::Args& send_args,
-      const Rendezvous::Args& recv_args, const Tensor& val, bool is_dead) {
-    if (!status.ok()) {
-      CHECK(status.ok()) << "RecvLocalAsync was not ok, key: "
-                         << parsed.FullKey() << " step: " << step_id
-                         << " error message: " << status.error_message();
-      return;
-    }
-
-    VLOG(3) << "MPI Sending tensor " << parsed.FullKey()
-            << " @ step: " << step_id << std::endl;
-
-    auto mpi_send_call = new MPISendTensorCall();
-    mpi_send_call->Init(parsed, step_id, is_dead);
-
-    Device* src_dev = nullptr;
-    Status s = this->worker_env_2->device_mgr->LookupDevice(parsed.src_device,
-                                                            &src_dev);
-    CHECK(s.ok()) << "src device not found";
-
-    // Control if shape and data should be send together or if we can optimize
-    // it in two different transfers, thereby reducing memory copies
-    bool doOptimalTransfer = true;
-    if (!DataTypeCanUseMemcpy(val.dtype())) doOptimalTransfer = false;
-    if (val.TotalBytes() < 1024) doOptimalTransfer = false;
-
-    doOptimalTransfer = doOptimalTransfer && use_optimal_transfer_;
-
-    if (doOptimalTransfer) {
-      // First send the Tensor description and in a follow up transfer the data
-      mpi_send_call->mRes_.mutable_response()->mutable_tensor()->set_dtype(
-          val.dtype());
-      val.shape().AsProto(mpi_send_call->mRes_.mutable_response()
-                              ->mutable_tensor()
-                              ->mutable_tensor_shape());
-      mpi_send_call->mRes_.set_singlesend(false);
-    } else {
-      // Send the Tensor description and data in a single transfer
-      if (src_dev->tensorflow_gpu_device_info() &&
-          (!send_args.alloc_attrs.on_host())) {
-        Notification n;
-        GPUUtil::SetProtoFromGPU(
-            val, src_dev, send_args.device_context,
-            mpi_send_call->mRes_.mutable_response()->mutable_tensor(), is_dead,
-            [&n, &s](const Status& s_) {
-              s = s_;
-              n.Notify();
-            });
-        n.WaitForNotification();
-      } else {
-        val.AsProtoTensorContent(
-            mpi_send_call->mRes_.mutable_response()->mutable_tensor());
-      }
-    }
-
-    std::function<MPISendTensorCall*()> res = std::bind(
-        send_cb, status, send_args, recv_args, val, is_dead, mpi_send_call);
-
-    SendQueueEntry req(parsed.FullKey().ToString().c_str(), std::move(res));
-
-    this->QueueSendRequest(req);
-
-    // Wait for the notification that indicates the tensor has been
-    // successfully transmitted to the remote process. Only needed if we
-    // have not parsed the tensor to proto
-    if (doOptimalTransfer) mpi_send_call->n_.WaitForNotification();
-  };  // done_cb
+  Rendezvous::DoneCallback done_cb =
+      [this, parsed, step_id, send_cb](
+          const Status& status, const Rendezvous::Args& send_args,
+          const Rendezvous::Args& recv_args, const Tensor& val, bool is_dead) {
+        if (!status.ok()) {
+          CHECK(status.ok())
+              << "RecvLocalAsync was not ok, key: " << parsed.FullKey()
+              << " step: " << step_id
+              << " error message: " << status.error_message();
+          return;
+        }
+
+        VLOG(3) << "MPI Sending tensor " << parsed.FullKey()
+                << " @ step: " << step_id << std::endl;
+
+        auto mpi_send_call = new MPISendTensorCall();
+        mpi_send_call->Init(parsed, step_id, is_dead);
+
+        Device* src_dev = nullptr;
+        Status s = this->worker_env_2->device_mgr->LookupDevice(
+            parsed.src_device, &src_dev);
+        CHECK(s.ok()) << "src device not found";
+
+        // Control if shape and data should be send together or if we can
+        // optimize it in two different transfers, thereby reducing memory
+        // copies
+        bool doOptimalTransfer = true;
+        if (!DataTypeCanUseMemcpy(val.dtype())) doOptimalTransfer = false;
+        if (val.TotalBytes() < 1024) doOptimalTransfer = false;
+
+        doOptimalTransfer = doOptimalTransfer && use_optimal_transfer_;
+
+        if (doOptimalTransfer) {
+          // First send the Tensor description and in a follow up transfer the
+          // data
+          mpi_send_call->mRes_.mutable_response()->mutable_tensor()->set_dtype(
+              val.dtype());
+          val.shape().AsProto(mpi_send_call->mRes_.mutable_response()
+                                  ->mutable_tensor()
+                                  ->mutable_tensor_shape());
+          mpi_send_call->mRes_.set_singlesend(false);
+        } else {
+          // Send the Tensor description and data in a single transfer
+          if (src_dev->tensorflow_gpu_device_info() &&
+              (!send_args.alloc_attrs.on_host())) {
+            Notification n;
+            GPUUtil::SetProtoFromGPU(
+                val, src_dev, send_args.device_context,
+                mpi_send_call->mRes_.mutable_response()->mutable_tensor(),
+                is_dead, [&n, &s](const Status& s_) {
+                  s = s_;
+                  n.Notify();
+                });
+            n.WaitForNotification();
+          } else {
+            val.AsProtoTensorContent(
+                mpi_send_call->mRes_.mutable_response()->mutable_tensor());
+          }
+        }
+
+        std::function<MPISendTensorCall*()> res = std::bind(
+            send_cb, status, send_args, recv_args, val, is_dead, mpi_send_call);
+
+        SendQueueEntry req(parsed.FullKey().ToString().c_str(), std::move(res));
+
+        this->QueueSendRequest(req);
+
+        // Wait for the notification that indicates the tensor has been
+        // successfully transmitted to the remote process. Only needed if we
+        // have not parsed the tensor to proto
+        if (doOptimalTransfer) mpi_send_call->n_.WaitForNotification();
+      };  // done_cb
 
   worker_env_2->compute_pool->Schedule([this, step_id, parsed, done_cb]() {
     this->RecvLocalAsync(step_id, parsed, done_cb);
@@ -293,9 +298,8 @@ void MPIRendezvousMgr::MPIBackgroundThread() {
     }
 
     // Remove sends that have been completed
-    active_sends.remove_if([](std::unique_ptr<MPISendTensorCall>& i) {
-      return i->IsFinished();
-    });
+    active_sends.remove_if(
+        [](std::unique_ptr<MPISendTensorCall>& i) { return i->IsFinished(); });
 
     // send a Tensor request
     RequestQueueEntry req;
diff --git a/tensorflow/contrib/mpi/mpi_rendezvous_mgr.h b/tensorflow/contrib/mpi/mpi_rendezvous_mgr.h
index ca42ee2f6d246f67f5c4c668fe27b16722bc6130..5596601ddb9846c0e4f5be4bf33114fc19c0a59d 100644
--- a/tensorflow/contrib/mpi/mpi_rendezvous_mgr.h
+++ b/tensorflow/contrib/mpi/mpi_rendezvous_mgr.h
@@ -18,12 +18,12 @@ limitations under the License.
 
 #ifdef TENSORFLOW_USE_MPI
 
-#include <queue>
-#include <thread>
 #include <list>
-#include <string>
-#include <memory>
 #include <map>
+#include <memory>
+#include <queue>
+#include <string>
+#include <thread>
 #include <unordered_map>
 #include <utility>
 #include <vector>
@@ -33,6 +33,7 @@ limitations under the License.
 #include "tensorflow/contrib/mpi/mpi_msg.pb.h"
 #include "tensorflow/contrib/mpi/mpi_utils.h"
 #include "tensorflow/core/distributed_runtime/base_rendezvous_mgr.h"
+#include "tensorflow/core/distributed_runtime/recent_request_ids.h"
 #include "tensorflow/core/distributed_runtime/request_id.h"
 #include "tensorflow/core/distributed_runtime/worker_env.h"
 #include "tensorflow/core/protobuf/worker.pb.h"
@@ -160,7 +161,8 @@ class MPIRendezvousMgr : public BaseRendezvousMgr {
  private:
   typedef std::function<MPISendTensorCall*(
       const Status&, const Rendezvous::Args&, const Rendezvous::Args&,
-      const Tensor&, const bool, MPISendTensorCall*)> MPIRecvTensorCallBack;
+      const Tensor&, const bool, MPISendTensorCall*)>
+      MPIRecvTensorCallBack;
 
   typedef std::pair<std::string, std::function<void()>> RequestQueueEntry;
   typedef std::pair<std::string, std::function<MPISendTensorCall*()>>
diff --git a/tensorflow/contrib/mpi/mpi_server_lib.cc b/tensorflow/contrib/mpi/mpi_server_lib.cc
index d585c0565eb234655e7a1bbc92df5741e18c8f33..a31fa9ce0b3110d875689d74a41ca9f9cc85f532 100644
--- a/tensorflow/contrib/mpi/mpi_server_lib.cc
+++ b/tensorflow/contrib/mpi/mpi_server_lib.cc
@@ -22,8 +22,8 @@ limitations under the License.
 
 #include "grpc/support/alloc.h"
 
-#include "tensorflow/core/distributed_runtime/server_lib.h"
 #include "tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.h"
+#include "tensorflow/core/distributed_runtime/server_lib.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/env.h"
 
diff --git a/tensorflow/contrib/mpi/mpi_utils.h b/tensorflow/contrib/mpi/mpi_utils.h
index 45e21f2b25ab4897641ffec776eb1b3c32ab9a2e..fa297c28cb47d43ba927ab941854bd472d90b465 100644
--- a/tensorflow/contrib/mpi/mpi_utils.h
+++ b/tensorflow/contrib/mpi/mpi_utils.h
@@ -18,8 +18,8 @@ limitations under the License.
 
 #ifdef TENSORFLOW_USE_MPI
 
-#include <string>
 #include <map>
+#include <string>
 #include <vector>
 
 #include "tensorflow/core/lib/strings/str_util.h"
diff --git a/tensorflow/contrib/mpi_collectives/kernels/mpi_ops.cc b/tensorflow/contrib/mpi_collectives/kernels/mpi_ops.cc
index 2d5b98022c3aafb627e986a2764ee60184014945..8dca90a1e34d6a234c2b1479ca5594e88afcc194 100644
--- a/tensorflow/contrib/mpi_collectives/kernels/mpi_ops.cc
+++ b/tensorflow/contrib/mpi_collectives/kernels/mpi_ops.cc
@@ -35,8 +35,8 @@ limitations under the License.
 
 #define OMPI_SKIP_MPICXX
 #include "third_party/mpi/mpi.h"
-#include "tensorflow/contrib/mpi_collectives/mpi_message.pb.h"
 #include "tensorflow/contrib/mpi_collectives/kernels/ring.h"
+#include "tensorflow/contrib/mpi_collectives/mpi_message.pb.h"
 
 /*
  * MPI Allreduce and Allgather Ops for TensorFlow.
diff --git a/tensorflow/contrib/mpi_collectives/python/ops/mpi_ops.py b/tensorflow/contrib/mpi_collectives/python/ops/mpi_ops.py
index f0a116239d6f4f7271c2a8f68806ff1ccaae80ae..2fbefef0d36f6a1507827427ebbafe5e81e35ea3 100644
--- a/tensorflow/contrib/mpi_collectives/python/ops/mpi_ops.py
+++ b/tensorflow/contrib/mpi_collectives/python/ops/mpi_ops.py
@@ -26,7 +26,8 @@ from tensorflow.python.framework import ops
 from tensorflow.python.platform import resource_loader
 
 _mpi_ops_so = loader.load_op_library(
-    resource_loader.get_path_to_datafile("_mpi_ops.so"))
+    resource_loader.get_path_to_datafile('_mpi_ops.so'))
+
 
 def size(name=None):
   """An op which returns the number of MPI processes.
@@ -120,15 +121,14 @@ def allgather(tensor, name=None):
   """
   # Specify that first allgather is to collect the tensor gather sizes,
   # indicated by passing in a scalar (0-D tensor) of value 0
-  sizes_flag = tf.constant(0, dtype=tf.int64, name="size_flag_const")
-  my_size = tf.slice(tf.shape(tensor, out_type=tf.int64), [0], [1], name="size_slice")
+  sizes_flag = tf.constant(0, dtype=tf.int64, name='size_flag_const')
+  my_size = tf.slice(
+      tf.shape(tensor, out_type=tf.int64), [0], [1], name='size_slice')
   if name is None:
-    name = "allgather"
-  sizing_name = "{}_sizing".format(name)
+    name = 'allgather'
+  sizing_name = '{}_sizing'.format(name)
   sizes = gen_mpi_ops.mpi_allgather(my_size, sizes_flag, name=sizing_name)
   return gen_mpi_ops.mpi_allgather(tensor, sizes, name=name)
 
 
 ops.NotDifferentiable('MPIAllgather')
-
-
diff --git a/tensorflow/contrib/ndlstm/__init__.py b/tensorflow/contrib/ndlstm/__init__.py
index 52e83069cb0c68b510da46149248369dce376647..da89bb4ab605e1570a119a88d481f3ad3d1fee4c 100644
--- a/tensorflow/contrib/ndlstm/__init__.py
+++ b/tensorflow/contrib/ndlstm/__init__.py
@@ -16,3 +16,6 @@
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
+
+from tensorflow.contrib.ndlstm.python import lstm2d
+from tensorflow.contrib.ndlstm.python import lstm1d
diff --git a/tensorflow/contrib/ndlstm/python/lstm1d.py b/tensorflow/contrib/ndlstm/python/lstm1d.py
index d3c3531f405a74d89ce736dae0134939e189f7ae..2e2e9086c00b3e7766678b5eb6dca47dc9a5ddcc 100644
--- a/tensorflow/contrib/ndlstm/python/lstm1d.py
+++ b/tensorflow/contrib/ndlstm/python/lstm1d.py
@@ -22,7 +22,6 @@ from six.moves import xrange  # pylint: disable=redefined-builtin
 from tensorflow.contrib.framework.python.ops import variables
 from tensorflow.python.framework import constant_op
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import rnn
@@ -85,18 +84,11 @@ def ndlstm_base_dynamic(inputs, noutput, scope=None, reverse=False):
     Output sequence (length, batch_size, noutput)
   """
   with variable_scope.variable_scope(scope, "SeqLstm", [inputs]):
-    # TODO(tmb) make batch size, sequence_length dynamic
-    # example: sequence_length = tf.shape(inputs)[0]
-    _, batch_size, _ = _shape(inputs)
-    lstm_cell = rnn_cell.BasicLSTMCell(noutput, state_is_tuple=False)
-    state = array_ops.zeros([batch_size, lstm_cell.state_size])
-    sequence_length = int(inputs.get_shape()[0])
-    sequence_lengths = math_ops.to_int64(
-        array_ops.fill([batch_size], sequence_length))
+    lstm_cell = rnn_cell.BasicLSTMCell(noutput)
     if reverse:
       inputs = array_ops.reverse_v2(inputs, [0])
     outputs, _ = rnn.dynamic_rnn(
-        lstm_cell, inputs, sequence_lengths, state, time_major=True)
+        lstm_cell, inputs, time_major=True, dtype=inputs.dtype)
     if reverse:
       outputs = array_ops.reverse_v2(outputs, [0])
     return outputs
diff --git a/tensorflow/contrib/nearest_neighbor/kernels/hyperplane_lsh_probes.cc b/tensorflow/contrib/nearest_neighbor/kernels/hyperplane_lsh_probes.cc
index 2b412fac9a621f01bd21c6b4391da3c462dd78b3..13db6f62f525b6318687e3bf4b6499eee2c61ea8 100644
--- a/tensorflow/contrib/nearest_neighbor/kernels/hyperplane_lsh_probes.cc
+++ b/tensorflow/contrib/nearest_neighbor/kernels/hyperplane_lsh_probes.cc
@@ -75,7 +75,8 @@ class HyperplaneLSHProbesOp : public OpKernel {
                                 num_hyperplanes_per_table, "."));
     OP_REQUIRES(context, num_hyperplanes_per_table <= 30,
                 InvalidArgument("Need num_hyperplanes_per_table <= 30, got ",
-                                num_hyperplanes_per_table, ". "
+                                num_hyperplanes_per_table,
+                                ". "
                                 "If you need more hyperplanes, change this Op"
                                 " to work for larger integer types (int64)."));
 
@@ -88,12 +89,13 @@ class HyperplaneLSHProbesOp : public OpKernel {
                 InvalidArgument("num_probes must be at least 1."));
 
     int expected_num_hyperplanes = num_tables * num_hyperplanes_per_table;
-    OP_REQUIRES(
-        context, products_tensor.dim_size(1) == expected_num_hyperplanes,
-        InvalidArgument("Expected number of hyperplanes is ",
-                        expected_num_hyperplanes, " but received ",
-                        products_tensor.dim_size(1), " inner products per "
-                        "point."));
+    OP_REQUIRES(context,
+                products_tensor.dim_size(1) == expected_num_hyperplanes,
+                InvalidArgument("Expected number of hyperplanes is ",
+                                expected_num_hyperplanes, " but received ",
+                                products_tensor.dim_size(1),
+                                " inner products per "
+                                "point."));
 
     auto products_eigen_tensor = products_tensor.matrix<CoordinateType>();
     ConstMatrixMap products_matrix(products_eigen_tensor.data(),
@@ -116,13 +118,11 @@ class HyperplaneLSHProbesOp : public OpKernel {
     // lschmidt's workstation.
     int64 cost_per_unit = 21 * num_hyperplanes_per_table * num_tables;
     if (num_probes > num_tables) {
-      cost_per_unit += 110 * num_hyperplanes_per_table
-          * (num_probes - num_tables);
+      cost_per_unit +=
+          110 * num_hyperplanes_per_table * (num_probes - num_tables);
     }
     context->device()->tensorflow_cpu_worker_threads()->workers->ParallelFor(
-        batch_size,
-        cost_per_unit,
-        [&](int64 start, int64 end) {
+        batch_size, cost_per_unit, [&](int64 start, int64 end) {
           HyperplaneMultiprobe<CoordinateType, int32> multiprobe(
               num_hyperplanes_per_table, num_tables);
 
diff --git a/tensorflow/contrib/opt/python/training/elastic_average_optimizer.py b/tensorflow/contrib/opt/python/training/elastic_average_optimizer.py
index 6132cba1f5aecbafd8ca820ecda39355dd768847..716ee9cdf704a14a6e433c7f92ccb91739f70655 100644
--- a/tensorflow/contrib/opt/python/training/elastic_average_optimizer.py
+++ b/tensorflow/contrib/opt/python/training/elastic_average_optimizer.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Wrapper optimizer for Elastic Average SGD """
 from __future__ import absolute_import
 from __future__ import division
@@ -78,23 +77,24 @@ class ElasticAverageCustomGetter(object):
   def __call__(self, getter, name, trainable, collections, *args, **kwargs):
     if trainable:
       with ops.device(self._worker_device):
-        local_var = getter(name, trainable=True,
-                           collections=[ops.GraphKeys.LOCAL_VARIABLES],
-                           *args, **kwargs)
+        local_var = getter(
+            name,
+            trainable=True,
+            collections=[ops.GraphKeys.LOCAL_VARIABLES],
+            *args,
+            **kwargs)
       global_center_variable = variable_scope.variable(
-        name='%s/%s' %
-             (GLOBAL_VARIABLE_NAME,
-              name),
-        initial_value=local_var.initialized_value(),
-        trainable=False,
-        collections=[ops.GraphKeys.GLOBAL_VARIABLES])
+          name='%s/%s' % (GLOBAL_VARIABLE_NAME, name),
+          initial_value=local_var.initialized_value(),
+          trainable=False,
+          collections=[ops.GraphKeys.GLOBAL_VARIABLES])
 
       with ops.device(self._worker_device):
         local_center_variable = variable_scope.variable(
-          name='%s/%s' % (LOCAL_VARIABLE_NAME, name),
-          initial_value=local_var.initialized_value(),
-          trainable=False,
-          collections=[ops.GraphKeys.LOCAL_VARIABLES])
+            name='%s/%s' % (LOCAL_VARIABLE_NAME, name),
+            initial_value=local_var.initialized_value(),
+            trainable=False,
+            collections=[ops.GraphKeys.LOCAL_VARIABLES])
 
       self._local_map[local_var] = local_center_variable
       self._global_map[local_var] = global_center_variable
@@ -117,16 +117,15 @@ class ElasticAverageOptimizer(optimizer.Optimizer):
   # Default value as paper described
   BETA = 0.9
 
-  def __init__(
-      self,
-      opt,
-      num_worker,
-      ea_custom_getter,
-      communication_period=10,
-      moving_rate=None,
-      rho=None,
-      use_locking=True,
-      name="ElasticAverageOptimizer"):
+  def __init__(self,
+               opt,
+               num_worker,
+               ea_custom_getter,
+               communication_period=10,
+               moving_rate=None,
+               rho=None,
+               use_locking=True,
+               name='ElasticAverageOptimizer'):
     """Construct a new gradient descent optimizer.
 
     Args:
@@ -160,13 +159,15 @@ class ElasticAverageOptimizer(optimizer.Optimizer):
       self._rho = rho
 
     self._local_step = variable_scope.get_variable(
-      initializer=0,
-      trainable=False,
-      collections=[ops.GraphKeys.LOCAL_VARIABLES],
-      name="local_step")
+        initializer=0,
+        trainable=False,
+        collections=[ops.GraphKeys.LOCAL_VARIABLES],
+        name='local_step')
     self._opt._prepare()
 
-  def compute_gradients(self, loss, var_list=None,
+  def compute_gradients(self,
+                        loss,
+                        var_list=None,
                         gate_gradients=optimizer.Optimizer.GATE_OP,
                         aggregation_method=None,
                         colocate_gradients_with_ops=False,
@@ -204,16 +205,18 @@ class ElasticAverageOptimizer(optimizer.Optimizer):
     if not var_list:
       var_list = variables.trainable_variables()
 
-    elastic_difference = [math_ops.subtract(v, lv) for v, lv in zip(
-      variables.trainable_variables(),
-      [self._local_map[var] for var in var_list])]
+    elastic_difference = [
+        math_ops.subtract(v, lv)
+        for v, lv in zip(variables.trainable_variables(),
+                         [self._local_map[var] for var in var_list])
+    ]
 
     distance_loss = self._rho * math_ops.add_n(
-                      [gen_nn_ops.l2_loss(ed) for ed in elastic_difference])
+        [gen_nn_ops.l2_loss(ed) for ed in elastic_difference])
 
     total_loss = loss + distance_loss
-    return self._opt.compute_gradients(total_loss, var_list,
-                                       gate_gradients, aggregation_method,
+    return self._opt.compute_gradients(total_loss, var_list, gate_gradients,
+                                       aggregation_method,
                                        colocate_gradients_with_ops, grad_loss)
 
   def apply_gradients(self, grads_and_vars, global_step=None, name=None):
@@ -241,7 +244,7 @@ class ElasticAverageOptimizer(optimizer.Optimizer):
     apply_updates = self._opt.apply_gradients(grads_and_vars)
     with ops.control_dependencies([apply_updates]):
       local_update = state_ops.assign_add(
-        self._local_step, 1, name='local_step_update').op
+          self._local_step, 1, name='local_step_update').op
 
     # update global variables.
     def _Update_global_variables():
@@ -259,12 +262,16 @@ class ElasticAverageOptimizer(optimizer.Optimizer):
             differences.append(math_ops.subtract(v, lv))
         for lvar, diff in zip(local_vars, differences):
           with ops.device(lvar.device):
-            update_ops.append(state_ops.assign_sub(lvar, math_ops.multiply(
-              self._moving_rate, diff)))
+            update_ops.append(
+                state_ops.assign_sub(lvar,
+                                     math_ops.multiply(self._moving_rate,
+                                                       diff)))
         for var, diff in zip(global_center_vars, differences):
           with ops.device(var.device):
-            update_ops.append(state_ops.assign_add(var, math_ops.multiply(
-              self._moving_rate, diff)))
+            update_ops.append(
+                state_ops.assign_add(var,
+                                     math_ops.multiply(self._moving_rate,
+                                                       diff)))
         if global_step:
           with ops.colocate_with(global_step):
             update_ops.append(state_ops.assign_add(global_step, 1))
@@ -272,10 +279,10 @@ class ElasticAverageOptimizer(optimizer.Optimizer):
       return variable_update
 
     with ops.control_dependencies([local_update]):
-      condition = math_ops.equal(math_ops.mod(
-        self._local_step, self._period), 0)
+      condition = math_ops.equal(
+          math_ops.mod(self._local_step, self._period), 0)
       conditional_update = control_flow_ops.cond(
-        condition, _Update_global_variables, control_flow_ops.no_op)
+          condition, _Update_global_variables, control_flow_ops.no_op)
     return conditional_update
 
   def get_init_op(self, task_index):
@@ -285,10 +292,12 @@ class ElasticAverageOptimizer(optimizer.Optimizer):
     def _Add_sync_queues_and_barrier(enqueue_after_list):
       """Adds ops to enqueu on all worker queues"""
       sync_queues = [
-        data_flow_ops.FIFOQueue(self._num_worker, [dtypes.bool], shapes=[[]],
-                                shared_name='%s%s' % (
-                                  'variable_init_sync_queue', i)) for i in
-        range(self._num_worker)]
+          data_flow_ops.FIFOQueue(
+              self._num_worker, [dtypes.bool],
+              shapes=[[]],
+              shared_name='%s%s' % ('variable_init_sync_queue', i))
+          for i in range(self._num_worker)
+      ]
       queue_ops = []
       # For each other worker, add an entry in a queue
       token = constant_op.constant(False)
@@ -299,7 +308,7 @@ class ElasticAverageOptimizer(optimizer.Optimizer):
           else:
             queue_ops.append(q.enqueue(token))
       queue_ops.append(
-        sync_queues[task_index].dequeue_many(len(sync_queues) - 1))
+          sync_queues[task_index].dequeue_many(len(sync_queues) - 1))
       return control_flow_ops.group(*queue_ops)
 
     init_ops = []
@@ -307,11 +316,10 @@ class ElasticAverageOptimizer(optimizer.Optimizer):
     global_center_vars = [self._global_map[var] for var in local_vars]
     local_center_vars = [self._local_map[var] for var in local_vars]
     if not (local_vars and global_center_vars and local_center_vars):
-      raise ValueError(
-        'The lists of local_variables, global_center_variables, '
-        'local_center_variables should not be empty  ')
-    for lvar, gc_var, lc_var in zip(
-        local_vars, global_center_vars, local_center_vars):
+      raise ValueError('The lists of local_variables, global_center_variables, '
+                       'local_center_variables should not be empty  ')
+    for lvar, gc_var, lc_var in zip(local_vars, global_center_vars,
+                                    local_center_vars):
       init_ops.append(state_ops.assign(lvar, gc_var))
       init_ops.append(state_ops.assign(lc_var, gc_var))
 
@@ -325,6 +333,7 @@ class ElasticAverageOptimizer(optimizer.Optimizer):
 
 
 class _ElasticAverageOptimizerHook(session_run_hook.SessionRunHook):
+
   def __init__(self, ea_optimizer, is_chief, task_index):
     """Creates hook to handle ElasticAverageOptimizer initialization ops.
 
diff --git a/tensorflow/contrib/opt/python/training/elastic_average_optimizer_test.py b/tensorflow/contrib/opt/python/training/elastic_average_optimizer_test.py
index 446e91018d477d75116f6b78a2443ed79ed3b3ef..37539b959959b5cf1f7b2c8e8d2b6b05191565ad 100644
--- a/tensorflow/contrib/opt/python/training/elastic_average_optimizer_test.py
+++ b/tensorflow/contrib/opt/python/training/elastic_average_optimizer_test.py
@@ -38,20 +38,20 @@ def create_local_cluster(num_workers, num_ps, protocol="grpc"):
   worker_ports = [portpicker.pick_unused_port() for _ in range(num_workers)]
   ps_ports = [portpicker.pick_unused_port() for _ in range(num_ps)]
   cluster_dict = {
-    "worker": ["localhost:%s" % port for port in worker_ports],
-    "ps": ["localhost:%s" % port for port in ps_ports]
+      "worker": ["localhost:%s" % port for port in worker_ports],
+      "ps": ["localhost:%s" % port for port in ps_ports]
   }
   cs = server_lib.ClusterSpec(cluster_dict)
 
   workers = [
-    server_lib.Server(
-      cs, job_name="worker", protocol=protocol, task_index=ix, start=True)
-    for ix in range(num_workers)
+      server_lib.Server(
+          cs, job_name="worker", protocol=protocol, task_index=ix, start=True)
+      for ix in range(num_workers)
   ]
   ps_servers = [
-    server_lib.Server(
-      cs, job_name="ps", protocol=protocol, task_index=ix, start=True)
-    for ix in range(num_ps)
+      server_lib.Server(
+          cs, job_name="ps", protocol=protocol, task_index=ix, start=True)
+      for ix in range(num_ps)
   ]
 
   return cluster_dict, workers, ps_servers
@@ -68,15 +68,14 @@ def _get_workers(num_workers, period, workers, moving_rate):
     is_chief = (worker_id == 0)
     with graph.as_default():
       worker_device = "/job:worker/task:%d/cpu:0" % (worker_id)
-      ea_coustom = ElasticAverageCustomGetter(
-        worker_device=worker_device)
-      with variable_scope.variable_scope('',
-                                         custom_getter=ea_coustom), ops.device(
-        device_setter.replica_device_setter(worker_device=worker_device,
-                                            ps_device="/job:ps/task:0/cpu:0",
-                                            ps_tasks=1)):
-        global_step = variables.Variable(0, name='global_step',
-                                         trainable=False)
+      ea_coustom = ElasticAverageCustomGetter(worker_device=worker_device)
+      with variable_scope.variable_scope(
+          "", custom_getter=ea_coustom), ops.device(
+              device_setter.replica_device_setter(
+                  worker_device=worker_device,
+                  ps_device="/job:ps/task:0/cpu:0",
+                  ps_tasks=1)):
+        global_step = variables.Variable(0, name="global_step", trainable=False)
         var_0 = variable_scope.get_variable(initializer=0.0, name="v0")
         var_1 = variable_scope.get_variable(initializer=1.0, name="v1")
 
@@ -86,21 +85,19 @@ def _get_workers(num_workers, period, workers, moving_rate):
 
         sgd_opt = gradient_descent.GradientDescentOptimizer(1.0)
         opt = ElasticAverageOptimizer(
-          opt=sgd_opt,
-          num_worker=num_workers,
-          moving_rate=moving_rate,
-          communication_period=period,
-          ea_custom_getter=ea_coustom
-        )
+            opt=sgd_opt,
+            num_worker=num_workers,
+            moving_rate=moving_rate,
+            communication_period=period,
+            ea_custom_getter=ea_coustom)
         train_op = [
-          opt.apply_gradients(
-            ([grads_0, var_0],
-             [grads_1, var_1]), global_step)
+            opt.apply_gradients(([grads_0, var_0], [grads_1, var_1]),
+                                global_step)
         ]
         easgd_hook = opt.make_session_run_hook(is_chief, worker_id)
       # Creates MonitoredSession
-      sess = training.MonitoredTrainingSession(workers[worker_id].target,
-                                               hooks=[easgd_hook])
+      sess = training.MonitoredTrainingSession(
+          workers[worker_id].target, hooks=[easgd_hook])
 
     sessions.append(sess)
     graphs.append(graph)
@@ -110,6 +107,7 @@ def _get_workers(num_workers, period, workers, moving_rate):
 
 
 class ElasticAverageOptimizerTest(test.TestCase):
+
   def _run(self, train_op, sess):
     sess.run(train_op)
 
@@ -117,15 +115,14 @@ class ElasticAverageOptimizerTest(test.TestCase):
     num_workers = 1
     communication_period = 2
     num_ps = 1
-    cluster, workers, _ = create_local_cluster(num_workers=num_workers,
-                                               num_ps=num_ps)
+    cluster, workers, _ = create_local_cluster(
+        num_workers=num_workers, num_ps=num_ps)
 
-    sessions, graphs, train_ops = _get_workers(num_workers,
-                                               communication_period,
-                                               workers, 1.0)
+    sessions, graphs, train_ops = _get_workers(
+        num_workers, communication_period, workers, 1.0)
 
-    var_0 = graphs[0].get_tensor_by_name('v0:0')
-    var_1 = graphs[0].get_tensor_by_name('v1:0')
+    var_0 = graphs[0].get_tensor_by_name("v0:0")
+    var_1 = graphs[0].get_tensor_by_name("v1:0")
     global_step = training_util.get_global_step(graphs[0])
     var_0_g = graphs[0].get_tensor_by_name(GLOBAL_VARIABLE_NAME + "/v0:0")
     var_1_g = graphs[0].get_tensor_by_name(GLOBAL_VARIABLE_NAME + "/v1:0")
@@ -166,18 +163,17 @@ class ElasticAverageOptimizerTest(test.TestCase):
     num_workers = 2
     communication_period = 1
     num_ps = 2
-    cluster, workers, _ = create_local_cluster(num_workers=num_workers,
-                                               num_ps=num_ps)
+    cluster, workers, _ = create_local_cluster(
+        num_workers=num_workers, num_ps=num_ps)
 
-    sessions, graphs, train_ops = _get_workers(num_workers,
-                                               communication_period,
-                                               workers, 0.5)
+    sessions, graphs, train_ops = _get_workers(
+        num_workers, communication_period, workers, 0.5)
 
-    var_0 = graphs[0].get_tensor_by_name('v0:0')
-    var_1 = graphs[0].get_tensor_by_name('v1:0')
+    var_0 = graphs[0].get_tensor_by_name("v0:0")
+    var_1 = graphs[0].get_tensor_by_name("v1:0")
 
-    var_0_1 = graphs[1].get_tensor_by_name('v0:0')
-    var_1_1 = graphs[1].get_tensor_by_name('v1:0')
+    var_0_1 = graphs[1].get_tensor_by_name("v0:0")
+    var_1_1 = graphs[1].get_tensor_by_name("v1:0")
 
     var_0_g = graphs[0].get_tensor_by_name(GLOBAL_VARIABLE_NAME + "/v0:0")
     var_1_g = graphs[0].get_tensor_by_name(GLOBAL_VARIABLE_NAME + "/v1:0")
@@ -201,25 +197,24 @@ class ElasticAverageOptimizerTest(test.TestCase):
 
   def testPS2TasksWithClusterSpecClass(self):
     cluster_spec = server_lib.ClusterSpec({
-      "ps": ["ps0:2222", "ps1:2222"],
-      "worker": ["worker0:2222", "worker1:2222", "worker2:2222"]
+        "ps": ["ps0:2222", "ps1:2222"],
+        "worker": ["worker0:2222", "worker1:2222", "worker2:2222"]
     })
-    ea_coustom = ElasticAverageCustomGetter(
-      worker_device="/job:worker/task:0")
+    ea_coustom = ElasticAverageCustomGetter(worker_device="/job:worker/task:0")
     from tensorflow.python.training import device_setter
     with ops.device(
         device_setter.replica_device_setter(cluster=cluster_spec,
                                             worker_device="/job:worker/task:0",
                                             ps_device="/job:ps")), \
-         variable_scope.variable_scope('', custom_getter=ea_coustom):
+         variable_scope.variable_scope("", custom_getter=ea_coustom):
       v = variable_scope.get_variable(initializer=[1, 2], name="v")
-      w = variable_scope.get_variable(initializer=[2, 1], name='w')
-      v_g, w_g = ea_coustom._global_map[v],ea_coustom._global_map[w]
+      w = variable_scope.get_variable(initializer=[2, 1], name="w")
+      v_g, w_g = ea_coustom._global_map[v], ea_coustom._global_map[w]
       self.assertDeviceEqual("/job:worker/task:0", v.device)
       self.assertDeviceEqual("job:ps/task:0", v_g.device)
       self.assertDeviceEqual("/job:worker/task:0", w.device)
       self.assertDeviceEqual("job:ps/task:1", w_g.device)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/opt/python/training/external_optimizer.py b/tensorflow/contrib/opt/python/training/external_optimizer.py
index f243317f1df2ec8d93d44ad534f3fa58527f3217..82ebca7f20306e5658c8321716e39f9c7f8b8970 100644
--- a/tensorflow/contrib/opt/python/training/external_optimizer.py
+++ b/tensorflow/contrib/opt/python/training/external_optimizer.py
@@ -397,10 +397,6 @@ class ScipyOptimizerInterface(ExternalOptimizerInterface):
             'automatically and cannot be injected manually'.format(kwarg))
 
     minimize_kwargs.update(optimizer_kwargs)
-    if method == 'SLSQP':
-      # SLSQP doesn't support step callbacks. Obviate associated warning
-      # message.
-      del minimize_kwargs['callback']
 
     import scipy.optimize  # pylint: disable=g-import-not-at-top
     result = scipy.optimize.minimize(*minimize_args, **minimize_kwargs)
diff --git a/tensorflow/contrib/opt/python/training/external_optimizer_test.py b/tensorflow/contrib/opt/python/training/external_optimizer_test.py
index 0f597d0a246a53892d72939edd1499a86c01017d..953586ee70cd4137295dd254bfb2d37cab0bcfe4 100644
--- a/tensorflow/contrib/opt/python/training/external_optimizer_test.py
+++ b/tensorflow/contrib/opt/python/training/external_optimizer_test.py
@@ -299,6 +299,45 @@ class ScipyOptimizerInterfaceTest(TestCase):
       method = optimizer.optimizer_kwargs.get('method')
       self.assertEqual('SLSQP', method)
 
+  def test_callbacks(self):
+    vector_val = np.array([7., -2.], dtype=np.float32)
+    vector = variables.Variable(vector_val, 'vector')
+
+    minimum_location_val = np.arange(2)
+    minimum_location = constant_op.constant(
+        minimum_location_val, dtype=dtypes.float32)
+
+    loss = math_ops.reduce_sum(math_ops.square(vector - minimum_location)) / 2.
+    loss_val_first = ((vector_val - minimum_location_val)**2).sum() / 2.
+
+    optimizer = external_optimizer.ScipyOptimizerInterface(loss, method='SLSQP')
+
+    with self.test_session() as sess:
+      sess.run(variables.global_variables_initializer())
+
+      initial_vector_val = sess.run(vector)
+
+      extra_fetches = [loss]
+
+      step_callback = test.mock.Mock()
+      loss_callback = test.mock.Mock()
+
+      optimizer.minimize(
+          sess,
+          fetches=extra_fetches,
+          loss_callback=loss_callback,
+          step_callback=step_callback)
+
+      loss_val_last = sess.run(loss)
+
+      call_first = test.mock.call(loss_val_first)
+      call_last = test.mock.call(loss_val_last)
+      loss_calls = [call_first, call_last]
+      loss_callback.assert_has_calls(loss_calls, any_order=True)
+
+      args, _ = step_callback.call_args
+      self.assertAllClose(minimum_location_val, args[0])
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.cc b/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.cc
index 9cee405cef25f54fd064f8002265c42016c4fa50..e18923c8aae74c66ce78f98eb5e615e99463af74 100644
--- a/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.cc
+++ b/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.cc
@@ -14,13 +14,12 @@
 // limitations under the License.
 // =============================================================================
 
-#include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.h"
+#include "tensorflow/core/framework/register_types.h"
 
 namespace tensorflow {
 
-REGISTER_KERNEL_BUILDER(Name("PeriodicResample")
-                            .Device(DEVICE_CPU),
+REGISTER_KERNEL_BUILDER(Name("PeriodicResample").Device(DEVICE_CPU),
                         PeriodicResampleOp);
 
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.h b/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.h
index ba410f025d497178cfc1666ae231e75bad55b05e..3ab588c45881c8f93b4c1bcdf7ccde39086a1ed7 100644
--- a/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.h
+++ b/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.h
@@ -118,9 +118,9 @@ template <class InputDataT,
           tensorflow::errors::InvalidArgument(
               "periodic_resample expects the size of non-adjustable "
               "dimensions be at least as large as size of input tensor."
-              " Dimension ", i, " input tensor has size ",
-              input_tensor_shape.dim_size(i), ", desired shape has size ",
-              desired_shape[i], "."));
+              " Dimension ",
+              i, " input tensor has size ", input_tensor_shape.dim_size(i),
+              ", desired shape has size ", desired_shape[i], "."));
 
       // target_dimensions[i] = desired_shape(i);
       target_dimensions[i] = desired_shape[i];
diff --git a/tensorflow/contrib/pi_examples/camera/camera.cc b/tensorflow/contrib/pi_examples/camera/camera.cc
index cb20661662922a0a160272fcd02a9fe6f9daf6fb..8110185ea8d4684fbc6b4ce54a9b192a020a97f8 100644
--- a/tensorflow/contrib/pi_examples/camera/camera.cc
+++ b/tensorflow/contrib/pi_examples/camera/camera.cc
@@ -17,16 +17,16 @@ limitations under the License.
 
 #include <errno.h>
 #include <fcntl.h>
-#include <fstream>
 #include <libv4l2.h>
 #include <linux/videodev2.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <sys/ioctl.h>
-#include <sys/types.h>
-#include <sys/time.h>
 #include <sys/mman.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <fstream>
 #include <vector>
 
 #include "tensorflow/core/framework/graph.pb.h"
@@ -46,10 +46,10 @@ limitations under the License.
 
 // These are all common classes it's handy to reference with no namespace.
 using tensorflow::Flag;
-using tensorflow::Tensor;
+using tensorflow::int32;
 using tensorflow::Status;
 using tensorflow::string;
-using tensorflow::int32;
+using tensorflow::Tensor;
 
 // Used to store the memory-mapped buffers we use for capture.
 struct CameraBuffer {
diff --git a/tensorflow/contrib/pi_examples/label_image/label_image.cc b/tensorflow/contrib/pi_examples/label_image/label_image.cc
index 0b18045789f3a87ceb228033407d6b696bdb33f6..c6935a093f728353caeeb79a9ed85c957d87f066 100644
--- a/tensorflow/contrib/pi_examples/label_image/label_image.cc
+++ b/tensorflow/contrib/pi_examples/label_image/label_image.cc
@@ -23,9 +23,9 @@ limitations under the License.
 //
 // Full build instructions are at tensorflow/contrib/pi_examples/README.md.
 
-#include <stdio.h>
 #include <jpeglib.h>
 #include <setjmp.h>
+#include <stdio.h>
 #include <fstream>
 #include <vector>
 
@@ -46,10 +46,10 @@ limitations under the License.
 
 // These are all common classes it's handy to reference with no namespace.
 using tensorflow::Flag;
-using tensorflow::Tensor;
+using tensorflow::int32;
 using tensorflow::Status;
 using tensorflow::string;
-using tensorflow::int32;
+using tensorflow::Tensor;
 
 // Takes a file name, and loads a list of labels from it, one per line, and
 // returns a vector of the strings. It pads with empty strings so the length
@@ -77,23 +77,22 @@ Status ReadLabelsFile(string file_name, std::vector<string>* result,
 // Error handling for JPEG decoding.
 void CatchError(j_common_ptr cinfo) {
   (*cinfo->err->output_message)(cinfo);
-  jmp_buf *jpeg_jmpbuf = reinterpret_cast<jmp_buf *>(cinfo->client_data);
+  jmp_buf* jpeg_jmpbuf = reinterpret_cast<jmp_buf*>(cinfo->client_data);
   jpeg_destroy(cinfo);
   longjmp(*jpeg_jmpbuf, 1);
 }
 
 // Decompresses a JPEG file from disk.
 Status LoadJpegFile(string file_name, std::vector<tensorflow::uint8>* data,
-		    int* width, int* height, int* channels) {
+                    int* width, int* height, int* channels) {
   struct jpeg_decompress_struct cinfo;
-  FILE * infile;
+  FILE* infile;
   JSAMPARRAY buffer;
   int row_stride;
 
   if ((infile = fopen(file_name.c_str(), "rb")) == NULL) {
     LOG(ERROR) << "Can't open " << file_name;
-    return tensorflow::errors::NotFound("JPEG file ", file_name,
-					" not found");
+    return tensorflow::errors::NotFound("JPEG file ", file_name, " not found");
   }
 
   struct jpeg_error_mgr jerr;
@@ -116,10 +115,11 @@ Status LoadJpegFile(string file_name, std::vector<tensorflow::uint8>* data,
   data->resize((*height) * (*width) * (*channels));
 
   row_stride = cinfo.output_width * cinfo.output_components;
-  buffer = (*cinfo.mem->alloc_sarray)
-    ((j_common_ptr) &cinfo, JPOOL_IMAGE, row_stride, 1);
+  buffer = (*cinfo.mem->alloc_sarray)((j_common_ptr)&cinfo, JPOOL_IMAGE,
+                                      row_stride, 1);
   while (cinfo.output_scanline < cinfo.output_height) {
-    tensorflow::uint8* row_address = &((*data)[cinfo.output_scanline * row_stride]);
+    tensorflow::uint8* row_address =
+        &((*data)[cinfo.output_scanline * row_stride]);
     jpeg_read_scanlines(&cinfo, buffer, 1);
     memcpy(row_address, buffer[0], row_stride);
   }
@@ -141,24 +141,25 @@ Status ReadTensorFromImageFile(string file_name, const int wanted_height,
   int image_height;
   int image_channels;
   TF_RETURN_IF_ERROR(LoadJpegFile(file_name, &image_data, &image_width,
-				  &image_height, &image_channels));
-  LOG(INFO) << "Loaded JPEG: " << image_width << "x" << image_height
-	    << "x" << image_channels;
+                                  &image_height, &image_channels));
+  LOG(INFO) << "Loaded JPEG: " << image_width << "x" << image_height << "x"
+            << image_channels;
   const int wanted_channels = 3;
   if (image_channels < wanted_channels) {
-    return tensorflow::errors::FailedPrecondition("Image needs to have at least ",
-						  wanted_channels, " but only has ",
-						  image_channels);
+    return tensorflow::errors::FailedPrecondition(
+        "Image needs to have at least ", wanted_channels, " but only has ",
+        image_channels);
   }
-  // In these loops, we convert the eight-bit data in the image into float, resize
-  // it using bilinear filtering, and scale it numerically to the float range that
-  // the model expects (given by input_mean and input_std).
+  // In these loops, we convert the eight-bit data in the image into float,
+  // resize it using bilinear filtering, and scale it numerically to the float
+  // range that the model expects (given by input_mean and input_std).
   tensorflow::Tensor image_tensor(
-      tensorflow::DT_FLOAT, tensorflow::TensorShape(
-      {1, wanted_height, wanted_width, wanted_channels}));
+      tensorflow::DT_FLOAT,
+      tensorflow::TensorShape(
+          {1, wanted_height, wanted_width, wanted_channels}));
   auto image_tensor_mapped = image_tensor.tensor<float, 4>();
   tensorflow::uint8* in = image_data.data();
-  float *out = image_tensor_mapped.data();
+  float* out = image_tensor_mapped.data();
   const size_t image_rowlen = image_width * image_channels;
   const float width_scale = static_cast<float>(image_width) / wanted_width;
   const float height_scale = static_cast<float>(image_height) / wanted_height;
@@ -166,35 +167,37 @@ Status ReadTensorFromImageFile(string file_name, const int wanted_height,
     const float in_y = y * height_scale;
     const int top_y_index = static_cast<int>(floorf(in_y));
     const int bottom_y_index =
-      std::min(static_cast<int>(ceilf(in_y)), (image_height - 1));
+        std::min(static_cast<int>(ceilf(in_y)), (image_height - 1));
     const float y_lerp = in_y - top_y_index;
     tensorflow::uint8* in_top_row = in + (top_y_index * image_rowlen);
     tensorflow::uint8* in_bottom_row = in + (bottom_y_index * image_rowlen);
-    float *out_row = out + (y * wanted_width * wanted_channels);
+    float* out_row = out + (y * wanted_width * wanted_channels);
     for (int x = 0; x < wanted_width; ++x) {
       const float in_x = x * width_scale;
       const int left_x_index = static_cast<int>(floorf(in_x));
       const int right_x_index =
-	std::min(static_cast<int>(ceilf(in_x)), (image_width - 1));
+          std::min(static_cast<int>(ceilf(in_x)), (image_width - 1));
       tensorflow::uint8* in_top_left_pixel =
-	in_top_row + (left_x_index * wanted_channels);
+          in_top_row + (left_x_index * wanted_channels);
       tensorflow::uint8* in_top_right_pixel =
-	in_top_row + (right_x_index * wanted_channels);
+          in_top_row + (right_x_index * wanted_channels);
       tensorflow::uint8* in_bottom_left_pixel =
-	in_bottom_row + (left_x_index * wanted_channels);
+          in_bottom_row + (left_x_index * wanted_channels);
       tensorflow::uint8* in_bottom_right_pixel =
-	in_bottom_row + (right_x_index * wanted_channels);
+          in_bottom_row + (right_x_index * wanted_channels);
       const float x_lerp = in_x - left_x_index;
-      float *out_pixel = out_row + (x * wanted_channels);
+      float* out_pixel = out_row + (x * wanted_channels);
       for (int c = 0; c < wanted_channels; ++c) {
-	const float top_left((in_top_left_pixel[c] - input_mean) / input_std);
-	const float top_right((in_top_right_pixel[c] - input_mean) / input_std);
-	const float bottom_left((in_bottom_left_pixel[c] - input_mean) / input_std);
-	const float bottom_right((in_bottom_right_pixel[c] - input_mean) / input_std);
-	const float top = top_left + (top_right - top_left) * x_lerp;
-	const float bottom =
-	  bottom_left + (bottom_right - bottom_left) * x_lerp;
-	out_pixel[c] = top + (bottom - top) * y_lerp;
+        const float top_left((in_top_left_pixel[c] - input_mean) / input_std);
+        const float top_right((in_top_right_pixel[c] - input_mean) / input_std);
+        const float bottom_left((in_bottom_left_pixel[c] - input_mean) /
+                                input_std);
+        const float bottom_right((in_bottom_right_pixel[c] - input_mean) /
+                                 input_std);
+        const float top = top_left + (top_right - top_left) * x_lerp;
+        const float bottom =
+            bottom_left + (bottom_right - bottom_left) * x_lerp;
+        out_pixel[c] = top + (bottom - top) * y_lerp;
       }
     }
   }
@@ -233,10 +236,10 @@ Status GetTopLabels(const std::vector<Tensor>& outputs, int how_many_labels,
     scores.push_back(std::pair<int, float>({i, unsorted_scores_flat(i)}));
   }
   std::sort(scores.begin(), scores.end(),
-	    [](const std::pair<int, float> &left,
-	       const std::pair<int, float> &right) {
-	      return left.second > right.second;
-	    });
+            [](const std::pair<int, float>& left,
+               const std::pair<int, float>& right) {
+              return left.second > right.second;
+            });
   scores.resize(how_many_labels);
   Tensor sorted_indices(tensorflow::DT_INT32, {scores.size()});
   Tensor sorted_scores(tensorflow::DT_FLOAT, {scores.size()});
diff --git a/tensorflow/contrib/predictor/predictor_factories_test.py b/tensorflow/contrib/predictor/predictor_factories_test.py
index e8443e718d1e81a88b752eb639dcee9c89aa56dc..578d9424b25dd38f1d77a267d1fdf1ff9ff2da88 100644
--- a/tensorflow/contrib/predictor/predictor_factories_test.py
+++ b/tensorflow/contrib/predictor/predictor_factories_test.py
@@ -50,8 +50,8 @@ class PredictorFactoriesTest(test.TestCase):
   def testFromContribEstimator(self):
     estimator = testing_common.get_arithmetic_estimator(core=False)
     input_fn = testing_common.get_arithmetic_input_fn(core=False)
-    predictor_factories.from_contrib_estimator(estimator, input_fn,
-        output_alternative_key='sum')
+    predictor_factories.from_contrib_estimator(
+        estimator, input_fn, output_alternative_key='sum')
 
   def testFromContribEstimatorWithCoreEstimatorRaises(self):
     estimator = testing_common.get_arithmetic_estimator(core=True)
diff --git a/tensorflow/contrib/py2tf/BUILD b/tensorflow/contrib/py2tf/BUILD
index 7358822ef5ca7dba87cc1046001aa7f07f45f845..479ea9becae47b99f64c60ff7e11e50c8ccde717 100644
--- a/tensorflow/contrib/py2tf/BUILD
+++ b/tensorflow/contrib/py2tf/BUILD
@@ -18,66 +18,13 @@ py_library(
     name = "py2tf",
     srcs = [
         "__init__.py",
-        "api.py",
-        "config.py",
-        "conversion.py",
-        "naming.py",
     ],
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
     deps = [
-        "//tensorflow/contrib/py2tf/convert",
-        "//tensorflow/contrib/py2tf/pyct",
-        "//tensorflow/contrib/py2tf/pyct/static_analysis",
+        "//tensorflow/contrib/py2tf/impl",
+        "//tensorflow/contrib/py2tf/utils",
         "@gast_archive//:gast",
         "@six_archive//:six",
     ],
 )
-
-# Separate target that allows access to internal symbols for testing.
-py_library(
-    name = "py2tf_internal",
-    srcs = [
-        "api.py",
-        "config.py",
-        "conversion.py",
-        "naming.py",
-    ],
-    srcs_version = "PY2AND3",
-    visibility = ["//tensorflow:__subpackages__"],
-    deps = [
-        "//tensorflow/contrib/py2tf/convert",
-        "//tensorflow/contrib/py2tf/pyct",
-        "//tensorflow/contrib/py2tf/pyct/static_analysis",
-        "@gast_archive//:gast",
-        "@six_archive//:six",
-    ],
-)
-
-py_test(
-    name = "api_test",
-    srcs = ["api_test.py"],
-    deps = [
-        ":py2tf_internal",
-        "//tensorflow/python:client_testlib",
-    ],
-)
-
-py_test(
-    name = "conversion_test",
-    srcs = ["conversion_test.py"],
-    deps = [
-        ":py2tf_internal",
-        "//tensorflow/python:client_testlib",
-        "@gast_archive//:gast",
-    ],
-)
-
-py_test(
-    name = "naming_test",
-    srcs = ["naming_test.py"],
-    deps = [
-        ":py2tf_internal",
-        "//tensorflow/python:client_testlib",
-    ],
-)
diff --git a/tensorflow/contrib/py2tf/__init__.py b/tensorflow/contrib/py2tf/__init__.py
index d187da99e065cb2d31ae4e45a9570378f9d1bf27..0d51bf0bf24836c1c10cf356ac5c0af6325ff0f7 100644
--- a/tensorflow/contrib/py2tf/__init__.py
+++ b/tensorflow/contrib/py2tf/__init__.py
@@ -21,11 +21,13 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.py2tf.api import to_code
-from tensorflow.contrib.py2tf.api import to_graph
+from tensorflow.contrib.py2tf import utils
+from tensorflow.contrib.py2tf.impl.api import convert
+from tensorflow.contrib.py2tf.impl.api import graph_ready
+from tensorflow.contrib.py2tf.impl.api import to_code
+from tensorflow.contrib.py2tf.impl.api import to_graph
 from tensorflow.python.util.all_util import remove_undocumented
 
-
-_allowed_symbols = ['to_graph', 'to_code']
+_allowed_symbols = ['to_graph', 'to_code', 'convert', 'graph_ready', 'utils']
 
 remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/py2tf/convert/BUILD b/tensorflow/contrib/py2tf/converters/BUILD
similarity index 68%
rename from tensorflow/contrib/py2tf/convert/BUILD
rename to tensorflow/contrib/py2tf/converters/BUILD
index 050e2ef10802dff58ea59a1465bded4e39c1dac1..3853c60f997187a15be9afba6624b6efc5f4e04b 100644
--- a/tensorflow/contrib/py2tf/convert/BUILD
+++ b/tensorflow/contrib/py2tf/converters/BUILD
@@ -15,8 +15,9 @@ filegroup(
 )
 
 py_library(
-    name = "convert",
+    name = "converters",
     srcs = [
+        "asserts.py",
         "break_canonicalization.py",
         "builtin_functions.py",
         "call_trees.py",
@@ -35,13 +36,39 @@ py_library(
     ],
 )
 
+py_library(
+    name = "test_lib",
+    srcs = [
+        "converter_test_base.py",
+    ],
+    srcs_version = "PY2AND3",
+    visibility = ["//tensorflow:__subpackages__"],
+    deps = [
+        ":converters",
+        "//tensorflow/contrib/py2tf/pyct/static_analysis",
+        "//tensorflow/contrib/py2tf/utils",
+        "@gast_archive//:gast",
+    ],
+)
+
+py_test(
+    name = "asserts_test",
+    srcs = ["asserts_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":test_lib",
+        "//tensorflow/contrib/py2tf/pyct",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
 py_test(
     name = "break_canonicalization_test",
     srcs = ["break_canonicalization_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
-        ":convert",
+        ":test_lib",
         "//tensorflow/contrib/py2tf/pyct",
-        "//tensorflow/contrib/py2tf/pyct/static_analysis",
         "//tensorflow/python:client_testlib",
     ],
 )
@@ -49,10 +76,21 @@ py_test(
 py_test(
     name = "call_trees_test",
     srcs = ["call_trees_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
-        ":convert",
+        ":test_lib",
+        "//tensorflow/contrib/py2tf/pyct",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+py_test(
+    name = "decorators_test",
+    srcs = ["decorators_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":test_lib",
         "//tensorflow/contrib/py2tf/pyct",
-        "//tensorflow/contrib/py2tf/pyct/static_analysis",
         "//tensorflow/python:client_testlib",
     ],
 )
@@ -60,10 +98,10 @@ py_test(
 py_test(
     name = "continue_canonicalization_test",
     srcs = ["continue_canonicalization_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
-        ":convert",
+        ":test_lib",
         "//tensorflow/contrib/py2tf/pyct",
-        "//tensorflow/contrib/py2tf/pyct/static_analysis",
         "//tensorflow/python:client_testlib",
     ],
 )
@@ -71,10 +109,10 @@ py_test(
 py_test(
     name = "control_flow_test",
     srcs = ["control_flow_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
-        ":convert",
+        ":test_lib",
         "//tensorflow/contrib/py2tf/pyct",
-        "//tensorflow/contrib/py2tf/pyct/static_analysis",
         "//tensorflow/python:client_testlib",
     ],
 )
@@ -82,10 +120,10 @@ py_test(
 py_test(
     name = "builtin_functions_test",
     srcs = ["builtin_functions_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
-        ":convert",
+        ":test_lib",
         "//tensorflow/contrib/py2tf/pyct",
-        "//tensorflow/contrib/py2tf/pyct/static_analysis",
         "//tensorflow/python:client_testlib",
     ],
 )
@@ -94,9 +132,8 @@ py_test(
     name = "for_canonicalization_test",
     srcs = ["for_canonicalization_test.py"],
     deps = [
-        ":convert",
+        ":test_lib",
         "//tensorflow/contrib/py2tf/pyct",
-        "//tensorflow/contrib/py2tf/pyct/static_analysis",
         "//tensorflow/python:client_testlib",
     ],
 )
@@ -104,10 +141,10 @@ py_test(
 py_test(
     name = "logical_expressions_test",
     srcs = ["logical_expressions_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
-        ":convert",
+        ":test_lib",
         "//tensorflow/contrib/py2tf/pyct",
-        "//tensorflow/contrib/py2tf/pyct/static_analysis",
         "//tensorflow/python:client_testlib",
     ],
 )
@@ -115,10 +152,10 @@ py_test(
 py_test(
     name = "print_functions_test",
     srcs = ["print_functions_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
-        ":convert",
+        ":test_lib",
         "//tensorflow/contrib/py2tf/pyct",
-        "//tensorflow/contrib/py2tf/pyct/static_analysis",
         "//tensorflow/python:client_testlib",
         "@gast_archive//:gast",
     ],
@@ -127,10 +164,10 @@ py_test(
 py_test(
     name = "side_effect_guards_test",
     srcs = ["side_effect_guards_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
-        ":convert",
+        ":test_lib",
         "//tensorflow/contrib/py2tf/pyct",
-        "//tensorflow/contrib/py2tf/pyct/static_analysis",
         "//tensorflow/python:client_testlib",
     ],
 )
diff --git a/tensorflow/contrib/py2tf/convert/__init__.py b/tensorflow/contrib/py2tf/converters/__init__.py
similarity index 100%
rename from tensorflow/contrib/py2tf/convert/__init__.py
rename to tensorflow/contrib/py2tf/converters/__init__.py
diff --git a/tensorflow/contrib/py2tf/converters/asserts.py b/tensorflow/contrib/py2tf/converters/asserts.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d6ee1d09829b538815dbb9794868c13f51578fc
--- /dev/null
+++ b/tensorflow/contrib/py2tf/converters/asserts.py
@@ -0,0 +1,53 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Converts Assert statements to their corresponding TF calls."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import gast
+
+from tensorflow.contrib.py2tf.pyct import templates
+from tensorflow.contrib.py2tf.pyct import transformer
+
+
+class AssertsTransformer(transformer.Base):
+  """Transforms Print nodes to Call so they can be handled as functions."""
+
+  # pylint:disable=invalid-name
+
+  def visit_Assert(self, node):
+    self.generic_visit(node)
+
+    # Note: The lone tf.Assert call will be wrapped with control_dependencies
+    # by side_effect_guards.
+    template = """
+      tf.Assert(test, [tf.constant(msg)])
+    """
+
+    if node.msg is None:
+      return templates.replace(
+          template, test=node.test, msg=gast.Str('Assertion error'))
+    elif isinstance(node.msg, gast.Str):
+      return templates.replace(template, test=node.test, msg=node.msg)
+    else:
+      raise NotImplementedError('Can only convert string messages for now.')
+
+  # pylint:enable=invalid-name
+
+
+def transform(node, context):
+  return AssertsTransformer(context).visit(node)
diff --git a/tensorflow/contrib/py2tf/converters/asserts_test.py b/tensorflow/contrib/py2tf/converters/asserts_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..6611f2777a93a7e819c8becfa06a09b27f4e6aaf
--- /dev/null
+++ b/tensorflow/contrib/py2tf/converters/asserts_test.py
@@ -0,0 +1,42 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for asserts module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import gast
+
+from tensorflow.contrib.py2tf.converters import asserts
+from tensorflow.contrib.py2tf.converters import converter_test_base
+from tensorflow.python.platform import test
+
+
+class AssertsTest(converter_test_base.TestCase):
+
+  def test_transform(self):
+
+    def test_fn(a):
+      assert a > 0
+
+    node = self.parse_and_analyze(test_fn, {})
+    node = asserts.transform(node, self.ctx)
+
+    self.assertTrue(isinstance(node.body[0].body[0].value, gast.Call))
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/py2tf/convert/break_canonicalization.py b/tensorflow/contrib/py2tf/converters/break_canonicalization.py
similarity index 86%
rename from tensorflow/contrib/py2tf/convert/break_canonicalization.py
rename to tensorflow/contrib/py2tf/converters/break_canonicalization.py
index ef585734454db1aa1ffdb798d93978fb09752f05..2ae65e3007466409433e9b4ea0081898907e19ac 100644
--- a/tensorflow/contrib/py2tf/convert/break_canonicalization.py
+++ b/tensorflow/contrib/py2tf/converters/break_canonicalization.py
@@ -33,31 +33,25 @@ class BreakCanonicalizationTransformer(gast.NodeTransformer):
     self.break_uses = []
 
   def _create_break_check(self):
-
-    def template(var_name):
-      (not var_name)  # pylint:disable=pointless-statement
-
-    expr, = templates.replace(
-        template, var_name=gast.Name(self.break_uses[-1][1], None, None))
+    template = """
+      (not var_name)
+    """
+    expr, = templates.replace(template, var_name=self.break_uses[-1][1])
     return expr.value
 
   def _create_break_trigger(self):
-
-    def template(var_name):  # pylint:disable=unused-argument
+    template = """
       var_name = True
-
-    block = templates.replace(
-        template, var_name=gast.Name(self.break_uses[-1][1], None, None))
+    """
+    block = templates.replace(template, var_name=self.break_uses[-1][1])
     block.append(gast.Continue())
     return block
 
   def _create_break_init(self):
-
-    def template(var_name):  # pylint:disable=unused-argument
+    template = """
       var_name = False
-
-    assign, = templates.replace(
-        template, var_name=gast.Name(self.break_uses[-1][1], None, None))
+    """
+    assign, = templates.replace(template, var_name=self.break_uses[-1][1])
     return assign
 
   # TODO(mdan): Surely the transformer supports this better?
diff --git a/tensorflow/contrib/py2tf/convert/break_canonicalization_test.py b/tensorflow/contrib/py2tf/converters/break_canonicalization_test.py
similarity index 84%
rename from tensorflow/contrib/py2tf/convert/break_canonicalization_test.py
rename to tensorflow/contrib/py2tf/converters/break_canonicalization_test.py
index 23c4c4d3e23e3e8eaafbafe9166d8c9618701fa5..b5ba2ad923dfeb73b38169494f6c7ea16ee815f1 100644
--- a/tensorflow/contrib/py2tf/convert/break_canonicalization_test.py
+++ b/tensorflow/contrib/py2tf/converters/break_canonicalization_test.py
@@ -18,11 +18,10 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.py2tf.convert import break_canonicalization
-from tensorflow.contrib.py2tf.convert import control_flow
+from tensorflow.contrib.py2tf.converters import break_canonicalization
+from tensorflow.contrib.py2tf.converters import control_flow
+from tensorflow.contrib.py2tf.converters import converter_test_base
 from tensorflow.contrib.py2tf.pyct import compiler
-from tensorflow.contrib.py2tf.pyct import parser
-from tensorflow.contrib.py2tf.pyct.static_analysis import access
 from tensorflow.python.platform import test
 
 
@@ -32,12 +31,7 @@ class TestNamer(control_flow.SymbolNamer):
     return name_root
 
 
-class BreakCanonicalizationTest(test.TestCase):
-
-  def _parse_and_analyze(self, test_fn, namespace):
-    node = parser.parse_object(test_fn)
-    node = access.resolve(node)
-    return node
+class BreakCanonicalizationTest(converter_test_base.TestCase):
 
   def test_basic_break(self):
 
@@ -50,7 +44,7 @@ class BreakCanonicalizationTest(test.TestCase):
         v.append(x)
       return v
 
-    node = self._parse_and_analyze(test_fn, {})
+    node = self.parse_and_analyze(test_fn, {}, include_type_analysis=False)
     node = break_canonicalization.transform(node, TestNamer())
     result = compiler.ast_to_object(node)
 
@@ -82,7 +76,7 @@ class BreakCanonicalizationTest(test.TestCase):
         v.append(x)
       return v
 
-    node = self._parse_and_analyze(test_fn, {})
+    node = self.parse_and_analyze(test_fn, {}, include_type_analysis=False)
     node = break_canonicalization.transform(node, TestNamer())
     result = compiler.ast_to_object(node)
 
@@ -110,7 +104,7 @@ class BreakCanonicalizationTest(test.TestCase):
         v.append(x)
       return v, u, w
 
-    node = self._parse_and_analyze(test_fn, {})
+    node = self.parse_and_analyze(test_fn, {}, include_type_analysis=False)
     node = break_canonicalization.transform(node, TestNamer())
     result = compiler.ast_to_object(node)
 
diff --git a/tensorflow/contrib/py2tf/convert/builtin_functions.py b/tensorflow/contrib/py2tf/converters/builtin_functions.py
similarity index 93%
rename from tensorflow/contrib/py2tf/convert/builtin_functions.py
rename to tensorflow/contrib/py2tf/converters/builtin_functions.py
index b80c96c97ac0c55f449a83bd43f2b65cdbdba390..7f6b64a34c1b95f0dd6b92dbc587da672e6c9c28 100644
--- a/tensorflow/contrib/py2tf/convert/builtin_functions.py
+++ b/tensorflow/contrib/py2tf/converters/builtin_functions.py
@@ -29,10 +29,9 @@ class BuiltinFunctionTransformer(gast.NodeTransformer):
   # TODO(mdan): Bring print_functions in here.
 
   def _convert_len(self, node):
-
-    def template(args):
-      tf.shape(args)[0]  # pylint:disable=undefined-variable,expression-not-assigned
-
+    template = """
+      tf.shape(args)[0]
+    """
     new_call = templates.replace(template, args=node.args)[0].value
     return new_call
 
diff --git a/tensorflow/contrib/py2tf/convert/builtin_functions_test.py b/tensorflow/contrib/py2tf/converters/builtin_functions_test.py
similarity index 68%
rename from tensorflow/contrib/py2tf/convert/builtin_functions_test.py
rename to tensorflow/contrib/py2tf/converters/builtin_functions_test.py
index ab02b362aa365ffa11b84a637740063d7df51e7d..b5358da6bc0be06ec1f59d0ef58d926289b5b78f 100644
--- a/tensorflow/contrib/py2tf/convert/builtin_functions_test.py
+++ b/tensorflow/contrib/py2tf/converters/builtin_functions_test.py
@@ -18,32 +18,22 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.py2tf.convert import builtin_functions
+from tensorflow.contrib.py2tf.converters import builtin_functions
+from tensorflow.contrib.py2tf.converters import converter_test_base
 from tensorflow.contrib.py2tf.pyct import compiler
-from tensorflow.contrib.py2tf.pyct import parser
-from tensorflow.contrib.py2tf.pyct.static_analysis import access
-from tensorflow.contrib.py2tf.pyct.static_analysis import live_values
-from tensorflow.contrib.py2tf.pyct.static_analysis import type_info
 from tensorflow.python.framework import constant_op
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
 
-class BuiltinFunctionsTest(test.TestCase):
-
-  def _parse_and_analyze(self, test_fn, namespace):
-    node = parser.parse_object(test_fn)
-    node = access.resolve(node)
-    node = live_values.resolve(node, namespace, {})
-    node = type_info.resolve(node, None, None, {})
-    return node
+class BuiltinFunctionsTest(converter_test_base.TestCase):
 
   def test_len(self):
 
     def test_fn(a):
       return len(a)
 
-    node = self._parse_and_analyze(test_fn, {'len': len})
+    node = self.parse_and_analyze(test_fn, {'len': len})
     node = builtin_functions.transform(node)
     result = compiler.ast_to_object(node)
     setattr(result, 'tf', array_ops)
diff --git a/tensorflow/contrib/py2tf/convert/call_trees.py b/tensorflow/contrib/py2tf/converters/call_trees.py
similarity index 61%
rename from tensorflow/contrib/py2tf/convert/call_trees.py
rename to tensorflow/contrib/py2tf/converters/call_trees.py
index df071f596fc31502a98182f27bb66c54f71d2572..4c238b7fb9e7bdb25b0ded9a4c4a6b13508f19c1 100644
--- a/tensorflow/contrib/py2tf/convert/call_trees.py
+++ b/tensorflow/contrib/py2tf/converters/call_trees.py
@@ -29,46 +29,46 @@ import gast
 from tensorflow.contrib.py2tf.pyct import anno
 from tensorflow.contrib.py2tf.pyct import parser
 from tensorflow.contrib.py2tf.pyct import templates
+from tensorflow.contrib.py2tf.pyct import transformer
+from tensorflow.python.util import tf_inspect
 
 
 class FunctionNamer(object):
   """Describes the interface for CallTreeTransformer's namer."""
 
   def compiled_function_name(self,
-                             original_name,
-                             live_object=None,
+                             original_fqn,
+                             live_entity=None,
                              owner_type=None):
     """Generate the name corresponding to the compiled version of a function.
 
     Args:
-      original_name: String
-      live_object: Callable, the actual target function, if known.
+      original_fqn: string or tuple(string)
+      live_entity: Callable, the actual target function, if known.
       owner_type: Optional object. If present, it indicates that the function is
           a member of the given type.
     Returns:
-      String.
+      string, bool
     """
     raise NotImplementedError()
 
-  def compiled_class_name(self, original_name, live_object=None):
+  def compiled_class_name(self, original_fqn, live_entity=None):
     """Generate the name corresponding to the compiled version of a class.
 
     Args:
-      original_name: String
-      live_object: The actual target class, if known.
+      original_fqn: string or tuple(string)
+      live_entity: The actual target class, if known.
     Returns:
-      String.
+      string
     """
     raise NotImplementedError()
 
 
-class CallTreeTransformer(gast.NodeTransformer):
+class CallTreeTransformer(transformer.Base):
   """Transforms the call tree by renaming transformed symbols."""
 
-  def __init__(self, namer, namespace, uncompiled_modules,
-               nocompile_decorators):
-    self.namer = namer
-    self.namespace = namespace
+  def __init__(self, context, uncompiled_modules, nocompile_decorators):
+    super(CallTreeTransformer, self).__init__(context)
     self.uncompiled_modules = uncompiled_modules
     self.nocompile_decorators = nocompile_decorators
 
@@ -78,7 +78,7 @@ class CallTreeTransformer(gast.NodeTransformer):
     if isinstance(node, gast.Call):
       return self._resolve_name(node.func)
     if isinstance(node, gast.Name):
-      return self.namespace.get(node.id)
+      return self.context.namespace.get(node.id)
     if isinstance(node, gast.Attribute):
       parent = self._resolve_name(node.value)
       if parent is not None:
@@ -91,8 +91,12 @@ class CallTreeTransformer(gast.NodeTransformer):
     if anno.hasanno(node, 'live_val'):
       return anno.getanno(node, 'live_val')
     if isinstance(node, gast.Attribute) and anno.hasanno(node, 'type'):
-      member = getattr(anno.getanno(node, 'type'), node.attr)
-      return member
+      owner_type = anno.getanno(node, 'type')
+      if hasattr(owner_type, node.attr):
+        return getattr(owner_type, node.attr)
+      else:
+        raise ValueError('Type "%s" has not attribute "%s". Is it dynamic?' %
+                         (owner_type, node.attr))
     return None
 
   def _should_compile(self, node, fqn):
@@ -106,14 +110,14 @@ class CallTreeTransformer(gast.NodeTransformer):
 
     # The decorators themselves are not to be converted.
     # If present, the decorators should appear as static functions.
-    target_obj = self._try_resolve_target(node.func)
-    if target_obj is not None:
+    target_entity = self._try_resolve_target(node.func)
+    if target_entity is not None:
       # This attribute is set by the decorator itself.
       # TODO(mdan): This may not play nicely with other wrapping decorators.
-      if hasattr(target_obj, '__pyct_is_compile_decorator'):
+      if hasattr(target_entity, '__pyct_is_compile_decorator'):
         return False
 
-      if target_obj in self.nocompile_decorators:
+      if target_entity in self.nocompile_decorators:
         return False
 
       # Inspect the target function decorators. If any include a @convert
@@ -122,7 +126,8 @@ class CallTreeTransformer(gast.NodeTransformer):
       # To parse and re-analize each function for every call site could be quite
       # wasteful. Maybe we could cache the parsed AST?
       try:
-        target_node = parser.parse_object(target_obj).body[0]
+        target_node, _ = parser.parse_entity(target_entity)
+        target_node = target_node.body[0]
       except TypeError:
         # Functions whose source we cannot access are compilable (e.g. wrapped
         # to py_func).
@@ -136,89 +141,89 @@ class CallTreeTransformer(gast.NodeTransformer):
 
     return True
 
+  def _determine_function_owner(self, m):
+    # TODO(mdan): The parent type should be known at analysis. Use that instead.
+    if hasattr(m, 'im_class'):  # Python 2
+      return m.im_class
+    if hasattr(m, '__qualname__'):  # Python 3
+      # Object attributes: should be bound to "self".
+      if hasattr(m, '__self__'):
+        return type(m.__self__)
+
+      # Class attributes: should have the owner name in their namespace.
+      qn = m.__qualname__.split('.')
+      if len(qn) < 2:
+        return None
+      owner_name, func_name = qn[-2:]
+      if func_name != m.__name__:
+        raise ValueError('Inconsistent names detected '
+                         '(__qualname__[1] = "%s", __name__ = "%s") for %s.' %
+                         (func_name, m.__name__, m))
+      if owner_name == '<locals>':
+        return None
+      if owner_name not in self.context.namespace:
+        raise ValueError(
+            'Could not resolve name "%s" while analyzing %s. Namespace:\n%s' %
+            (owner_name, m, self.context.namespace))
+      return self.context.namespace[owner_name]
+    return None
+
   def _rename_compilable_function(self, node):
     assert anno.hasanno(node.func, 'live_val')
     assert anno.hasanno(node.func, 'fqn')
-    target_obj = anno.getanno(node.func, 'live_val')
+    target_entity = anno.getanno(node.func, 'live_val')
     target_fqn = anno.getanno(node.func, 'fqn')
 
     if not self._should_compile(node, target_fqn):
       return node
 
     if anno.hasanno(node, 'is_constructor'):
-      new_name = self.namer.compiled_class_name(
-          '__'.join(target_fqn), live_object=target_obj)
+      new_name = self.context.namer.compiled_class_name(
+          target_fqn, live_entity=target_entity)
+      do_rename = True
     else:
-      new_name = self.namer.compiled_function_name(
-          '__'.join(target_fqn), live_object=target_obj)
-    node.func = gast.Name(id=new_name, ctx=gast.Load(), annotation=None)
-    return node
-
-  def _rename_member_function_of_known_type(self, node):
-    assert isinstance(node.func, gast.Attribute)
-
-    type_fqn = anno.getanno(node.func, 'type_fqn')
-    assert anno.hasanno(node.func, 'type')
-    target_type = anno.getanno(node.func, 'type')
-
-    if not self._should_compile(node, type_fqn):
-      return node
-
-    # TODO(mdan): We should not assume that the namer only needs the
-    # member function name.
-    method_name = node.func.attr
-    method_object = getattr(target_type, method_name)
-    new_name = self.namer.compiled_function_name(
-        method_name, live_object=method_object, owner_type=target_type)
-    if new_name != node.func.attr:
-      # If a member function call is renamed, then the new function is no
-      # longer bound to the target object. We then refactor the call from:
-      #   foo.bar(...)
-      # to:
-      #   renamed_foo(bar, ...)
-      # TODO(mdan): This risks causing duplication, if target_type is renamed.
-      node.args = [node.func.value] + node.args
+      owner_type = self._determine_function_owner(target_entity)
+      new_name, do_rename = self.context.namer.compiled_function_name(
+          target_fqn, live_entity=target_entity, owner_type=owner_type)
+
+    if do_rename:
+      if target_entity is not None:
+        if tf_inspect.ismethod(target_entity):
+          # The renaming process will transform it into a regular function.
+          # TODO(mdan): Is this complete? How does it work with nested members?
+          node.args = [node.func.value] + node.args
       node.func = gast.Name(new_name, gast.Load(), None)
     return node
 
   def _wrap_to_py_func_no_return(self, node):
     args_scope = anno.getanno(node, 'args_scope')
     # TODO(mdan): Properly handle varargs, kwargs, etc.
-    args = tuple(gast.Name(n, gast.Load(), None) for n in args_scope.used)
-
-    # pylint:disable=undefined-variable,unused-argument,function-redefined
-
-    def template(call, wrapper, args):
-
+    template = """
       def wrapper(args):
         call(args)
         return 1
-
       tf.py_func(wrapper, [args], [tf.int64])
-
-    # pylint:enable=undefined-variable,unused-argument,function-redefined
-
-    wrapper_name = self.namer.compiled_function_name(node.func.id)
+    """
     wrapper_def, call_expr = templates.replace(
         template,
         call=node.func,
-        wrapper=gast.Name(wrapper_name, gast.Load(), None),
-        args=args)
+        wrapper=self.context.namer.compiled_function_name(node.func.id)[0],
+        args=tuple(gast.Name(n, gast.Load(), None) for n in args_scope.used))
     anno.setanno(call_expr.value, 'args_scope', args_scope)
     # TODO(mdan): Rename this annotation to 'graph_ready'
     anno.setanno(wrapper_def, 'skip_processing', True)
 
     return (wrapper_def, call_expr)
 
-  def _function_is_compilable(self, target_obj):
+  def _function_is_compilable(self, target_entity):
     # TODO(mdan): This is just a placeholder. Implement.
-    return not isinstance(target_obj, types.BuiltinFunctionType)
+    return not isinstance(target_entity, types.BuiltinFunctionType)
 
   def visit_Expr(self, node):
     if isinstance(node.value, gast.Call):
       if anno.hasanno(node.value.func, 'live_val'):
-        target_obj = anno.getanno(node.value.func, 'live_val')
-        if not self._function_is_compilable(target_obj):
+        target_entity = anno.getanno(node.value.func, 'live_val')
+        if not self._function_is_compilable(target_entity):
           if anno.hasanno(node.value.func, 'fqn'):
             target_fqn = anno.getanno(node.value.func, 'fqn')
             if not self._should_compile(node.value, target_fqn):
@@ -236,8 +241,8 @@ class CallTreeTransformer(gast.NodeTransformer):
     # If the function is wrapped by one of the marker decorators,
     # consider it graph ready.
     if anno.hasanno(node.func, 'live_val'):
-      target_obj = anno.getanno(node.func, 'live_val')
-      if target_obj in self.nocompile_decorators:
+      target_entity = anno.getanno(node.func, 'live_val')
+      if target_entity in self.nocompile_decorators:
         if len(node.args) < 1:
           raise ValueError(
               'Found call to decorator function "%s", but it had no arguments. '
@@ -246,28 +251,28 @@ class CallTreeTransformer(gast.NodeTransformer):
 
     self.generic_visit(node)
     if anno.hasanno(node.func, 'live_val'):
-      target_obj = anno.getanno(node.func, 'live_val')
-      if self._function_is_compilable(target_obj):
+      target_entity = anno.getanno(node.func, 'live_val')
+      if self._function_is_compilable(target_entity):
         node = self._rename_compilable_function(node)
       else:
         raise NotImplementedError('py_func with return values')
-    elif anno.hasanno(node.func, 'type_fqn'):
-      node = self._rename_member_function_of_known_type(node)
     else:
-      raise NotImplementedError(
-          'Member function call (of unknown type): %s.' % node.func.id)
+      if self.context.recursive:
+        raise NotImplementedError('Could not resolve target function.')
+      else:
+        # TODO(mdan): Double check. Is this reachable code?
+        pass
     return node
 
   # pylint:enable=invalid-name
 
 
-def transform(node, namer, namespace, uncompiled_modules, nocompile_decorators):
+def transform(node, context, uncompiled_modules, nocompile_decorators):
   """Transform function call to the compiled counterparts.
 
   Args:
     node: AST to transform.
-    namer: FunctionNamer-like.
-    namespace: Dict mapping symbol names to their corresponding live objects.
+    context: An EntityContext object.
     uncompiled_modules: set of string tuples, each tuple represents the fully
         qualified name of a package containing functions that will not be
         compiled.
@@ -278,7 +283,6 @@ def transform(node, namer, namespace, uncompiled_modules, nocompile_decorators):
         node: The transformed AST
         new_names: set(string), containing any newly-generated names
   """
-  transformer = CallTreeTransformer(namer, namespace, uncompiled_modules,
-                                    nocompile_decorators)
-  node = transformer.visit(node)
+  t = CallTreeTransformer(context, uncompiled_modules, nocompile_decorators)
+  node = t.visit(node)
   return node
diff --git a/tensorflow/contrib/py2tf/convert/call_trees_test.py b/tensorflow/contrib/py2tf/converters/call_trees_test.py
similarity index 63%
rename from tensorflow/contrib/py2tf/convert/call_trees_test.py
rename to tensorflow/contrib/py2tf/converters/call_trees_test.py
index 78a6b5391014725eafb8846a43d3f79be2bfa598..e63c10de0fed72333a6d571f9b9a4f1cb50b5f1d 100644
--- a/tensorflow/contrib/py2tf/convert/call_trees_test.py
+++ b/tensorflow/contrib/py2tf/converters/call_trees_test.py
@@ -18,12 +18,9 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.py2tf.convert import call_trees
+from tensorflow.contrib.py2tf.converters import call_trees
+from tensorflow.contrib.py2tf.converters import converter_test_base
 from tensorflow.contrib.py2tf.pyct import compiler
-from tensorflow.contrib.py2tf.pyct import parser
-from tensorflow.contrib.py2tf.pyct.static_analysis import access
-from tensorflow.contrib.py2tf.pyct.static_analysis import live_values
-from tensorflow.contrib.py2tf.pyct.static_analysis import type_info
 from tensorflow.python.framework import constant_op
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
@@ -31,18 +28,16 @@ from tensorflow.python.platform import test
 
 class TestNamer(call_trees.FunctionNamer):
 
-  def compiled_function_name(self, original_name, live_object=None):
-    return 'renamed_%s' % original_name
+  def compiled_function_name(self,
+                             original_fqn,
+                             live_entity=None,
+                             owner_type=None):
+    if owner_type is not None:
+      return None, False
+    return ('renamed_%s' % '_'.join(original_fqn)), True
 
 
-class CallTreesTest(test.TestCase):
-
-  def _parse_and_analyze(self, test_fn, namespace):
-    node = parser.parse_object(test_fn)
-    node = access.resolve(node)
-    node = live_values.resolve(node, namespace, {})
-    node = type_info.resolve(node, None, None, {})
-    return node
+class CallTreesTest(converter_test_base.TestCase):
 
   def test_basic(self):
 
@@ -55,14 +50,35 @@ class CallTreesTest(test.TestCase):
     def test_fn_2(a):
       return test_fn_1(a) + 1
 
-    node = self._parse_and_analyze(test_fn_2, {'test_fn_1': test_fn_1})
-    node = call_trees.transform(node, TestNamer(), {}, (), ())
+    node = self.parse_and_analyze(
+        test_fn_2, {'test_fn_1': test_fn_1}, namer=TestNamer())
+    node = call_trees.transform(node, self.ctx, (), ())
     result = compiler.ast_to_object(node)
     # Only test_fn_2 is transformed, so we'll insert renamed_test_fn_1 manually.
     setattr(result, 'renamed_test_fn_1', renamed_test_fn_1)
 
     self.assertEquals(3, result.test_fn_2(1))
 
+  def test_simple_methods(self):
+
+    class TestClass(object):
+
+      def test_fn_1(self, a):
+        return a + 1
+
+      def test_fn_2(self, a):
+        return self.test_fn_1(a) + 1
+
+    node = self.parse_and_analyze(
+        TestClass.test_fn_2, {'TestClass': TestClass},
+        namer=TestNamer(),
+        arg_types={'self': (TestClass.__name__, TestClass)})
+    node = call_trees.transform(node, self.ctx, (), ())
+    result = compiler.ast_to_object(node)
+
+    tc = TestClass()
+    self.assertEquals(3, result.test_fn_2(tc, 1))
+
   def test_uncompiled_modules(self):
 
     def test_fn(a):
@@ -70,11 +86,13 @@ class CallTreesTest(test.TestCase):
       a = math_ops.add(a, constant_op.constant(1))
       return a
 
-    node = self._parse_and_analyze(test_fn, {
-        'math_ops': math_ops,
-        'constant_op': constant_op
-    })
-    node = call_trees.transform(node, TestNamer(), {},
+    node = self.parse_and_analyze(
+        test_fn, {
+            'math_ops': math_ops,
+            'constant_op': constant_op
+        },
+        namer=TestNamer())
+    node = call_trees.transform(node, self.ctx,
                                 set(((math_ops.__name__,),
                                      (constant_op.__name__,))), ())
     result = compiler.ast_to_object(node)
diff --git a/tensorflow/contrib/py2tf/convert/continue_canonicalization.py b/tensorflow/contrib/py2tf/converters/continue_canonicalization.py
similarity index 90%
rename from tensorflow/contrib/py2tf/convert/continue_canonicalization.py
rename to tensorflow/contrib/py2tf/converters/continue_canonicalization.py
index 7f8ace77a830ebcc4d49fcf2190e4bac920b1cde..486f0f6509d67d9d981e43ea6e5c77d14e6b23fc 100644
--- a/tensorflow/contrib/py2tf/convert/continue_canonicalization.py
+++ b/tensorflow/contrib/py2tf/converters/continue_canonicalization.py
@@ -33,32 +33,28 @@ class ContinueCanonicalizationTransformer(gast.NodeTransformer):
     self.continuation_uses = []
 
   def _create_continuation_check(self):
-
-    def template(var_name):
+    template = """
       if not var_name:
         pass
-
-    cond, = templates.replace(
-        template, var_name=gast.Name(self.continuation_uses[-1][1], None, None))
+    """
+    cond, = templates.replace(template, var_name=self.continuation_uses[-1][1])
     cond.body = []
     return cond
 
   def _create_continuation_trigger(self):
-
-    def template(var_name):  # pylint:disable=unused-argument
+    template = """
       var_name = True
-
+    """
     assign, = templates.replace(
-        template, var_name=gast.Name(self.continuation_uses[-1][1], None, None))
+        template, var_name=self.continuation_uses[-1][1])
     return assign
 
   def _create_continuation_init(self):
-
-    def template(var_name):  # pylint:disable=unused-argument
+    template = """
       var_name = False
-
+    """
     assign, = templates.replace(
-        template, var_name=gast.Name(self.continuation_uses[-1][1], None, None))
+        template, var_name=self.continuation_uses[-1][1])
     return assign
 
   def _visit_and_reindent_if_necessary(self, nodes):
diff --git a/tensorflow/contrib/py2tf/convert/continue_canonicalization_test.py b/tensorflow/contrib/py2tf/converters/continue_canonicalization_test.py
similarity index 83%
rename from tensorflow/contrib/py2tf/convert/continue_canonicalization_test.py
rename to tensorflow/contrib/py2tf/converters/continue_canonicalization_test.py
index a041ff4641fef6c6d5cd7c502d1196dde26c55e0..c1fe903a2dd332626c8e64826652723c30ac412a 100644
--- a/tensorflow/contrib/py2tf/convert/continue_canonicalization_test.py
+++ b/tensorflow/contrib/py2tf/converters/continue_canonicalization_test.py
@@ -18,11 +18,10 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.py2tf.convert import continue_canonicalization
-from tensorflow.contrib.py2tf.convert import control_flow
+from tensorflow.contrib.py2tf.converters import continue_canonicalization
+from tensorflow.contrib.py2tf.converters import control_flow
+from tensorflow.contrib.py2tf.converters import converter_test_base
 from tensorflow.contrib.py2tf.pyct import compiler
-from tensorflow.contrib.py2tf.pyct import parser
-from tensorflow.contrib.py2tf.pyct.static_analysis import access
 from tensorflow.python.platform import test
 
 
@@ -32,12 +31,7 @@ class TestNamer(control_flow.SymbolNamer):
     return name_root
 
 
-class ContinueCanonicalizationTest(test.TestCase):
-
-  def _parse_and_analyze(self, test_fn, namespace):
-    node = parser.parse_object(test_fn)
-    node = access.resolve(node)
-    return node
+class ContinueCanonicalizationTest(converter_test_base.TestCase):
 
   def test_basic_continue(self):
 
@@ -50,7 +44,7 @@ class ContinueCanonicalizationTest(test.TestCase):
         v.append(x)
       return v
 
-    node = self._parse_and_analyze(test_fn, {})
+    node = self.parse_and_analyze(test_fn, {}, include_type_analysis=False)
     node = continue_canonicalization.transform(node, TestNamer())
     result = compiler.ast_to_object(node)
 
@@ -71,7 +65,7 @@ class ContinueCanonicalizationTest(test.TestCase):
         v.append(x)
       return v
 
-    node = self._parse_and_analyze(test_fn, {})
+    node = self.parse_and_analyze(test_fn, {}, include_type_analysis=False)
     node = continue_canonicalization.transform(node, TestNamer())
     result = compiler.ast_to_object(node)
 
@@ -97,7 +91,7 @@ class ContinueCanonicalizationTest(test.TestCase):
         v.append(x)
       return v, u, w
 
-    node = self._parse_and_analyze(test_fn, {})
+    node = self.parse_and_analyze(test_fn, {}, include_type_analysis=False)
     node = continue_canonicalization.transform(node, TestNamer())
     result = compiler.ast_to_object(node)
 
diff --git a/tensorflow/contrib/py2tf/convert/control_flow.py b/tensorflow/contrib/py2tf/converters/control_flow.py
similarity index 64%
rename from tensorflow/contrib/py2tf/convert/control_flow.py
rename to tensorflow/contrib/py2tf/converters/control_flow.py
index 8ebd9ad93dbc17814d1d7f53c3eac2e078030141..a40c7b28f7bc3b8483b0b18cf11dbf99456df645 100644
--- a/tensorflow/contrib/py2tf/convert/control_flow.py
+++ b/tensorflow/contrib/py2tf/converters/control_flow.py
@@ -75,29 +75,6 @@ class ControlFlowTransformer(gast.NodeTransformer):
       raise ValueError(
           'The else branch creates new symbols that the if branch does not.')
 
-    def template(  # pylint:disable=missing-docstring
-        test,
-        body_name,
-        body,
-        orelse_name,
-        orelse,
-        aliased,
-        aliases,  # pylint:disable=unused-argument
-        aliased_results,
-        results):  # pylint:disable=unused-argument
-
-      def body_name():  # pylint:disable=function-redefined
-        aliases, = aliased,  # pylint:disable=unused-variable
-        body  # pylint:disable=pointless-statement
-        return (aliased_results,)
-
-      def orelse_name():  # pylint:disable=function-redefined
-        aliases, = aliased,  # pylint:disable=unused-variable
-        orelse  # pylint:disable=pointless-statement
-        return (aliased_results,)
-
-      results = tf.cond(test, body_name, orelse_name)  # pylint:disable=undefined-variable
-
     all_modified = tuple(body_scope.modified | orelse_scope.modified)
     all_referenced = body_scope.referenced | orelse_scope.referenced
 
@@ -107,10 +84,10 @@ class ControlFlowTransformer(gast.NodeTransformer):
     need_alias = (
         (body_scope.modified | orelse_scope.modified) -
         (body_scope.created | orelse_scope.created))
-    aliased = tuple(need_alias)
-    aliases = tuple(
-        self.namer.new_symbol(s, all_referenced) for s in aliased)
-    alias_map = dict(zip(aliased, aliases))
+    aliased_orig_names = tuple(need_alias)
+    aliased_new_names = tuple(
+        self.namer.new_symbol(s, all_referenced) for s in aliased_orig_names)
+    alias_map = dict(zip(aliased_orig_names, aliased_new_names))
     node_body = node.body
     node_body = [SymbolRenamer(alias_map).visit(n) for n in node_body]
     node_orelse = node.orelse
@@ -122,20 +99,29 @@ class ControlFlowTransformer(gast.NodeTransformer):
       results = gast.Tuple(
           tuple(gast.Name(s, None, None) for s in all_modified), None)
 
+    template = """
+      def body_name():
+        aliased_new_names, = aliased_orig_names,
+        body
+        return (all_results,)
+      def orelse_name():
+        aliased_new_names, = aliased_orig_names,
+        orelse
+        return (all_results,)
+      results = tf.cond(test, body_name, orelse_name)
+    """
+    body_name = self.namer.new_symbol('if_true', all_referenced)
     return templates.replace(
         template,
         test=node.test,
-        body_name=gast.Name(
-            self.namer.new_symbol('if_true', all_referenced), None, None),
+        body_name=body_name,
         body=node_body,
-        orelse_name=gast.Name(
-            self.namer.new_symbol('if_false', all_referenced), None, None),
+        orelse_name=self.namer.new_symbol('if_false', all_referenced),
         orelse=node_orelse,
-        aliased=tuple(gast.Name(s, None, None) for s in aliased),
-        aliases=tuple(gast.Name(s, None, None) for s in aliases),
-        aliased_results=tuple(
-            gast.Name(alias_map[s] if s in aliased else s, None, None)
-            for s in all_modified),
+        aliased_orig_names=tuple(aliased_orig_names),
+        aliased_new_names=tuple(aliased_new_names),
+        all_results=tuple(alias_map[s] if s in aliased_orig_names else s
+                          for s in all_modified),
         results=results)
 
   def visit_While(self, node):
@@ -144,38 +130,28 @@ class ControlFlowTransformer(gast.NodeTransformer):
     body_scope = anno.getanno(node, 'body_scope')
     body_closure = tuple(body_scope.modified - body_scope.created)
 
-    def template(
-        state,  # pylint:disable=unused-argument
-        state_ast_tuple,  # pylint:disable=unused-argument
-        test_name,
-        test,  # pylint:disable=unused-argument
-        body_name,
-        body):
-
-      def test_name(state):  # pylint:disable=function-redefined,unused-argument
-        return test
-
-      def body_name(state):  # pylint:disable=function-redefined,unused-argument
-        body  # pylint:disable=pointless-statement
-        return state,
-
-      state_ast_tuple = tf.while_loop(test_name, body_name, [state])  # pylint:disable=undefined-variable
-
-    test_name = self.namer.new_symbol('loop_test', body_scope.referenced)
-    body_name = self.namer.new_symbol('loop_body', body_scope.referenced)
     if len(body_closure) == 1:
-      state = gast.Name(body_closure[0], None, None)
+      state = body_closure[0]
       state_ast_tuple = state
     else:
-      state = tuple(gast.Name(n, None, None) for n in body_closure)
-      state_ast_tuple = gast.Tuple(state, None)
+      state = tuple(body_closure)
+      state_ast_tuple = gast.Tuple(
+          tuple(gast.Name(n, None, None) for n in state), None)
+    template = """
+      def test_name(state):
+        return test
+      def body_name(state):
+        body
+        return state,
+      state_ast_tuple = tf.while_loop(test_name, body_name, [state])
+    """
     node = templates.replace(
         template,
         state=state,
         state_ast_tuple=state_ast_tuple,
-        test_name=gast.Name(test_name, gast.Load(), None),
+        test_name=self.namer.new_symbol('loop_test', body_scope.referenced),
         test=node.test,
-        body_name=gast.Name(body_name, gast.Load(), None),
+        body_name=self.namer.new_symbol('loop_body', body_scope.referenced),
         body=node.body)
 
     return node
diff --git a/tensorflow/contrib/py2tf/convert/control_flow_test.py b/tensorflow/contrib/py2tf/converters/control_flow_test.py
similarity index 79%
rename from tensorflow/contrib/py2tf/convert/control_flow_test.py
rename to tensorflow/contrib/py2tf/converters/control_flow_test.py
index 64a317ee9cbbd41c3257387492ef5fbfb15a6933..054e33750dbae86559a9575dfecde64132b9a2cd 100644
--- a/tensorflow/contrib/py2tf/convert/control_flow_test.py
+++ b/tensorflow/contrib/py2tf/converters/control_flow_test.py
@@ -18,12 +18,9 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.py2tf.convert import control_flow
+from tensorflow.contrib.py2tf.converters import control_flow
+from tensorflow.contrib.py2tf.converters import converter_test_base
 from tensorflow.contrib.py2tf.pyct import compiler
-from tensorflow.contrib.py2tf.pyct import parser
-from tensorflow.contrib.py2tf.pyct.static_analysis import access
-from tensorflow.contrib.py2tf.pyct.static_analysis import live_values
-from tensorflow.contrib.py2tf.pyct.static_analysis import type_info
 from tensorflow.python.framework import constant_op
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.platform import test
@@ -40,14 +37,7 @@ class TestNamer(control_flow.SymbolNamer):
       i += 1
 
 
-class ControlFlowTest(test.TestCase):
-
-  def _parse_and_analyze(self, test_fn, namespace):
-    node = parser.parse_object(test_fn)
-    node = access.resolve(node)
-    node = live_values.resolve(node, namespace, {})
-    node = type_info.resolve(node, None, None, {})
-    return node
+class ControlFlowTest(converter_test_base.TestCase):
 
   def test_simple_while(self):
 
@@ -59,7 +49,7 @@ class ControlFlowTest(test.TestCase):
         i += 1
       return s, i, n
 
-    node = self._parse_and_analyze(test_fn, {})
+    node = self.parse_and_analyze(test_fn, {})
     node = control_flow.transform(node, TestNamer())
     result = compiler.ast_to_object(node)
     setattr(result, 'tf', control_flow_ops)
@@ -75,7 +65,7 @@ class ControlFlowTest(test.TestCase):
         n -= 1
       return n
 
-    node = self._parse_and_analyze(test_fn, {})
+    node = self.parse_and_analyze(test_fn, {})
     node = control_flow.transform(node, TestNamer())
     result = compiler.ast_to_object(node)
     setattr(result, 'tf', control_flow_ops)
@@ -94,7 +84,7 @@ class ControlFlowTest(test.TestCase):
         b = 2 * n
       return a, b
 
-    node = self._parse_and_analyze(test_fn, {})
+    node = self.parse_and_analyze(test_fn, {})
     node = control_flow.transform(node, TestNamer())
     result = compiler.ast_to_object(node)
     setattr(result, 'tf', control_flow_ops)
@@ -112,7 +102,7 @@ class ControlFlowTest(test.TestCase):
         n = -n
       return n
 
-    node = self._parse_and_analyze(test_fn, {})
+    node = self.parse_and_analyze(test_fn, {})
     node = control_flow.transform(node, TestNamer())
     result = compiler.ast_to_object(node)
     setattr(result, 'tf', control_flow_ops)
diff --git a/tensorflow/contrib/py2tf/converters/converter_test_base.py b/tensorflow/contrib/py2tf/converters/converter_test_base.py
new file mode 100644
index 0000000000000000000000000000000000000000..6bfa55443c4fb09bdac4872d2fd2a9aefda74298
--- /dev/null
+++ b/tensorflow/contrib/py2tf/converters/converter_test_base.py
@@ -0,0 +1,53 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Base class for tests in this module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.py2tf.pyct import context
+from tensorflow.contrib.py2tf.pyct import parser
+from tensorflow.contrib.py2tf.pyct.static_analysis import access
+from tensorflow.contrib.py2tf.pyct.static_analysis import live_values
+from tensorflow.contrib.py2tf.pyct.static_analysis import type_info
+from tensorflow.python.platform import test
+
+
+class TestCase(test.TestCase):
+
+  def parse_and_analyze(self,
+                        test_fn,
+                        namespace,
+                        namer=None,
+                        arg_types=None,
+                        include_type_analysis=True,
+                        recursive=True):
+    node, source = parser.parse_entity(test_fn)
+    ctx = context.EntityContext(
+        namer=namer,
+        source_code=source,
+        source_file=None,
+        namespace=namespace,
+        arg_values=None,
+        arg_types=arg_types,
+        recursive=recursive)
+    node = access.resolve(node, ctx)
+    node = live_values.resolve(node, ctx, {})
+    if include_type_analysis:
+      node = type_info.resolve(node, ctx)
+      node = live_values.resolve(node, ctx, {})
+    self.ctx = ctx
+    return node
diff --git a/tensorflow/contrib/py2tf/convert/decorators.py b/tensorflow/contrib/py2tf/converters/decorators.py
similarity index 75%
rename from tensorflow/contrib/py2tf/convert/decorators.py
rename to tensorflow/contrib/py2tf/converters/decorators.py
index a4313bfa510a81463a218cd21b41d9a7f43d1892..3f620c1cd2d9b75f82410754a7e812e13eabe3ae 100644
--- a/tensorflow/contrib/py2tf/convert/decorators.py
+++ b/tensorflow/contrib/py2tf/converters/decorators.py
@@ -12,7 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Handles decorators."""
+"""Handles decorators.
+
+Note: this module only deals with functions whose decorators are still recorded
+in the AST. This does not always happen. See the unit test for an example.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -34,17 +38,19 @@ class DecoratorsTransformer(gast.NodeTransformer):
 
   def visit_FunctionDef(self, node):
     self.generic_visit(node)
+    kept_decorators = []
     for dec in node.decorator_list:
       if isinstance(dec, gast.Call):
-        dec = dec.func
-      if not anno.hasanno(dec, 'live_val'):
+        dec_func = dec.func
+      else:
+        dec_func = dec
+      if not anno.hasanno(dec_func, 'live_val'):
         raise ValueError(
-            'Could not resolve decorator: %s' % pretty_printer.fmt(dec))
-      dec_value = anno.getanno(dec, 'live_val')
-      if dec_value in self.remove_decorators:
-        continue
-      raise ValueError('Dont know how to convert decorators for now.')
-    node.decorator_list = []
+            'Could not resolve decorator: %s' % pretty_printer.fmt(dec_func))
+      dec_value = anno.getanno(dec_func, 'live_val')
+      if dec_value not in self.remove_decorators:
+        kept_decorators.append(dec)
+    node.decorator_list = kept_decorators
     return node
 
   # pylint:enable=invalid-name
diff --git a/tensorflow/contrib/py2tf/converters/decorators_test.py b/tensorflow/contrib/py2tf/converters/decorators_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..f50d593043aeb76d63beb3cb6c301122c9ed8948
--- /dev/null
+++ b/tensorflow/contrib/py2tf/converters/decorators_test.py
@@ -0,0 +1,96 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for decorators module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import textwrap
+
+from tensorflow.contrib.py2tf.converters import converter_test_base
+from tensorflow.contrib.py2tf.converters import decorators
+from tensorflow.contrib.py2tf.pyct import compiler
+from tensorflow.python.platform import test
+from tensorflow.python.util import tf_inspect
+
+
+class DecoratorsTest(converter_test_base.TestCase):
+
+  def test_function_decorator(self):
+
+    def function_decorator():
+
+      def decorator(f):
+        return lambda a: f(a) + 1
+
+      return decorator
+
+    # The Python parser does capture decorators into the AST.
+    # However, the interpreter desugars them on load, and refering to the
+    # decorated function at runtime usually loses any trace of the decorator.
+    # Below is an example when that doesn't happen.
+    def static_wrapper():
+
+      @function_decorator()
+      def test_fn(a):  # pylint:disable=unused-variable
+        return a
+
+    node = self.parse_and_analyze(static_wrapper,
+                                  {'function_decorator': function_decorator})
+    node = node.body[0].body[0]
+
+    node = decorators.transform(node, remove_decorators=())
+    result = compiler.ast_to_object(
+        node,
+        source_prefix=textwrap.dedent(tf_inspect.getsource(function_decorator)))
+    self.assertEqual(2, result.test_fn(1))
+
+    node = decorators.transform(node, remove_decorators=(function_decorator,))
+    result = compiler.ast_to_object(node)
+    self.assertEqual(1, result.test_fn(1))
+
+  def test_simple_decorator(self):
+
+    def simple_decorator(f):
+      return lambda a: f(a) + 1
+
+    # The Python parser does capture decorators into the AST.
+    # However, the interpreter desugars them upon load, and refering to the
+    # decorated function at runtime usually loses any trace of the decorator.
+    # Below is an example when that doesn't happen.
+    def static_wrapper():
+
+      @simple_decorator
+      def test_fn(a):  # pylint:disable=unused-variable
+        return a
+
+    node = self.parse_and_analyze(static_wrapper,
+                                  {'simple_decorator': simple_decorator})
+    node = node.body[0].body[0]
+
+    node = decorators.transform(node, remove_decorators=())
+    result = compiler.ast_to_object(
+        node,
+        source_prefix=textwrap.dedent(tf_inspect.getsource(simple_decorator)))
+    self.assertEqual(2, result.test_fn(1))
+
+    node = decorators.transform(node, remove_decorators=(simple_decorator,))
+    result = compiler.ast_to_object(node)
+    self.assertEqual(1, result.test_fn(1))
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/py2tf/convert/for_canonicalization.py b/tensorflow/contrib/py2tf/converters/for_canonicalization.py
similarity index 77%
rename from tensorflow/contrib/py2tf/convert/for_canonicalization.py
rename to tensorflow/contrib/py2tf/converters/for_canonicalization.py
index 52360789cdc25528d925092e3e269c9968f2022f..c284689b904c6f372f30e83c259416a51babe4a6 100644
--- a/tensorflow/contrib/py2tf/convert/for_canonicalization.py
+++ b/tensorflow/contrib/py2tf/converters/for_canonicalization.py
@@ -42,46 +42,40 @@ class ForLoopCanonicalizationTransformer(gast.NodeTransformer):
     # Or maybe we should replace range with tf.range?
 
     if anno.hasanno(node, 'extra_cond'):
-
-      def template(loop_iter, target, body, i, n, extra_cond):  # pylint:disable=unused-argument
+      template = """
         i = 0
-        n = len(loop_iter)  # pylint:disable=undefined-variable
+        n = len(loop_iter)
         while i < n and extra_cond:
           # TODO(mdan): Use TensorListFromTensor(loop_iter) here.
           target = loop_iter[i]
-          body  # pylint:disable=pointless-statement
+          body
           i += 1
-
+      """
       return templates.replace(
           template,
           loop_iter=node.iter,
           target=node.target,
           body=node.body,
-          i=gast.Name(
-              self.namer.new_symbol('i', body_scope.referenced), None, None),
-          n=gast.Name(
-              self.namer.new_symbol('n', body_scope.referenced), None, None),
+          i=self.namer.new_symbol('i', body_scope.referenced),
+          n=self.namer.new_symbol('n', body_scope.referenced),
           extra_cond=anno.getanno(node, 'extra_cond'))
     else:
-
-      def template(loop_iter, target, body, i, n):  # pylint:disable=unused-argument
+      template = """
         i = 0
-        n = len(loop_iter)  # pylint:disable=undefined-variable
+        n = len(loop_iter)
         while i < n:
           # TODO(mdan): Use TensorListFromTensor(loop_iter) here.
           target = loop_iter[i]
           body  # pylint:disable=pointless-statement
           i += 1
-
+      """
       return templates.replace(
           template,
           loop_iter=node.iter,
           target=node.target,
           body=node.body,
-          i=gast.Name(
-              self.namer.new_symbol('i', body_scope.referenced), None, None),
-          n=gast.Name(
-              self.namer.new_symbol('n', body_scope.referenced), None, None))
+          i=self.namer.new_symbol('i', body_scope.referenced),
+          n=self.namer.new_symbol('n', body_scope.referenced))
 
   def visit_Continue(self, node):
     assert False, 'continue statement should be desugared at this point'
diff --git a/tensorflow/contrib/py2tf/convert/for_canonicalization_test.py b/tensorflow/contrib/py2tf/converters/for_canonicalization_test.py
similarity index 75%
rename from tensorflow/contrib/py2tf/convert/for_canonicalization_test.py
rename to tensorflow/contrib/py2tf/converters/for_canonicalization_test.py
index 8de2d1a0f82cbb2f995a83fcdc1521ebf172e1ce..a6e6350fd45e9c9575af9c12d3d0c4e9b89bee41 100644
--- a/tensorflow/contrib/py2tf/convert/for_canonicalization_test.py
+++ b/tensorflow/contrib/py2tf/converters/for_canonicalization_test.py
@@ -18,11 +18,10 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.py2tf.convert import control_flow
-from tensorflow.contrib.py2tf.convert import for_canonicalization
+from tensorflow.contrib.py2tf.converters import control_flow
+from tensorflow.contrib.py2tf.converters import converter_test_base
+from tensorflow.contrib.py2tf.converters import for_canonicalization
 from tensorflow.contrib.py2tf.pyct import compiler
-from tensorflow.contrib.py2tf.pyct import parser
-from tensorflow.contrib.py2tf.pyct.static_analysis import access
 from tensorflow.python.platform import test
 
 
@@ -32,12 +31,7 @@ class TestNamer(control_flow.SymbolNamer):
     return name_root
 
 
-class ControlFlowTest(test.TestCase):
-
-  def _parse_and_analyze(self, test_fn, namespace):
-    node = parser.parse_object(test_fn)
-    node = access.resolve(node)
-    return node
+class ControlFlowTest(converter_test_base.TestCase):
 
   def test_basic_for(self):
 
@@ -47,7 +41,7 @@ class ControlFlowTest(test.TestCase):
         s += e
       return s
 
-    node = self._parse_and_analyze(test_fn, {})
+    node = self.parse_and_analyze(test_fn, {})
     node = for_canonicalization.transform(node, TestNamer())
     result = compiler.ast_to_object(node)
 
diff --git a/tensorflow/contrib/py2tf/convert/logical_expressions.py b/tensorflow/contrib/py2tf/converters/logical_expressions.py
similarity index 100%
rename from tensorflow/contrib/py2tf/convert/logical_expressions.py
rename to tensorflow/contrib/py2tf/converters/logical_expressions.py
diff --git a/tensorflow/contrib/py2tf/convert/logical_expressions_test.py b/tensorflow/contrib/py2tf/converters/logical_expressions_test.py
similarity index 85%
rename from tensorflow/contrib/py2tf/convert/logical_expressions_test.py
rename to tensorflow/contrib/py2tf/converters/logical_expressions_test.py
index f07fa017b9dacd7a998f04fa7f6fdd83fccb1811..d711065099b24ad814104e6460e6ca551b31b3e6 100644
--- a/tensorflow/contrib/py2tf/convert/logical_expressions_test.py
+++ b/tensorflow/contrib/py2tf/converters/logical_expressions_test.py
@@ -18,21 +18,21 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.py2tf.convert import logical_expressions
+from tensorflow.contrib.py2tf.converters import converter_test_base
+from tensorflow.contrib.py2tf.converters import logical_expressions
 from tensorflow.contrib.py2tf.pyct import compiler
-from tensorflow.contrib.py2tf.pyct import parser
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
 
-class GradientsFunctionTest(test.TestCase):
+class GradientsFunctionTest(converter_test_base.TestCase):
 
   def test_equals(self):
 
     def test_fn(a, b):
       return a == b
 
-    node = parser.parse_object(test_fn)
+    node = self.parse_and_analyze(test_fn, {})
     node = logical_expressions.transform(node)
     result = compiler.ast_to_object(node)
     setattr(result, 'tf', math_ops)
@@ -46,7 +46,7 @@ class GradientsFunctionTest(test.TestCase):
     def test_fn(a, b, c):
       return (a or b) and (a or b or c)
 
-    node = parser.parse_object(test_fn)
+    node = self.parse_and_analyze(test_fn, {})
     node = logical_expressions.transform(node)
     result = compiler.ast_to_object(node)
     setattr(result, 'tf', math_ops)
diff --git a/tensorflow/contrib/py2tf/convert/print_functions.py b/tensorflow/contrib/py2tf/converters/print_functions.py
similarity index 100%
rename from tensorflow/contrib/py2tf/convert/print_functions.py
rename to tensorflow/contrib/py2tf/converters/print_functions.py
diff --git a/tensorflow/contrib/py2tf/convert/print_functions_test.py b/tensorflow/contrib/py2tf/converters/print_functions_test.py
similarity index 65%
rename from tensorflow/contrib/py2tf/convert/print_functions_test.py
rename to tensorflow/contrib/py2tf/converters/print_functions_test.py
index 8b6c238aa4718ecfcd32679ff73c9694ce02e2de..475196ce102955b350acf9bf94255997f875f62c 100644
--- a/tensorflow/contrib/py2tf/convert/print_functions_test.py
+++ b/tensorflow/contrib/py2tf/converters/print_functions_test.py
@@ -20,30 +20,20 @@ from __future__ import print_function
 
 import gast
 
-from tensorflow.contrib.py2tf.convert import print_functions
+from tensorflow.contrib.py2tf.converters import converter_test_base
+from tensorflow.contrib.py2tf.converters import print_functions
 from tensorflow.contrib.py2tf.pyct import compiler
-from tensorflow.contrib.py2tf.pyct import parser
-from tensorflow.contrib.py2tf.pyct.static_analysis import access
-from tensorflow.contrib.py2tf.pyct.static_analysis import live_values
-from tensorflow.contrib.py2tf.pyct.static_analysis import type_info
 from tensorflow.python.platform import test
 
 
-class PrintFunctionsTest(test.TestCase):
-
-  def _parse_and_analyze(self, test_fn, namespace):
-    node = parser.parse_object(test_fn)
-    node = access.resolve(node)
-    node = live_values.resolve(node, namespace, {})
-    node = type_info.resolve(node, None, None, {})
-    return node
+class PrintFunctionsTest(converter_test_base.TestCase):
 
   def test_transform(self):
 
     def test_fn(a):
       print(a)
 
-    node = self._parse_and_analyze(test_fn, {'print': print})
+    node = self.parse_and_analyze(test_fn, {'print': print})
     node = print_functions.transform(node)
     result = compiler.ast_to_object(node)
 
diff --git a/tensorflow/contrib/py2tf/convert/side_effect_guards.py b/tensorflow/contrib/py2tf/converters/side_effect_guards.py
similarity index 79%
rename from tensorflow/contrib/py2tf/convert/side_effect_guards.py
rename to tensorflow/contrib/py2tf/converters/side_effect_guards.py
index 1f25303fbac1184d016a63d629ba2ecf17d7e426..1eda8ae630aef0f99ca21e134ab61360e6c8dd52 100644
--- a/tensorflow/contrib/py2tf/convert/side_effect_guards.py
+++ b/tensorflow/contrib/py2tf/converters/side_effect_guards.py
@@ -34,6 +34,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from contextlib import contextmanager
+
 import gast
 
 from tensorflow.contrib.py2tf.pyct import anno
@@ -94,12 +96,11 @@ class SideEffectGuardTransformer(gast.NodeTransformer):
     return node
 
   def _gate_symbols(self, guard_statement, guarded_args):
-
-    def template(args):  # pylint:disable=unused-argument
-      (args,) = (tf.identity(a) for a in (args,))  # pylint:disable=undefined-variable
-
-    guards = templates.replace(
-        template, args=tuple(gast.Name(a, None, None) for a in guarded_args))
+    # TODO(mdan): This won't work for variables.
+    template = """
+      (args,) = (tf.identity(a) for a in (args,))
+    """
+    guards = templates.replace(template, args=tuple(guarded_args))
     guard_statement.body.extend(guards)
     return guard_statement
 
@@ -110,37 +111,22 @@ class SideEffectGuardTransformer(gast.NodeTransformer):
       #   opt.minimize(loss)
       # or:
       #   tf.py_func(...)
-
-      args_scope = anno.getanno(node.value, 'args_scope')
-      temp_name = self.namer.new_symbol('temp', args_scope.parent.referenced)
-      # TODO(mdan): Unsafe reference modification!
-      args_scope.mark_write(temp_name)
-
-      def template(call, temp_result):
-        temp_result = call
-        if temp_result is not None:
-          if not isinstance(temp_result, (list, tuple)):
-            temp_result = (temp_result,)
-          ctx = tf.control_dependencies(temp_result)  # pylint:disable=undefined-variable
-        else:
-          ctx = contextmanager(lambda: (yield))()  # pylint:disable=undefined-variable
-        with ctx:
-          # TODO(mdan): Also insert ops to re-fetch if variables are involved.
+      template = """
+        with py2tf_utils.control_dependency_on_returns(tf, call):
+          # TODO(mdan): Also insert ops to re-fetch if variables are involved?
           pass  # Will be removed below.
-
-      # TODO(mdan): This is brittle. Reorganize this mechanism.
-      statements = templates.replace(
-          template,
-          call=node.value,
-          temp_result=gast.Name(temp_name, None, None))
+      """
+      # TODO(mdan): This is brittle. Reorganize the mechanism.
+      statements = templates.replace(template, call=node.value)
       control_deps_guard = statements[-1]
       control_deps_guard.body = []
 
       # First, attempt to gate future evaluation of args. If that's not
       # possible, gate all remaining statements (and that may fail too, see
       # _visit_and_reindent.
-      guarded_args = tuple(
-          n for n in args_scope.used if n in args_scope.parent.modified)
+      args_scope = anno.getanno(node.value, 'args_scope')
+      guarded_args = tuple(args_scope.used & (args_scope.parent.modified
+                                              | args_scope.parent.returned))
       if guarded_args:
         node = tuple(statements[:-1]) + (
             self._gate_symbols(control_deps_guard, guarded_args),)
diff --git a/tensorflow/contrib/py2tf/convert/side_effect_guards_test.py b/tensorflow/contrib/py2tf/converters/side_effect_guards_test.py
similarity index 72%
rename from tensorflow/contrib/py2tf/convert/side_effect_guards_test.py
rename to tensorflow/contrib/py2tf/converters/side_effect_guards_test.py
index 1715e9eb95b3016054ec15a4db141bf84ceba9d8..452d7ab2be8d846be02ef760441896099ab36d53 100644
--- a/tensorflow/contrib/py2tf/convert/side_effect_guards_test.py
+++ b/tensorflow/contrib/py2tf/converters/side_effect_guards_test.py
@@ -18,12 +18,10 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.py2tf.convert import side_effect_guards
+from tensorflow.contrib.py2tf import utils
+from tensorflow.contrib.py2tf.converters import converter_test_base
+from tensorflow.contrib.py2tf.converters import side_effect_guards
 from tensorflow.contrib.py2tf.pyct import compiler
-from tensorflow.contrib.py2tf.pyct import parser
-from tensorflow.contrib.py2tf.pyct.static_analysis import access
-from tensorflow.contrib.py2tf.pyct.static_analysis import live_values
-from tensorflow.contrib.py2tf.pyct.static_analysis import type_info
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import state_ops
@@ -37,14 +35,7 @@ class TestNamer(side_effect_guards.SymbolNamer):
     return name_root
 
 
-class SideEffectGuardsTest(test.TestCase):
-
-  def _parse_and_analyze(self, test_fn, namespace):
-    node = parser.parse_object(test_fn)
-    node = access.resolve(node)
-    node = live_values.resolve(node, namespace, {})
-    node = type_info.resolve(node, None, None, {})
-    return node
+class SideEffectGuardsTest(converter_test_base.TestCase):
 
   def test_transform(self):
 
@@ -52,10 +43,11 @@ class SideEffectGuardsTest(test.TestCase):
       state_ops.assign(a, a + 1)
       return a
 
-    node = self._parse_and_analyze(test_fn, {'state_ops': state_ops})
+    node = self.parse_and_analyze(test_fn, {'state_ops': state_ops})
     node = side_effect_guards.transform(node, TestNamer())
     result = compiler.ast_to_object(node)
     setattr(result, 'state_ops', state_ops)
+    setattr(result, 'py2tf_utils', utils)
 
     # TODO(mdan): Configure the namespaces instead of doing these hacks.
     ops.identity = array_ops.identity
diff --git a/tensorflow/contrib/py2tf/impl/BUILD b/tensorflow/contrib/py2tf/impl/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..22f0c25cabcd44261c0b42091b50969500db2193
--- /dev/null
+++ b/tensorflow/contrib/py2tf/impl/BUILD
@@ -0,0 +1,65 @@
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "py_test")
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
+
+py_library(
+    name = "impl",
+    srcs = [
+        "api.py",
+        "config.py",
+        "conversion.py",
+        "naming.py",
+    ],
+    srcs_version = "PY2AND3",
+    visibility = ["//tensorflow:__subpackages__"],
+    deps = [
+        "//tensorflow/contrib/py2tf/converters",
+        "//tensorflow/contrib/py2tf/pyct",
+        "//tensorflow/contrib/py2tf/pyct/static_analysis",
+        "@gast_archive//:gast",
+        "@six_archive//:six",
+    ],
+)
+
+py_test(
+    name = "api_test",
+    srcs = ["api_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":impl",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+py_test(
+    name = "conversion_test",
+    srcs = ["conversion_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":impl",
+        "//tensorflow/python:client_testlib",
+        "@gast_archive//:gast",
+    ],
+)
+
+py_test(
+    name = "naming_test",
+    srcs = ["naming_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":impl",
+        "//tensorflow/python:client_testlib",
+    ],
+)
diff --git a/tensorflow/contrib/py2tf/api.py b/tensorflow/contrib/py2tf/impl/api.py
similarity index 76%
rename from tensorflow/contrib/py2tf/api.py
rename to tensorflow/contrib/py2tf/impl/api.py
index 0fd833e29ca39436edf9062c94b7ab1a04c9ba6d..85d40f31580d156bf719e059bb3580a068595cb5 100644
--- a/tensorflow/contrib/py2tf/api.py
+++ b/tensorflow/contrib/py2tf/impl/api.py
@@ -23,8 +23,8 @@ from functools import wraps
 import gast
 import six
 
-from tensorflow.contrib.py2tf import config
-from tensorflow.contrib.py2tf import conversion
+from tensorflow.contrib.py2tf.impl import config
+from tensorflow.contrib.py2tf.impl import conversion
 from tensorflow.contrib.py2tf.pyct import compiler
 from tensorflow.contrib.py2tf.pyct import parser
 from tensorflow.python.util import tf_inspect
@@ -83,17 +83,16 @@ def convert_inline(f, *args, **kwargs):
   return convert(arg_value_hints)(f)(*args, **kwargs)
 
 
-def convert(recursive=False, arg_value_hints=None):
+def convert(recursive=False, arg_types=None):
   """Decorator that compiles a function to graph mode.
 
-  The decorator is dynamic - invoking compilation whenever the decorated function
-  is called. This means the parameter values are known at compilation.
+  The decorator is dynamic - invoking compilation whenever the decorated
+  function is called. This means the parameter values are known at compilation.
 
   Args:
     recursive: Whether to recusrively convert any functions that the decorator
         function may call.
-    arg_value_hints: A dict mapping parameter names to objects that can hint
-        at the type of those parameters.
+    arg_types: See to_graph.
 
   Returns:
     A decorator that compiles the given function to graph mode.
@@ -101,8 +100,8 @@ def convert(recursive=False, arg_value_hints=None):
   Raises:
     ValueError: If any of the arguments are illegal.
   """
-  if arg_value_hints is None:
-    arg_value_hints = {}
+  if arg_types is None:
+    arg_types = {}
 
   def decorator(f):
     """Decorator implementation."""
@@ -111,22 +110,23 @@ def convert(recursive=False, arg_value_hints=None):
     def wrapper(*args, **kwargs):
       """Wrapper that calls the compiled version of the wrapped function."""
       partial_types = ()
+      arg_values = {}
       arg_names = tf_inspect.getargspec(f)[0]
       for name, arg in zip(arg_names, args):
+        arg_values[name] = arg
         arg_class = arg.__class__
-        if tf_inspect.isclass(arg_class):
-          # If arg_value_hints specifies any name, use that instead.
-          # TODO(mdan): Shouldn't this just be in the func's globals?
-          if name not in arg_value_hints:
-            arg_value_hints[name] = (arg_class.__name__, arg_class)
+        # If arg_value_hints specifies any name, use that instead.
+        if name not in arg_types:
+          arg_types[name] = (arg_class.__name__, arg_class)
+        if name == 'self' and tf_inspect.isclass(arg_class):
           # Annotated methods need to specify that their owner type is partial,
           # otherwise other members they call will not be converted.
-          if name == 'self':
-            partial_types = (arg_class,)
+          partial_types = (arg_class,)
       wrapped = to_graph(
           f,
           recursive=recursive,
-          arg_value_hints=arg_value_hints,
+          arg_values=arg_values,
+          arg_types=arg_types,
           partial_types=partial_types)
       return wrapped(*args, **kwargs)
 
@@ -138,7 +138,11 @@ def convert(recursive=False, arg_value_hints=None):
   return decorator
 
 
-def to_graph(o, recursive=True, arg_value_hints=None, partial_types=None):
+def to_graph(e,
+             recursive=True,
+             arg_values=None,
+             arg_types=None,
+             partial_types=None):
   """Compile a Python entity into equivalent TensorFlow code.
 
   Currently supported entities:
@@ -148,11 +152,13 @@ def to_graph(o, recursive=True, arg_value_hints=None, partial_types=None):
   Classes are handled by converting all their methods into a new class.
 
   Args:
-    o: A Python function or class.
+    e: A Python entity.
     recursive: Whether to recusrively convert any functions that the decorator
         function may call.
-    arg_value_hints: A dict mapping parameter names to objects that can hint
-        at the type of those parameters.
+    arg_values: A dict containing value hints for symbols like function
+        parameters.
+    arg_types: A dict containing type hints for symbols like function
+        parameters.
     partial_types: A set of types (e.g. classes) that will not be converted
         entirely. Calls to member functions for these types will be renamed
         independently.
@@ -165,7 +171,7 @@ def to_graph(o, recursive=True, arg_value_hints=None, partial_types=None):
       recursive=recursive,
       nocompile_decorators=(convert, graph_ready, convert_inline),
       partial_types=partial_types)
-  _, name = conversion.object_to_graph(o, conversion_map, arg_value_hints)
+  _, name = conversion.entity_to_graph(e, conversion_map, arg_values, arg_types)
 
   module = gast.Module([])
   for import_line in config.COMPILED_IMPORT_STATEMENTS:
@@ -176,16 +182,17 @@ def to_graph(o, recursive=True, arg_value_hints=None, partial_types=None):
 
   # The compiled code should see everything the entry function saw.
   # TODO(mdan): This might not work well if the call tree spans modules?
-  if tf_inspect.isfunction(o):
-    compiled_node.__dict__.update(six.get_function_globals(o))
+  if tf_inspect.isfunction(e):
+    compiled_node.__dict__.update(six.get_function_globals(e))
 
   compiled_fn = getattr(compiled_node, name)
   return compiled_fn
 
 
-def to_code(o,
+def to_code(e,
             recursive=True,
-            arg_value_hints=None,
+            arg_values=None,
+            arg_types=None,
             partial_types=None,
             indentation='  '):
   """Return the equivalent of an entity in TensorFlow code.
@@ -193,14 +200,11 @@ def to_code(o,
   See `to_graph` for more details.
 
   Args:
-    o: A Python function or class.
-    recursive: Whether to recusrively convert any functions that the decorator
-        function may call.
-    arg_value_hints: A dict mapping parameter names to objects that can hint
-        at the type of those parameters.
-    partial_types: A set of types (e.g. classes) that will not be converted
-        entirely. Calls to member functions for these types will be renamed
-        independently.
+    e: A Python entity.
+    recursive: See to_graph.
+    arg_values: See to_graph.
+    arg_types: See to_graph.
+    partial_types: See to_graph.
     indentation: String, when to use for each level of indentation.
 
   Returns:
@@ -210,7 +214,7 @@ def to_code(o,
       recursive=recursive,
       nocompile_decorators=(convert, graph_ready, convert_inline),
       partial_types=partial_types)
-  conversion.object_to_graph(o, conversion_map, arg_value_hints)
+  conversion.entity_to_graph(e, conversion_map, arg_values, arg_types)
 
   imports = '\n'.join(config.COMPILED_IMPORT_STATEMENTS)
   code = '\n'.join(
diff --git a/tensorflow/contrib/py2tf/api_test.py b/tensorflow/contrib/py2tf/impl/api_test.py
similarity index 98%
rename from tensorflow/contrib/py2tf/api_test.py
rename to tensorflow/contrib/py2tf/impl/api_test.py
index 2384447708d7e0ab5dbfbeb592a47353f1909f50..dbd079a3ca6d09824f24c6f0bd7647758d3a5552 100644
--- a/tensorflow/contrib/py2tf/api_test.py
+++ b/tensorflow/contrib/py2tf/impl/api_test.py
@@ -18,8 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.py2tf import api
-from tensorflow.contrib.py2tf import config
+from tensorflow.contrib.py2tf.impl import api
+from tensorflow.contrib.py2tf.impl import config
 from tensorflow.contrib.py2tf.pyct import parser
 from tensorflow.python.framework import constant_op
 from tensorflow.python.ops import math_ops
diff --git a/tensorflow/contrib/py2tf/config.py b/tensorflow/contrib/py2tf/impl/config.py
similarity index 83%
rename from tensorflow/contrib/py2tf/config.py
rename to tensorflow/contrib/py2tf/impl/config.py
index 0a9d52136eab494907992db0b6ad0cebcc1985ac..6525806a0933dd9f0a237e278bb70b88346bea27 100644
--- a/tensorflow/contrib/py2tf/config.py
+++ b/tensorflow/contrib/py2tf/impl/config.py
@@ -22,6 +22,7 @@ PYTHON_LITERALS = {
     'None': None,
     'False': False,
     'True': True,
+    'float': float,
 }
 
 DEFAULT_UNCOMPILED_MODULES = set((
@@ -31,7 +32,9 @@ DEFAULT_UNCOMPILED_MODULES = set((
 NO_SIDE_EFFECT_CONSTRUCTORS = set(('tensorflow',))
 
 # TODO(mdan): Also allow controlling the generated names (for testability).
+# TODO(mdan): Verify that these names are not hidden by generated code.
+# TODO(mdan): Make sure copybara renames the reference below.
 COMPILED_IMPORT_STATEMENTS = (
-    'from contextlib import contextmanager',
     'import tensorflow as tf',
-)
+    'from tensorflow.contrib.py2tf import utils as '
+    'py2tf_utils')
diff --git a/tensorflow/contrib/py2tf/conversion.py b/tensorflow/contrib/py2tf/impl/conversion.py
similarity index 56%
rename from tensorflow/contrib/py2tf/conversion.py
rename to tensorflow/contrib/py2tf/impl/conversion.py
index 38f1c0a14a7d9347273702679686e978e0192c03..ed71ff5c063b0ef3a3f78a3f68847c7099c96e90 100644
--- a/tensorflow/contrib/py2tf/conversion.py
+++ b/tensorflow/contrib/py2tf/impl/conversion.py
@@ -21,18 +21,20 @@ from __future__ import print_function
 import gast
 import six
 
-from tensorflow.contrib.py2tf import config
-from tensorflow.contrib.py2tf import naming
-from tensorflow.contrib.py2tf.convert import break_canonicalization
-from tensorflow.contrib.py2tf.convert import builtin_functions
-from tensorflow.contrib.py2tf.convert import call_trees
-from tensorflow.contrib.py2tf.convert import continue_canonicalization
-from tensorflow.contrib.py2tf.convert import control_flow
-from tensorflow.contrib.py2tf.convert import decorators
-from tensorflow.contrib.py2tf.convert import for_canonicalization
-from tensorflow.contrib.py2tf.convert import logical_expressions
-from tensorflow.contrib.py2tf.convert import print_functions
-from tensorflow.contrib.py2tf.convert import side_effect_guards
+from tensorflow.contrib.py2tf.converters import asserts
+from tensorflow.contrib.py2tf.converters import break_canonicalization
+from tensorflow.contrib.py2tf.converters import builtin_functions
+from tensorflow.contrib.py2tf.converters import call_trees
+from tensorflow.contrib.py2tf.converters import continue_canonicalization
+from tensorflow.contrib.py2tf.converters import control_flow
+from tensorflow.contrib.py2tf.converters import decorators
+from tensorflow.contrib.py2tf.converters import for_canonicalization
+from tensorflow.contrib.py2tf.converters import logical_expressions
+from tensorflow.contrib.py2tf.converters import print_functions
+from tensorflow.contrib.py2tf.converters import side_effect_guards
+from tensorflow.contrib.py2tf.impl import config
+from tensorflow.contrib.py2tf.impl import naming
+from tensorflow.contrib.py2tf.pyct import context
 from tensorflow.contrib.py2tf.pyct import parser
 from tensorflow.contrib.py2tf.pyct.static_analysis import access
 from tensorflow.contrib.py2tf.pyct.static_analysis import live_values
@@ -51,9 +53,9 @@ class ConversionMap(object):
         function may call.
     nocompile_decorators: tuple of decorator functions that toggle compilation
         off.
-    dependency_cache: dict[object]: ast; maps original objects to their
+    dependency_cache: dict[object]: ast; maps original entities to their
         converted AST
-    name_map: dict[string]: string; maps original objects to the name of
+    name_map: dict[string]: string; maps original entities to the name of
         their converted counterparts
   """
 
@@ -66,8 +68,8 @@ class ConversionMap(object):
     self.dependency_cache = {}
     self.name_map = {}
 
-  def new_namer(self, global_symbols):
-    return naming.Namer(global_symbols, self.recursive, self.name_map,
+  def new_namer(self, namespace):
+    return naming.Namer(namespace, self.recursive, self.name_map,
                         self.partial_types)
 
   def update_name_map(self, namer):
@@ -76,48 +78,47 @@ class ConversionMap(object):
         if self.name_map[o] != name:
           raise ValueError(
               'Calls to %s were converted using multiple names (%s). This is '
-              'possible when an object with one of these names already '
+              'possible when an entity with one of these names already '
               'existed. To fix, avoid using any of these names.')
       else:
         self.name_map[o] = name
 
-  def add_to_cache(self, original_object, converted_ast):
-    self.dependency_cache[original_object] = converted_ast
+  def add_to_cache(self, original_entity, converted_ast):
+    self.dependency_cache[original_entity] = converted_ast
 
 
-def object_to_graph(o, conversion_map, value_hints):
-  """Compile a Python object into equivalent TensorFlow.
+def entity_to_graph(o, conversion_map, arg_values, arg_types):
+  """Compile a Python entity into equivalent TensorFlow.
 
-  The function will also recursively compile all the objects that `o`
+  The function will also recursively compile all the entities that `o`
   references, updating `dependency_cache`.
 
   This function is reentrant, and relies on dependency_cache to avoid
   generating duplicate code.
 
   Args:
-    o: A Python object.
+    o: A Python entity.
     conversion_map: A ConversionMap object.
-    value_hints: A dict containing value hints for symbols like function
+    arg_values: A dict containing value hints for symbols like function
+        parameters.
+    arg_types: A dict containing type hints for symbols like function
         parameters.
 
   Returns:
     A tuple (ast, new_name):
-        * ast: An AST representing an object with interface equivalent to `o`,
+        * ast: An AST representing an entity with interface equivalent to `o`,
             but which when executed it creates TF a graph.
-        * new_name: The symbol name under which the new object can be found.
+        * new_name: The symbol name under which the new entity can be found.
 
   Raises:
-    ValueError: if the object is not supported.
+    ValueError: if the entity type is not supported.
   """
-  if value_hints is None:
-    value_hints = {}
-
   if tf_inspect.isclass(o):
-    node, new_name = class_to_graph(o, conversion_map, value_hints)
+    node, new_name = class_to_graph(o, conversion_map)
   elif tf_inspect.isfunction(o):
-    node, new_name = function_to_graph(o, conversion_map, value_hints)
+    node, new_name = function_to_graph(o, conversion_map, arg_values, arg_types)
   elif tf_inspect.ismethod(o):
-    node, new_name = function_to_graph(o, conversion_map, value_hints)
+    node, new_name = function_to_graph(o, conversion_map, arg_values, arg_types)
   else:
     raise ValueError(
         'Entity "%s" has unsupported type "%s". Only functions and classes are '
@@ -132,25 +133,26 @@ def object_to_graph(o, conversion_map, value_hints):
           # Class members are converted with their objects, unless they're
           # only converted partially.
           continue
-        object_to_graph(obj, conversion_map, None)
+        entity_to_graph(obj, conversion_map, {}, {})
 
   return node, new_name
 
 
-def class_to_graph(c, conversion_map, param_value_hints):
-  """Specialization of `object_to_graph` for classes."""
+def class_to_graph(c, conversion_map):
+  """Specialization of `entity_to_graph` for classes."""
   converted_members = {}
   members = tf_inspect.getmembers(c, predicate=tf_inspect.ismethod)
   if not members:
     raise ValueError('Cannot convert %s: it has no member methods.')
 
-  if 'self' in param_value_hints:
-    raise ValueError('Hints may not be provided for reserved name "self".')
-  param_value_hints['self'] = (c.__name__, c)
-
   class_globals = None
   for _, m in members:
-    node, _ = function_to_graph(m, conversion_map, param_value_hints, c)
+    node, _ = function_to_graph(
+        m,
+        conversion_map=conversion_map,
+        arg_values={},
+        arg_types={'self': (c.__name__, c)},
+        owner_type=c)
     # TODO(mdan): Do not assume all members have the same view of globals.
     if class_globals is None:
       class_globals = six.get_function_globals(m)
@@ -167,10 +169,12 @@ def class_to_graph(c, conversion_map, param_value_hints):
   return node, class_name
 
 
-def function_to_graph(f, conversion_map, param_value_hints, owner_type=None):
-  """Specialization of `object_to_graph` for callable functions."""
-  node = parser.parse_object(f).body[0]
-  node_globals = six.get_function_globals(f)
+def function_to_graph(f, conversion_map, arg_values, arg_types,
+                      owner_type=None):
+  """Specialization of `entity_to_graph` for callable functions."""
+  node, source = parser.parse_entity(f)
+  node = node.body[0]
+  namespace = six.get_function_globals(f)
 
   # This is needed for non-global functions.
   closure = six.get_function_closure(f)
@@ -178,53 +182,51 @@ def function_to_graph(f, conversion_map, param_value_hints, owner_type=None):
     for e in closure:
       if callable(e.cell_contents):
         fn = e.cell_contents
-        node_globals[fn.__name__] = fn
-
-  namer = conversion_map.new_namer(node_globals)
-  node = node_to_graph(node, tf_inspect.getsource(f), tf_inspect.getfile(f),
-                       namer, node_globals, param_value_hints,
-                       conversion_map.nocompile_decorators)
-
-  # Simulate a rename to ensure the top level is in the name map. This is needed
-  # for top level functions, and it also helps the consistency verification made
-  # by update_name_map.
-  if owner_type is not None:
-    new_name = namer.compiled_function_name(f.__name__, f, owner_type)
-  else:
-    new_name = namer.compiled_function_name(f.__name__, f)
+        namespace[fn.__name__] = fn
+
+  namer = conversion_map.new_namer(namespace)
+  ctx = context.EntityContext(
+      namer=namer,
+      source_code=source,
+      source_file='<fragment>',
+      namespace=namespace,
+      arg_values=arg_values,
+      arg_types=arg_types,
+      recursive=conversion_map.recursive)
+  node = node_to_graph(node, ctx, conversion_map.nocompile_decorators)
+
+  # TODO(mdan): This somewhat duplicates the call rename logic in call_treest.py
+  new_name, did_rename = namer.compiled_function_name(f.__name__, f, owner_type)
+  if not did_rename:
+    new_name = f.__name__
+    if node.name != f.__name__:
+      raise NotImplementedError('Strange corner case. Send us offending code!')
+
   node.name = new_name
   conversion_map.update_name_map(namer)
-  return node, conversion_map.name_map[f]
+  return node, new_name
 
 
-def _static_analysis_pass(node, source, f, namespace, value_hints):
-  node = access.resolve(node)
-  node = live_values.resolve(node, namespace, config.PYTHON_LITERALS)
-  node = type_info.resolve(node, source, f, value_hints)
+def _static_analysis_pass(node, ctx):
+  node = access.resolve(node, ctx)
+  node = live_values.resolve(node, ctx, config.PYTHON_LITERALS)
+  node = type_info.resolve(node, ctx)
   return node
 
 
-def node_to_graph(node, source, f, namer, namespace, value_hints,
-                  nocompile_decorators):
+def node_to_graph(node, ctx, nocompile_decorators):
   """Convert Python code to equivalent TF graph mode code.
 
   Args:
     node: A Python AST node representing the code to convert.
-    source: Optional string containing the source code of the node. Used in
-        error messages.
-    f: Optional string indicating the file where the node originated. None if
-        unknown. Used in error messages.
-    namer: A naming.Namer object.
-    namespace: Dict mapping symbol names to their corresponding live objects.
-    value_hints: A dict containing value hints for symbols like function
-        parameters.
+    ctx: An EntityContext object.
     nocompile_decorators: A tuple containing decorators to be stripped from
         functions during conversion.
 
   Returns:
     A tuple (node, deps):
         * node: A Python ast node, representing the converted code.
-        * deps: A set of strings, the fully qualified names of object
+        * deps: A set of strings, the fully qualified names of entity
             dependencies that this node has.
   """
   # TODO(mdan): Verify arguments for correctness.
@@ -241,30 +243,30 @@ def node_to_graph(node, source, f, namer, namespace, value_hints,
   # tree, which must be accounted. Although less efficient, it is most robust
   # to re-run the analysis.
 
-  node = _static_analysis_pass(node, source, f, namespace, value_hints)
+  node = _static_analysis_pass(node, ctx)
   node = decorators.transform(node, nocompile_decorators)
-  node = break_canonicalization.transform(node, namer)
+  node = break_canonicalization.transform(node, ctx.namer)
+  node = asserts.transform(node, ctx)
 
   # Note: sequencing continue canonicalization before for loop one avoids
   # dealing with the extra loop increment operation that the for
   # canonicalization creates.
-  node = continue_canonicalization.transform(node, namer)
-  namespace['len'] = len
+  node = continue_canonicalization.transform(node, ctx.namer)
+  ctx.namespace['len'] = len
 
-  node = _static_analysis_pass(node, None, None, namespace, value_hints)
-  node = for_canonicalization.transform(node, namer)
+  node = _static_analysis_pass(node, ctx)
+  node = for_canonicalization.transform(node, ctx.namer)
   # for_canonicalization may insert new global references.
   node = builtin_functions.transform(node)
   # builtin_functions may insert new global references.
-  namespace['print'] = print
+  ctx.namespace['print'] = print
 
-  node = _static_analysis_pass(node, None, None, namespace, value_hints)
+  node = _static_analysis_pass(node, ctx)
   node = print_functions.transform(node)
-  node = call_trees.transform(node, namer, namespace,
-                              config.DEFAULT_UNCOMPILED_MODULES,
+  node = call_trees.transform(node, ctx, config.DEFAULT_UNCOMPILED_MODULES,
                               nocompile_decorators)
-  node = control_flow.transform(node, namer)
+  node = control_flow.transform(node, ctx.namer)
   node = logical_expressions.transform(node)
-  node = side_effect_guards.transform(node, namer)
+  node = side_effect_guards.transform(node, ctx.namer)
 
   return node
diff --git a/tensorflow/contrib/py2tf/conversion_test.py b/tensorflow/contrib/py2tf/impl/conversion_test.py
similarity index 78%
rename from tensorflow/contrib/py2tf/conversion_test.py
rename to tensorflow/contrib/py2tf/impl/conversion_test.py
index e48bfe44640c3c06b5a952384ca8098310dac159..3888958f19b9fa13b759924c5188722e500e30a1 100644
--- a/tensorflow/contrib/py2tf/conversion_test.py
+++ b/tensorflow/contrib/py2tf/impl/conversion_test.py
@@ -20,26 +20,29 @@ from __future__ import print_function
 
 import gast
 
-from tensorflow.contrib.py2tf import conversion
+from tensorflow.contrib.py2tf.impl import conversion
 from tensorflow.python.platform import test
 
 
 class ConversionTest(test.TestCase):
 
-  def test_object_to_graph_unsupported_types(self):
+  def test_entity_to_graph_unsupported_types(self):
     with self.assertRaises(ValueError):
-      conversion.object_to_graph('dummy', None, {})
+      conversion_map = conversion.ConversionMap(True, (), ())
+      conversion.entity_to_graph('dummy', conversion_map, None, None)
+
+  def test_entity_to_graph_callable(self):
 
-  def test_object_to_graph_callable(self):
     def f(a):
       return a
 
     conversion_map = conversion.ConversionMap(True, (), ())
-    ast, new_name = conversion.object_to_graph(f, conversion_map, {})
+    ast, new_name = conversion.entity_to_graph(f, conversion_map, None, None)
     self.assertTrue(isinstance(ast, gast.FunctionDef), ast)
     self.assertEqual('tf__f', new_name)
 
-  def test_object_to_graph_call_tree(self):
+  def test_entity_to_graph_call_tree(self):
+
     def g(a):
       return a
 
@@ -47,7 +50,7 @@ class ConversionTest(test.TestCase):
       return g(a)
 
     conversion_map = conversion.ConversionMap(True, (), ())
-    conversion.object_to_graph(f, conversion_map, {})
+    conversion.entity_to_graph(f, conversion_map, None, None)
 
     self.assertTrue(f in conversion_map.dependency_cache)
     self.assertTrue(g in conversion_map.dependency_cache)
diff --git a/tensorflow/contrib/py2tf/naming.py b/tensorflow/contrib/py2tf/impl/naming.py
similarity index 66%
rename from tensorflow/contrib/py2tf/naming.py
rename to tensorflow/contrib/py2tf/impl/naming.py
index a90758962b83e1616f7d727440eb7481c49343ad..5c7e4c5f95ac5a23061e82bd74fb992bab55af11 100644
--- a/tensorflow/contrib/py2tf/naming.py
+++ b/tensorflow/contrib/py2tf/impl/naming.py
@@ -18,8 +18,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.util import tf_inspect
-
 
 class Namer(object):
   """Implementation of the namer interfaces required by various converters.
@@ -45,10 +43,15 @@ class Namer(object):
 
     self.generated_names = set()
 
-  def compiled_class_name(self, original_name, live_object=None):
+  def compiled_class_name(self, original_fqn, live_entity=None):
     """See call_trees.FunctionNamer.compiled_class_name."""
-    if live_object is not None and live_object in self.renamed_calls:
-      return self.renamed_calls[live_object]
+    if live_entity is not None and live_entity in self.renamed_calls:
+      return self.renamed_calls[live_entity]
+
+    if isinstance(original_fqn, tuple):
+      original_name = '__'.join(original_fqn)
+    else:
+      original_name = original_fqn
 
     new_name_root = 'Tf%s' % original_name
     new_name = new_name_root
@@ -57,41 +60,46 @@ class Namer(object):
       n += 1
       new_name = '%s_%d' % (new_name_root, n)
 
-    if live_object is not None:
-      self.renamed_calls[live_object] = new_name
+    if live_entity is not None:
+      self.renamed_calls[live_entity] = new_name
     self.generated_names.add(new_name)
+    if live_entity is not None:
+      self.renamed_calls[live_entity] = new_name
     return new_name
 
   def compiled_function_name(self,
-                             original_name,
-                             live_object=None,
+                             original_fqn,
+                             live_entity=None,
                              owner_type=None):
     """See call_trees.FunctionNamer.compiled_function_name."""
-    if live_object is not None and live_object in self.renamed_calls:
-      return self.renamed_calls[live_object]
 
     if not self.recursive:
-      new_name = original_name
-    elif owner_type is None or owner_type in self.partial_types:
-      # Top level functions: rename
-      new_name_root = 'tf__%s' % original_name
-      new_name = new_name_root
-      n = 0
-      while new_name in self.global_namespace:
-        n += 1
-        new_name = '%s_%d' % (new_name_root, n)
+      return None, False
+
+    if owner_type is not None and owner_type not in self.partial_types:
+      # Members are not renamed when part of an entire converted class.
+      return None, False
+
+    if isinstance(original_fqn, tuple):
+      original_name = '__'.join(original_fqn)
     else:
-      if tf_inspect.isclass(owner_type):
-        # Class members: do not rename (the entire class will be renamed)
-        new_name = original_name
-      else:
-        raise NotImplementedError('Member function "%s" of non-class type: %s' %
-                                  (original_name, owner_type))
-
-    if live_object is not None:
-      self.renamed_calls[live_object] = new_name
+      original_name = original_fqn
+
+    if live_entity is not None and live_entity in self.renamed_calls:
+      return self.renamed_calls[live_entity], True
+
+    new_name_root = 'tf__%s' % original_name
+    new_name = new_name_root
+    n = 0
+    while new_name in self.global_namespace:
+      n += 1
+      new_name = '%s_%d' % (new_name_root, n)
+
+    if live_entity is not None:
+      self.renamed_calls[live_entity] = new_name
     self.generated_names.add(new_name)
-    return new_name
+
+    return new_name, True
 
   def new_symbol(self, name_root, reserved_locals):
     """See control_flow.SymbolNamer.new_symbol."""
diff --git a/tensorflow/contrib/py2tf/naming_test.py b/tensorflow/contrib/py2tf/impl/naming_test.py
similarity index 82%
rename from tensorflow/contrib/py2tf/naming_test.py
rename to tensorflow/contrib/py2tf/impl/naming_test.py
index 7bfc9b8733b6efc3ab440ae5a0614258ae395ad4..beb4e54937bbb91b19157c9b9e3c528353206c62 100644
--- a/tensorflow/contrib/py2tf/naming_test.py
+++ b/tensorflow/contrib/py2tf/impl/naming_test.py
@@ -18,7 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.py2tf import naming
+from tensorflow.contrib.py2tf.impl import naming
 from tensorflow.python.platform import test
 
 
@@ -29,8 +29,9 @@ class NamerTest(test.TestCase):
       pass
 
     namer = naming.Namer({}, True, None, ())
-    self.assertEqual('tf__foo', namer.compiled_function_name('foo'))
-    self.assertEqual('tf__bar', namer.compiled_function_name('bar', bar))
+    self.assertEqual(('tf__foo', True), namer.compiled_function_name('foo'))
+    self.assertEqual(('tf__bar', True), namer.compiled_function_name(
+        'bar', bar))
     self.assertEqual({bar: 'tf__bar'}, namer.renamed_calls)
     self.assertItemsEqual(('tf__bar', 'tf__foo'), namer.generated_names)
 
@@ -39,15 +40,18 @@ class NamerTest(test.TestCase):
       pass
 
     namer = naming.Namer({}, True, None, ())
-    self.assertEqual('tf__foo', namer.compiled_function_name('foo', foo))
-    self.assertEqual('tf__foo', namer.compiled_function_name('foo', foo))
+    self.assertEqual(('tf__foo', True), namer.compiled_function_name(
+        'foo', foo))
+    self.assertEqual(('tf__foo', True), namer.compiled_function_name(
+        'foo', foo))
 
   def test_compiled_function_name_avoids_global_conflicts(self):
     def foo():
       pass
 
     namer = naming.Namer({'tf__foo': 1}, True, None, ())
-    self.assertEqual('tf__foo_1', namer.compiled_function_name('foo', foo))
+    self.assertEqual(('tf__foo_1', True),
+                     namer.compiled_function_name('foo', foo))
 
   def test_new_symbol_tracks_names(self):
     namer = naming.Namer({}, True, None, ())
diff --git a/tensorflow/contrib/py2tf/pyct/BUILD b/tensorflow/contrib/py2tf/pyct/BUILD
index 9dd564cb9fda7f9c8f99344ddb41321fbb527575..1b2408ba0ea37e596b70d0f824bc04167fc9f412 100644
--- a/tensorflow/contrib/py2tf/pyct/BUILD
+++ b/tensorflow/contrib/py2tf/pyct/BUILD
@@ -20,6 +20,7 @@ py_library(
         "__init__.py",
         "anno.py",
         "compiler.py",
+        "context.py",
         "parser.py",
         "pretty_printer.py",
         "templates.py",
@@ -30,6 +31,7 @@ py_library(
     deps = [
         "@astor_archive//:astor",
         "@gast_archive//:gast",
+        "@six_archive//:six",
         "@termcolor_archive//:termcolor",
     ],
 )
@@ -37,6 +39,7 @@ py_library(
 py_test(
     name = "anno_test",
     srcs = ["anno_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":pyct",
         "//tensorflow/python:client_testlib",
@@ -46,6 +49,7 @@ py_test(
 py_test(
     name = "compiler_test",
     srcs = ["compiler_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":pyct",
         "//tensorflow/python:client_testlib",
@@ -56,6 +60,7 @@ py_test(
 py_test(
     name = "parser_test",
     srcs = ["parser_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":pyct",
         "//tensorflow/python:client_testlib",
@@ -65,6 +70,7 @@ py_test(
 py_test(
     name = "pretty_printer_test",
     srcs = ["pretty_printer_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":pyct",
         "//tensorflow/python:client_testlib",
@@ -74,6 +80,7 @@ py_test(
 py_test(
     name = "templates_test",
     srcs = ["templates_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":pyct",
         "//tensorflow/python:client_testlib",
diff --git a/tensorflow/contrib/py2tf/pyct/compiler.py b/tensorflow/contrib/py2tf/pyct/compiler.py
index b09353cc72bd5f9d02a8973ebe880b92d39ac304..fc71469d1eaeb92352e3b50cb743621d7e5eb1d5 100644
--- a/tensorflow/contrib/py2tf/pyct/compiler.py
+++ b/tensorflow/contrib/py2tf/pyct/compiler.py
@@ -41,7 +41,7 @@ def ast_to_source(node, indentation):
   return astor.source_repr.pretty_source(generator.result).lstrip()
 
 
-def ast_to_object(node, indentation='  '):
+def ast_to_object(node, indentation='  ', source_prefix=None):
   """Return the Python objects represented by given AST.
 
   Compiling the AST code this way ensures that the source code is readable by
@@ -50,6 +50,7 @@ def ast_to_object(node, indentation='  '):
   Args:
     node: The code to compile, as an AST object.
     indentation: The string to use for indentation.
+    source_prefix: Optional string to print as-is into the source file.
 
   Returns:
     A module object containing the compiled source code.
@@ -58,5 +59,8 @@ def ast_to_object(node, indentation='  '):
 
   with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False) as f:
     module_name = os.path.basename(f.name[:-3])
+    if source_prefix:
+      f.write(source_prefix)
+      f.write('\n')
     f.write(source)
   return imp.load_source(module_name, f.name)
diff --git a/tensorflow/contrib/py2tf/pyct/context.py b/tensorflow/contrib/py2tf/pyct/context.py
new file mode 100644
index 0000000000000000000000000000000000000000..fef74ebefa290369c7310af6d7e4faeef44d9aee
--- /dev/null
+++ b/tensorflow/contrib/py2tf/pyct/context.py
@@ -0,0 +1,43 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Conversion context containers."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+
+class EntityContext(object):
+  """Contains information about an entity, like source code.
+
+  Attributes:
+    namer: Namer that matches the contract of all converters.
+    source_code: The entity's source code.
+    source_file: The entity's source file.
+    namespace: Dict[str->*], containing symbols visible to the entity
+        (excluding parameters).
+    arg_values: Dict[str->*], containing parameter values, if known.
+    arg_types: Dict[str->*], containing parameter types, if known.
+  """
+
+  def __init__(self, namer, source_code, source_file, namespace, arg_values,
+               arg_types, recursive):
+    self.namer = namer
+    self.source_code = source_code
+    self.source_file = source_file
+    self.namespace = namespace
+    self.arg_values = {} if arg_values is None else arg_values
+    self.arg_types = {} if arg_types is None else arg_types
+    self.recursive = recursive
diff --git a/tensorflow/contrib/py2tf/pyct/parser.py b/tensorflow/contrib/py2tf/pyct/parser.py
index 3daa69b9ceff714c94c61134f6fb81f9927ea258..dc7df883b349becd860bb0dbceab22cb39c750b5 100644
--- a/tensorflow/contrib/py2tf/pyct/parser.py
+++ b/tensorflow/contrib/py2tf/pyct/parser.py
@@ -28,11 +28,13 @@ import gast
 from tensorflow.python.util import tf_inspect
 
 
-def parse_object(obj):
-  """Return the AST of given object."""
-  return parse_str(tf_inspect.getsource(obj))
+def parse_entity(entity):
+  """Return the AST of given entity."""
+  source = tf_inspect.getsource(entity)
+  source = textwrap.dedent(source)
+  return parse_str(source), source
 
 
 def parse_str(src):
   """Return the AST of given piece of code."""
-  return gast.parse(textwrap.dedent(src))
+  return gast.parse(src)
diff --git a/tensorflow/contrib/py2tf/pyct/parser_test.py b/tensorflow/contrib/py2tf/pyct/parser_test.py
index 46f9aa82071efa98518810851b76761ff42751e5..f35dfa04c70dc191078248c32f9a04d28133129a 100644
--- a/tensorflow/contrib/py2tf/pyct/parser_test.py
+++ b/tensorflow/contrib/py2tf/pyct/parser_test.py
@@ -18,6 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import textwrap
+
 from tensorflow.contrib.py2tf.pyct import parser
 from tensorflow.python.platform import test
 
@@ -28,15 +30,16 @@ def f(x):
 
 class ParserTest(test.TestCase):
 
-  def test_parse_object(self):
-    mod = parser.parse_object(f)
+  def test_parse_entity(self):
+    mod, _ = parser.parse_entity(f)
     self.assertEqual('f', mod.body[0].name)
 
   def test_parse_str(self):
-    mod = parser.parse_str("""
+    mod = parser.parse_str(
+        textwrap.dedent("""
         def f(x):
           return x + 1
-    """)
+    """))
     self.assertEqual('f', mod.body[0].name)
 
 
diff --git a/tensorflow/contrib/py2tf/pyct/static_analysis/BUILD b/tensorflow/contrib/py2tf/pyct/static_analysis/BUILD
index abaf9536781efadea61b0da684020baeeed0597d..32e2954fffca3b9f512116648117904b85a60e25 100644
--- a/tensorflow/contrib/py2tf/pyct/static_analysis/BUILD
+++ b/tensorflow/contrib/py2tf/pyct/static_analysis/BUILD
@@ -32,6 +32,7 @@ py_library(
 py_test(
     name = "access_test",
     srcs = ["access_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":static_analysis",
         "//tensorflow/contrib/py2tf/pyct",
@@ -43,6 +44,7 @@ py_test(
 py_test(
     name = "live_values_test",
     srcs = ["live_values_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":static_analysis",
         "//tensorflow/contrib/py2tf/pyct",
@@ -53,6 +55,7 @@ py_test(
 py_test(
     name = "type_info_test",
     srcs = ["type_info_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":static_analysis",
         "//tensorflow/contrib/py2tf/pyct",
diff --git a/tensorflow/contrib/py2tf/pyct/static_analysis/access.py b/tensorflow/contrib/py2tf/pyct/static_analysis/access.py
index 8f3ac48b68c05256fbac4c4d8d86381755c8027c..33629f87d1d9fe32fac5dd05ba5feaecf4be7230 100644
--- a/tensorflow/contrib/py2tf/pyct/static_analysis/access.py
+++ b/tensorflow/contrib/py2tf/pyct/static_analysis/access.py
@@ -23,6 +23,7 @@ import copy
 import gast
 
 from tensorflow.contrib.py2tf.pyct import anno
+from tensorflow.contrib.py2tf.pyct import transformer
 
 # TODO(mdan): Add support for PY3 (e.g. Param vs arg).
 
@@ -53,6 +54,8 @@ class Scope(object):
     self.modified = set()
     self.created = set()
     self.used = set()
+    self.params = set()
+    self.returned = set()
 
   # TODO(mdan): Rename to `locals`
   @property
@@ -69,42 +72,73 @@ class Scope(object):
     self.modified = copy.copy(other.modified)
     self.created = copy.copy(other.created)
     self.used = copy.copy(other.used)
+    self.params = copy.copy(other.params)
+    self.returned = copy.copy(other.returned)
 
   def merge_from(self, other):
     self.modified |= other.modified
     self.created |= other.created
     self.used |= other.used
+    self.params |= other.params
+    self.returned |= other.returned
 
   def has(self, name):
-    if name in self.modified:
+    if name in self.modified or name in self.params:
       return True
     elif self.parent is not None:
       return self.parent.has(name)
     return False
 
+  def is_modified_since_entry(self, name):
+    if name in self.modified:
+      return True
+    elif self.parent is not None and not self.isolated:
+      return self.parent.is_modified_since_entry(name)
+    return False
+
+  def is_param(self, name):
+    if name in self.params:
+      return True
+    elif self.parent is not None and not self.isolated:
+      return self.parent.is_param(name)
+    return False
+
   def mark_read(self, name):
     self.used.add(name)
     if self.parent is not None and name not in self.created:
       self.parent.mark_read(name)
 
+  def mark_param(self, name):
+    self.params.add(name)
+
+  def mark_creation(self, name):
+    self.created.add(name)
+
   def mark_write(self, name):
     self.modified.add(name)
     if self.isolated:
-      self.created.add(name)
+      self.mark_creation(name)
     else:
       if self.parent is None:
-        self.created.add(name)
+        self.mark_creation(name)
       else:
         if not self.parent.has(name):
-          self.created.add(name)
+          self.mark_creation(name)
         self.parent.mark_write(name)
 
+  def mark_returned(self, name):
+    self.returned.add(name)
+    if not self.isolated and self.parent is not None:
+      self.parent.mark_returned(name)
+
 
-class AccessResolver(gast.NodeTransformer):
+class AccessResolver(transformer.Base):
   """Annotates nodes with local scope information. See Scope."""
 
-  def __init__(self):
+  def __init__(self, context):
+    super(AccessResolver, self).__init__(context)
     self.scope = Scope(None)
+    self._in_return_statement = False
 
   def visit_Name(self, node):
     # TODO(mdan): This is insufficient for object fields, e.g. hp.learning_rate.
@@ -120,10 +154,17 @@ class AccessResolver(gast.NodeTransformer):
       # TODO(mdan): This bay be incorrect with nested functions.
       # For nested functions, we'll have to add the notion of hiding args from
       # the parent scope, not writing to them.
-      self.scope.mark_write(node.id)
+      self.scope.mark_creation(node.id)
+      self.scope.mark_param(node.id)
     else:
       raise ValueError('Unknown context %s for node %s.' % (type(node.ctx),
                                                             node.id))
+    anno.setanno(node, 'is_modified_since_entry',
+                 self.scope.is_modified_since_entry(node.id))
+    anno.setanno(node, 'is_param', self.scope.is_param(node.id))
+
+    if self._in_return_statement:
+      self.scope.mark_returned(node.id)
     return node
 
   def visit_Print(self, node):
@@ -138,7 +179,7 @@ class AccessResolver(gast.NodeTransformer):
 
   def visit_Call(self, node):
     current_scope = self.scope
-    args_scope = Scope(current_scope)
+    args_scope = Scope(current_scope, isolated=False)
     self.scope = args_scope
     for n in node.args:
       self.visit(n)
@@ -200,6 +241,12 @@ class AccessResolver(gast.NodeTransformer):
         node, ((node.body, 'body'), (node.orelse, 'orelse')))
     return node
 
+  def visit_Return(self, node):
+    self._in_return_statement = True
+    node = self.generic_visit(node)
+    self._in_return_statement = False
+    return node
+
 
-def resolve(node):
-  return AccessResolver().visit(node)
+def resolve(node, context):
+  return AccessResolver(context).visit(node)
diff --git a/tensorflow/contrib/py2tf/pyct/static_analysis/access_test.py b/tensorflow/contrib/py2tf/pyct/static_analysis/access_test.py
index 0912ebb4c355c2ae2563e13e36926a4b8e3599a1..df0283b54d534443c0d4f044894903a4cb375f88 100644
--- a/tensorflow/contrib/py2tf/pyct/static_analysis/access_test.py
+++ b/tensorflow/contrib/py2tf/pyct/static_analysis/access_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import gast
 
 from tensorflow.contrib.py2tf.pyct import anno
+from tensorflow.contrib.py2tf.pyct import context
 from tensorflow.contrib.py2tf.pyct import parser
 from tensorflow.contrib.py2tf.pyct.static_analysis import access
 from tensorflow.python.platform import test
@@ -95,6 +96,19 @@ class ScopeTest(test.TestCase):
 
 class AccessResolverTest(test.TestCase):
 
+  def _parse_and_analyze(self, test_fn):
+    node, source = parser.parse_entity(test_fn)
+    ctx = context.EntityContext(
+        namer=None,
+        source_code=source,
+        source_file=None,
+        namespace={},
+        arg_values=None,
+        arg_types=None,
+        recursive=True)
+    node = access.resolve(node, ctx)
+    return node
+
   def test_local_markers(self):
 
     def test_fn(a):  # pylint:disable=unused-argument
@@ -103,9 +117,7 @@ class AccessResolverTest(test.TestCase):
         b -= 1
       return b
 
-    node = parser.parse_object(test_fn)
-    node = access.resolve(node)
-
+    node = self._parse_and_analyze(test_fn)
     self.assertFalse(anno.getanno(node.body[0].body[0].value,
                                   'is_local'))  # c in b = c
     self.assertTrue(anno.getanno(node.body[0].body[1].test.left,
@@ -126,9 +138,7 @@ class AccessResolverTest(test.TestCase):
       print(a, b)
       return c
 
-    node = parser.parse_object(test_fn)
-    node = access.resolve(node)
-
+    node = self._parse_and_analyze(test_fn)
     print_node = node.body[0].body[2]
     if isinstance(print_node, gast.Print):
       # Python 2
@@ -151,9 +161,7 @@ class AccessResolverTest(test.TestCase):
       foo(a, b)  # pylint:disable=undefined-variable
       return c
 
-    node = parser.parse_object(test_fn)
-    node = access.resolve(node)
-
+    node = self._parse_and_analyze(test_fn)
     call_node = node.body[0].body[2].value
     # We basically need to detect which variables are captured by the call
     # arguments.
@@ -169,15 +177,13 @@ class AccessResolverTest(test.TestCase):
         b -= 1
       return b, c
 
-    node = parser.parse_object(test_fn)
-    node = access.resolve(node)
-
+    node = self._parse_and_analyze(test_fn)
     while_node = node.body[0].body[1]
     self.assertScopeIs(
         anno.getanno(while_node, 'body_scope'), ('b',), ('b', 'c'), ('c',))
     self.assertScopeIs(
         anno.getanno(while_node, 'body_parent_scope'), ('a', 'b', 'c'),
-        ('a', 'b', 'c'), ('a', 'b', 'c'))
+        ('b', 'c'), ('a', 'b', 'c'))
 
   def test_for(self):
 
@@ -188,15 +194,13 @@ class AccessResolverTest(test.TestCase):
         b -= 1
       return b, c
 
-    node = parser.parse_object(test_fn)
-    node = access.resolve(node)
-
+    node = self._parse_and_analyze(test_fn)
     for_node = node.body[0].body[1]
     self.assertScopeIs(
         anno.getanno(for_node, 'body_scope'), ('b',), ('b', 'c'), ('c',))
     self.assertScopeIs(
         anno.getanno(for_node, 'body_parent_scope'), ('a', 'b', 'c'),
-        ('a', 'b', 'c', '_'), ('a', 'b', 'c', '_'))
+        ('b', 'c', '_'), ('a', 'b', 'c', '_'))
 
   def test_if(self):
 
@@ -211,9 +215,7 @@ class AccessResolverTest(test.TestCase):
         u = -y
       return z, u
 
-    node = parser.parse_object(test_fn)
-    node = access.resolve(node)
-
+    node = self._parse_and_analyze(test_fn)
     if_node = node.body[0].body[0]
     self.assertScopeIs(
         anno.getanno(if_node, 'body_scope'), ('x', 'y'), ('x', 'y', 'z'),
diff --git a/tensorflow/contrib/py2tf/pyct/static_analysis/live_values.py b/tensorflow/contrib/py2tf/pyct/static_analysis/live_values.py
index 242e544b5286c683ee4aa97bc586751932c73815..5a2903e6b5909e64e7bf2c9ea733a391da9292a1 100644
--- a/tensorflow/contrib/py2tf/pyct/static_analysis/live_values.py
+++ b/tensorflow/contrib/py2tf/pyct/static_analysis/live_values.py
@@ -26,26 +26,19 @@ from __future__ import print_function
 import gast
 
 from tensorflow.contrib.py2tf.pyct import anno
+from tensorflow.contrib.py2tf.pyct import transformer
 
 
-class LiveValueResolver(gast.NodeTransformer):
+class LiveValueResolver(transformer.Base):
   """Annotates nodes with live values."""
 
-  def __init__(self, namespace, literals):
-    """Create a new resolver.
-
-    Args:
-      namespace: A dict representing the namespace visible to the AST in the
-          intended execution context.
-      literals: A dict mapping literal lymbol names to their value. An example
-          literal is "None".
-    """
-    self.namespace = namespace
+  def __init__(self, context, literals):
+    super(LiveValueResolver, self).__init__(context)
     self.literals = literals
 
   def visit_ClassDef(self, node):
     self.generic_visit(node)
-    anno.setanno(node, 'live_val', self.namespace[node.name])
+    anno.setanno(node, 'live_val', self.context.namespace[node.name])
     return node
 
   def visit_Name(self, node):
@@ -53,20 +46,31 @@ class LiveValueResolver(gast.NodeTransformer):
     if isinstance(node.ctx, gast.Load):
       assert anno.hasanno(node, 'is_local'), node
       symbol_is_local = anno.getanno(node, 'is_local')
-      if not symbol_is_local:
+      assert anno.hasanno(node, 'is_modified_since_entry'), node
+      symbol_is_modified = anno.getanno(node, 'is_modified_since_entry')
+      assert anno.hasanno(node, 'is_param'), node
+      symbol_is_param = anno.getanno(node, 'is_param')
+
+      if not symbol_is_local and not symbol_is_param:
         if node.id in self.literals:
           anno.setanno(node, 'live_val', self.literals[node.id])
           # TODO(mdan): Could live values have FQNs? i.e. 'a'.join()
-        elif node.id in self.namespace:
-          obj = self.namespace[node.id]
+        elif node.id in self.context.namespace:
+          obj = self.context.namespace[node.id]
           anno.setanno(node, 'live_val', obj)
           anno.setanno(node, 'fqn', (obj.__name__,))
         else:
-          raise ValueError('Could not find global symbol %s.' % node.id)
+          raise ValueError('Could not resolve symbol "%s".' % node.id)
       else:
         pass
         # TODO(mdan): Attempt to trace its value through the local chain.
         # TODO(mdan): Use type annotations as fallback.
+
+      if not symbol_is_modified:
+        if node.id in self.context.arg_values:
+          obj = self.context.arg_values[node.id]
+          anno.setanno(node, 'live_val', obj)
+          anno.setanno(node, 'fqn', (obj.__class__.__name__,))
     return node
 
   def visit_Attribute(self, node):
@@ -79,15 +83,25 @@ class LiveValueResolver(gast.NodeTransformer):
                                                          node.attr))
       anno.setanno(node, 'live_val', getattr(parent_object, node.attr))
       anno.setanno(node, 'fqn', anno.getanno(node.value, 'fqn') + (node.attr,))
+    # TODO(mdan): Investigate the role built-in annotations can play here.
+    elif anno.hasanno(node.value, 'type'):
+      parent_type = anno.getanno(node.value, 'type')
+      if hasattr(parent_type, node.attr):
+        # This should hold for static members like methods.
+        # This would not hold for dynamic members like function attributes.
+        # For the dynamic case, we simply leave the node without an annotation,
+        # and let downstream consumers figure out what to do.
+        anno.setanno(node, 'live_val', getattr(parent_type, node.attr))
+        anno.setanno(node, 'fqn',
+                     anno.getanno(node.value, 'type_fqn') + (node.attr,))
     elif isinstance(node.value, gast.Name):
       stem_name = node.value
       # All nonlocal symbols should be fully resolved.
       assert anno.hasanno(stem_name, 'is_local'), stem_name
-      assert anno.getanno(stem_name, 'is_local'), stem_name
       # TODO(mdan): Figure out what to do when calling attribute on local object
       # Maybe just leave as-is?
     return node
 
 
-def resolve(node, namespace, literals):
-  return LiveValueResolver(namespace, literals).visit(node)
+def resolve(node, context, literals):
+  return LiveValueResolver(context, literals).visit(node)
diff --git a/tensorflow/contrib/py2tf/pyct/static_analysis/live_values_test.py b/tensorflow/contrib/py2tf/pyct/static_analysis/live_values_test.py
index e77497654a0b3096422deef9a3f008eeb6c6be05..f3057b346677ac7fc8d36c1c14663c614f7e4ea9 100644
--- a/tensorflow/contrib/py2tf/pyct/static_analysis/live_values_test.py
+++ b/tensorflow/contrib/py2tf/pyct/static_analysis/live_values_test.py
@@ -19,24 +19,45 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib.py2tf.pyct import anno
+from tensorflow.contrib.py2tf.pyct import context
 from tensorflow.contrib.py2tf.pyct import parser
 from tensorflow.contrib.py2tf.pyct.static_analysis import access
 from tensorflow.contrib.py2tf.pyct.static_analysis import live_values
+from tensorflow.contrib.py2tf.pyct.static_analysis import type_info
 from tensorflow.python.framework import constant_op
 from tensorflow.python.platform import test
 
 
 class LiveValuesResolverTest(test.TestCase):
 
+  def _parse_and_analyze(self,
+                         test_fn,
+                         namespace,
+                         literals=None,
+                         arg_types=None):
+    literals = literals or {}
+    arg_types = arg_types or {}
+    node, source = parser.parse_entity(test_fn)
+    ctx = context.EntityContext(
+        namer=None,
+        source_code=source,
+        source_file=None,
+        namespace=namespace,
+        arg_values=None,
+        arg_types=arg_types,
+        recursive=True)
+    node = access.resolve(node, ctx)
+    node = live_values.resolve(node, ctx, literals)
+    node = type_info.resolve(node, ctx)
+    node = live_values.resolve(node, ctx, literals)
+    return node
+
   def test_literals(self):
 
     def test_fn():
       return Foo  # pylint: disable=undefined-variable
 
-    node = parser.parse_object(test_fn)
-    node = access.resolve(node)
-    node = live_values.resolve(node, {}, {'Foo': 'bar'})
-
+    node = self._parse_and_analyze(test_fn, {}, {'Foo': 'bar'})
     retval_node = node.body[0].body[0].value
     self.assertEquals('bar', anno.getanno(retval_node, 'live_val'))
 
@@ -48,10 +69,7 @@ class LiveValuesResolverTest(test.TestCase):
     def test_fn():
       return foo()
 
-    node = parser.parse_object(test_fn)
-    node = access.resolve(node)
-    node = live_values.resolve(node, {'foo': foo}, {})
-
+    node = self._parse_and_analyze(test_fn, {'foo': foo})
     func_node = node.body[0].body[0].value.func
     self.assertEquals(foo, anno.getanno(func_node, 'live_val'))
     self.assertEquals(('foo',), anno.getanno(func_node, 'fqn'))
@@ -61,15 +79,29 @@ class LiveValuesResolverTest(test.TestCase):
     def test_fn():
       return constant_op.constant(0)
 
-    node = parser.parse_object(test_fn)
-    node = access.resolve(node)
-    node = live_values.resolve(node, {'constant_op': constant_op}, {})
-
+    node = self._parse_and_analyze(test_fn, {'constant_op': constant_op})
     func_node = node.body[0].body[0].value.func
     self.assertEquals(constant_op.constant, anno.getanno(func_node, 'live_val'))
     self.assertEquals((constant_op.__name__, 'constant'),
                       anno.getanno(func_node, 'fqn'))
 
+  def test_attributes_with_type_hints(self):
+
+    class TestClass(object):
+
+      def member(self):
+        pass
+
+      def test_fn(self):
+        return self.member()
+
+    node = self._parse_and_analyze(
+        TestClass.test_fn, {'constant_op': constant_op},
+        arg_types={'self': (TestClass.__name__, TestClass)})
+    func_node = node.body[0].body[0].value.func
+    self.assertEquals(TestClass.member, anno.getanno(func_node, 'live_val'))
+    self.assertEquals(('TestClass', 'member'), anno.getanno(func_node, 'fqn'))
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/py2tf/pyct/static_analysis/type_info.py b/tensorflow/contrib/py2tf/pyct/static_analysis/type_info.py
index b17af5d8446026c62c0dd7d4efc7ee6e61ceecb2..cf74142cbe9b1396585ac3e3c6d9858013f05ac2 100644
--- a/tensorflow/contrib/py2tf/pyct/static_analysis/type_info.py
+++ b/tensorflow/contrib/py2tf/pyct/static_analysis/type_info.py
@@ -36,8 +36,6 @@ class Scope(object):
         most recently assigned to the symbol.
   """
 
-  # TODO(mdan): Should rather use a CFG here?
-
   def __init__(self, parent):
     """Create a new scope.
 
@@ -78,10 +76,9 @@ class TypeInfoResolver(transformer.Base):
     * Attribute (helps resolve object methods)
   """
 
-  def __init__(self, value_hints, source, f):
-    super(TypeInfoResolver, self).__init__(source, f)
+  def __init__(self, context):
+    super(TypeInfoResolver, self).__init__(context)
     self.scope = Scope(None)
-    self.value_hints = value_hints
     self.function_level = 0
 
   def visit_FunctionDef(self, node):
@@ -118,20 +115,32 @@ class TypeInfoResolver(transformer.Base):
     node.orelse = self._visit_block(node.orelse)
     return node
 
+  def _process_function_arg(self, arg_name):
+    if self.function_level == 1 and arg_name in self.context.arg_types:
+      # Forge a node to hold the type information, so that method calls on
+      # it can resolve the type.
+      type_holder = gast.Name(arg_name, gast.Load(), None)
+      type_string, type_obj = self.context.arg_types[arg_name]
+      anno.setanno(type_holder, 'type', type_obj)
+      anno.setanno(type_holder, 'type_fqn', tuple(type_string.split('.')))
+      self.scope.setval(arg_name, type_holder)
+
+  def visit_arg(self, node):
+    self._process_function_arg(node.arg)
+    return node
+
   def visit_Name(self, node):
     self.generic_visit(node)
     if isinstance(node.ctx, gast.Param):
-      self.scope.setval(node.id, gast.Name(node.id, gast.Load(), None))
-      # TODO(mdan): Member functions should not need type hints.
-      # We could attemp to extract im_class from the live_val annotation.
-      if self.function_level == 1 and node.id in self.value_hints:
-        # Forge a node to hold the type information, so that method calls on
-        # it can resolve the type.
-        type_holder = gast.Name(node.id, gast.Load(), None)
-        type_string, type_obj = self.value_hints[node.id]
-        anno.setanno(type_holder, 'type', type_obj)
-        anno.setanno(type_holder, 'type_fqn', tuple(type_string.split('.')))
-        self.scope.setval(node.id, type_holder)
+      self._process_function_arg(node.id)
+    elif isinstance(node.ctx, gast.Load) and self.scope.hasval(node.id):
+      # E.g. if we had
+      # a = b
+      # then for future references to `a` we should have traced_source = `b`
+      traced_source = self.scope.getval(node.id)
+      if anno.hasanno(traced_source, 'type'):
+        anno.setanno(node, 'type', anno.getanno(traced_source, 'type'))
+        anno.setanno(node, 'type_fqn', anno.getanno(traced_source, 'type_fqn'))
     return node
 
   def _process_variable_assignment(self, source, targets):
@@ -175,39 +184,6 @@ class TypeInfoResolver(transformer.Base):
     self._process_variable_assignment(node.value, node.targets)
     return node
 
-  def visit_Call(self, node):
-    target = node.func
-    if not anno.hasanno(target, 'live_val'):
-      if not isinstance(target, gast.Attribute):
-        # Suspecting this pattern would reach here:
-        #   foo = bar
-        #   foo()
-        raise ValueError('Dont know how to handle dynamic functions.')
-      if not isinstance(target.value, gast.Name):
-        # Possible example of this kind:
-        #   foo = module.Foo()
-        #   foo.bar.baz()
-        # TODO(mdan): This should be doable by using the FQN.
-        raise ValueError('Dont know how to handle object properties yet.')
-      # In the example below, object_source is 'tr.train.Optimizer()':
-      #   opt = tf.train.Optimizer()
-      #   opt.foo()
-      if self.scope.hasval(target.value.id):
-        object_source = self.scope.getval(target.value.id)
-        if not anno.hasanno(object_source, 'type'):
-          raise ValueError('Could not determine type of "%s". Is it dynamic?' %
-                           (target.value.id))
-        anno.setanno(target, 'type', anno.getanno(object_source, 'type'))
-        anno.setanno(target, 'type_fqn', anno.getanno(object_source,
-                                                      'type_fqn'))
-      else:
-        # TODO(mdan): Figure out what could the user do to get past this.
-        raise ValueError('No info on "%s". Is it dynamically built?' %
-                         (target.value.id))
-    self.generic_visit(node)
-    return node
-
 
-def resolve(node, source, f, value_hints):
-  assert value_hints is not None
-  return TypeInfoResolver(value_hints, source, f).visit(node)
+def resolve(node, context):
+  return TypeInfoResolver(context).visit(node)
diff --git a/tensorflow/contrib/py2tf/pyct/static_analysis/type_info_test.py b/tensorflow/contrib/py2tf/pyct/static_analysis/type_info_test.py
index 98dc7bf50f9b8375ca0390c91bd6e90c245857ee..68fa1ee92a72b2c3d015b84724e63a17069c196c 100644
--- a/tensorflow/contrib/py2tf/pyct/static_analysis/type_info_test.py
+++ b/tensorflow/contrib/py2tf/pyct/static_analysis/type_info_test.py
@@ -19,8 +19,8 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib.py2tf.pyct import anno
+from tensorflow.contrib.py2tf.pyct import context
 from tensorflow.contrib.py2tf.pyct import parser
-from tensorflow.contrib.py2tf.pyct import transformer
 from tensorflow.contrib.py2tf.pyct.static_analysis import access
 from tensorflow.contrib.py2tf.pyct.static_analysis import live_values
 from tensorflow.contrib.py2tf.pyct.static_analysis import type_info
@@ -55,37 +55,45 @@ class ScopeTest(test.TestCase):
 
 class TypeInfoResolverTest(test.TestCase):
 
+  def _parse_and_analyze(self, test_fn, namespace, arg_types=None):
+    node, source = parser.parse_entity(test_fn)
+    ctx = context.EntityContext(
+        namer=None,
+        source_code=source,
+        source_file=None,
+        namespace=namespace,
+        arg_values=None,
+        arg_types=arg_types,
+        recursive=True)
+    node = access.resolve(node, ctx)
+    node = live_values.resolve(node, ctx, {})
+    node = type_info.resolve(node, ctx)
+    node = live_values.resolve(node, ctx, {})
+    return node
+
   def test_constructor_detection(self):
 
     def test_fn():
       opt = training.GradientDescentOptimizer(0.1)
       return opt
 
-    node = parser.parse_object(test_fn)
-    node = access.resolve(node)
-    node = live_values.resolve(node, {'training': training}, {})
-    node = type_info.resolve(node, None, None, {})
-
+    node = self._parse_and_analyze(test_fn, {'training': training})
     call_node = node.body[0].body[0].value
     self.assertEquals(training.GradientDescentOptimizer,
                       anno.getanno(call_node, 'type'))
     self.assertEquals((training.__name__, 'GradientDescentOptimizer'),
                       anno.getanno(call_node, 'type_fqn'))
 
-  def test_class_members(self):
+  def test_class_members_of_detected_constructor(self):
 
     def test_fn():
       opt = training.GradientDescentOptimizer(0.1)
       opt.minimize(0)
 
-    node = parser.parse_object(test_fn)
-    node = access.resolve(node)
-    node = live_values.resolve(node, {'training': training}, {})
-    node = type_info.resolve(node, None, None, {})
-
-    attr_call_node = node.body[0].body[1].value.func
-    self.assertEquals((training.__name__, 'GradientDescentOptimizer'),
-                      anno.getanno(attr_call_node, 'type_fqn'))
+    node = self._parse_and_analyze(test_fn, {'training': training})
+    method_call = node.body[0].body[1].value.func
+    self.assertEquals(training.GradientDescentOptimizer.minimize,
+                      anno.getanno(method_call, 'live_val'))
 
   def test_class_members_in_with_stmt(self):
 
@@ -93,21 +101,17 @@ class TypeInfoResolverTest(test.TestCase):
       with session.Session() as sess:
         sess.run(x)
 
-    node = parser.parse_object(test_fn)
-    node = access.resolve(node)
-    node = live_values.resolve(node, {'session': session}, {})
-    node = type_info.resolve(node, None, None, {})
-
+    node = self._parse_and_analyze(test_fn, {'session': session})
     constructor_call = node.body[0].body[0].items[0].context_expr
     self.assertEquals(session.Session, anno.getanno(constructor_call, 'type'))
     self.assertEquals((session.__name__, 'Session'),
                       anno.getanno(constructor_call, 'type_fqn'))
 
-    member_call = node.body[0].body[0].body[0].value.func
-    self.assertEquals((session.__name__, 'Session'),
-                      anno.getanno(member_call, 'type_fqn'))
+    method_call = node.body[0].body[0].body[0].value.func
+    self.assertEquals(session.Session.run, anno.getanno(method_call,
+                                                        'live_val'))
 
-  def test_constructor_deta_dependent(self):
+  def test_constructor_data_dependent(self):
 
     def test_fn(x):
       if x > 0:
@@ -116,41 +120,34 @@ class TypeInfoResolverTest(test.TestCase):
         opt = training.GradientDescentOptimizer(0.01)
       opt.minimize(0)
 
-    node = parser.parse_object(test_fn)
-    node = access.resolve(node)
-    node = live_values.resolve(node, {'training': training}, {})
-    with self.assertRaises(transformer.PyFlowParseError):
-      node = type_info.resolve(node, None, None, {})
+    node = self._parse_and_analyze(test_fn, {'training': training})
+    method_call = node.body[0].body[1].value.func
+    self.assertFalse(anno.hasanno(method_call, 'live_val'))
 
   def test_parameter_class_members(self):
 
     def test_fn(opt):
       opt.minimize(0)
 
-    node = parser.parse_object(test_fn)
-    node = access.resolve(node)
-    node = live_values.resolve(node, {'training': training}, {})
-    with self.assertRaises(transformer.PyFlowParseError):
-      node = type_info.resolve(node, None, None, {})
+    node = self._parse_and_analyze(test_fn, {})
+    method_call = node.body[0].body[0].value.func
+    self.assertFalse(anno.hasanno(method_call, 'live_val'))
 
   def test_parameter_class_members_with_value_hints(self):
 
     def test_fn(opt):
       opt.minimize(0)
 
-    node = parser.parse_object(test_fn)
-    node = access.resolve(node)
-    node = live_values.resolve(node, {'training': training}, {})
-    node = type_info.resolve(
-        node, None, None, {
-            'opt': (('%s.GradientDescentOptimizer' % training.__name__),
-                    training.GradientDescentOptimizer(0.1))
+    node = self._parse_and_analyze(
+        test_fn, {'training': training},
+        arg_types={
+            'opt': (training.GradientDescentOptimizer.__name__,
+                    training.GradientDescentOptimizer)
         })
 
-    attr_call_node = node.body[0].body[0].value.func
-    self.assertEquals(
-        tuple(training.__name__.split('.')) + ('GradientDescentOptimizer',),
-        anno.getanno(attr_call_node, 'type_fqn'))
+    method_call = node.body[0].body[0].value.func
+    self.assertEquals(training.GradientDescentOptimizer.minimize,
+                      anno.getanno(method_call, 'live_val'))
 
   def test_function_variables(self):
 
@@ -161,11 +158,9 @@ class TypeInfoResolverTest(test.TestCase):
       foo = bar
       foo()
 
-    node = parser.parse_object(test_fn)
-    node = access.resolve(node)
-    node = live_values.resolve(node, {'bar': bar}, {})
-    with self.assertRaises(transformer.PyFlowParseError):
-      node = type_info.resolve(node, None, None, {})
+    node = self._parse_and_analyze(test_fn, {'bar': bar})
+    method_call = node.body[0].body[1].value.func
+    self.assertFalse(anno.hasanno(method_call, 'live_val'))
 
   def test_nested_members(self):
 
@@ -173,11 +168,9 @@ class TypeInfoResolverTest(test.TestCase):
       foo = training.GradientDescentOptimizer(0.1)
       foo.bar.baz()
 
-    node = parser.parse_object(test_fn)
-    node = access.resolve(node)
-    node = live_values.resolve(node, {'training': training}, {})
-    with self.assertRaises(transformer.PyFlowParseError):
-      node = type_info.resolve(node, None, None, {})
+    node = self._parse_and_analyze(test_fn, {'training': training})
+    method_call = node.body[0].body[1].value.func
+    self.assertFalse(anno.hasanno(method_call, 'live_val'))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/contrib/py2tf/pyct/templates.py b/tensorflow/contrib/py2tf/pyct/templates.py
index 4fadc793e6d1dfa8ddabea1d607de68ac6ad9c85..6be526f20ddfd5c2e847287bfbf11e640e2026e6 100644
--- a/tensorflow/contrib/py2tf/pyct/templates.py
+++ b/tensorflow/contrib/py2tf/pyct/templates.py
@@ -23,6 +23,7 @@ from __future__ import print_function
 
 import ast
 import copy
+import textwrap
 
 import gast
 
@@ -80,37 +81,46 @@ class ReplaceTransformer(gast.NodeTransformer):
       return node
 
 
+def _strings_to_names(n):
+  if isinstance(n, str):
+    # Note: the node will receive the ctx value from the template, see
+    # ReplaceTransformer.visit_Name.
+    return gast.Name(id=n, ctx=None, annotation=None)
+  if isinstance(n, list):
+    return [_strings_to_names(e) for e in n]
+  if isinstance(n, tuple):
+    return tuple(_strings_to_names(e) for e in n)
+  return n
+
+
 def replace(template, **replacements):
   """Replace placeholders in a Python template.
 
+  AST Name and Tuple nodes always receive the context that inferred from
+  the template. However, when replacing more complex nodes (that can potentially
+  contain Name children), then the caller is responsible for setting the
+  appropriate context.
+
   Args:
-    template: A function to be used as a template. Any placeholder is expected
-        to also be a function argument.
+    template: A string representing Python code. Any symbol name can be used
+        that appears in the template code can be used as placeholder.
     **replacements: A mapping from placeholder names to (lists of) AST nodes
-        that these placeholders will be replaced by.
+        that these placeholders will be replaced by. String values are also
+        supported as a shorthand for AST Name nodes with the respective ID.
 
   Returns:
-    body: An AST node or list of AST nodes with the replacements made. If the
-        template was a function, a list will be returned. If the template was a
-        node, the same node will be returned. If the template was a string, an
-        AST node will be returned (a `Module` node in the case of a multi-line
-        string, an `Expr` node otherwise).
+    An AST node or list of AST nodes with the replacements made. If the
+    template was a function, a list will be returned. If the template was a
+    node, the same node will be returned. If the template was a string, an
+    AST node will be returned (a `Module` node in the case of a multi-line
+    string, an `Expr` node otherwise).
 
   Raises:
-    ValueError: If a function is used as a template and an incorrect set of
-        replacements was passed.
+    ValueError: if the arguments are incorrect.
   """
-  tree = parser.parse_object(template).body[0]
-  placeholders = set(arg.id for arg in tree.args.args)
-  tree.args.args = []
-  if tree.args.vararg:
-    placeholders.add(tree.args.vararg)
-    tree.args.vararg = None
-  if set(replacements.keys()) != placeholders:
-    raise ValueError(
-        'too many or few replacements. replacements: %s; placeholders: %s' %
-        (replacements.keys(), placeholders))
-
-  # Perform the replacement, stripping the function into which the template was
-  # wrapped.
+  if not isinstance(template, str):
+    raise ValueError('Expected string template, got %s' % type(template))
+  tree = parser.parse_str(textwrap.dedent(template))
+  for k in replacements:
+    replacements[k] = _strings_to_names(replacements[k])
   return ReplaceTransformer(replacements).visit(tree).body
diff --git a/tensorflow/contrib/py2tf/pyct/templates_test.py b/tensorflow/contrib/py2tf/pyct/templates_test.py
index 2ad8b9317b67c7ae18a16efac745138e14101e6a..1143131283cd92c42abfc73d5728fac96cc31c23 100644
--- a/tensorflow/contrib/py2tf/pyct/templates_test.py
+++ b/tensorflow/contrib/py2tf/pyct/templates_test.py
@@ -28,46 +28,42 @@ from tensorflow.python.platform import test
 class TemplatesTest(test.TestCase):
 
   def test_replace_variable(self):
-    def template(a):  # pylint:disable=unused-argument
-      def test_fn(a):  # pylint:disable=unused-variable
+    template = """
+      def test_fn(a):
         a += 1
         a = 2 * a + 1
-        return b  # pylint:disable=undefined-variable
+        return b
+    """
 
-    node = templates.replace(
-        template, a=gast.Name('b', gast.Load(), None))[0]
+    node = templates.replace(template, a='b')[0]
     result = compiler.ast_to_object(node)
     self.assertEquals(7, result.test_fn(2))
 
   def test_replace_function_name(self):
-    def template(fname):  # pylint:disable=unused-argument
-      def fname(a):  # pylint:disable=function-redefined
+    template = """
+      def fname(a):
         a += 1
         a = 2 * a + 1
         return a
+    """
 
-    node = templates.replace(
-        template, fname=gast.Name('test_fn', gast.Load(), None))[0]
+    node = templates.replace(template, fname='test_fn')[0]
     result = compiler.ast_to_object(node)
     self.assertEquals(7, result.test_fn(2))
 
   def test_code_block(self):
-    def template(block):  # pylint:disable=unused-argument
-      def test_fn(a):  # pylint:disable=unused-variable
-        block  # pylint:disable=pointless-statement
+    template = """
+      def test_fn(a):
+        block
         return a
+    """
 
     node = templates.replace(
         template,
         block=[
-            gast.Assign(
-                [
-                    gast.Name('a', gast.Store(), None)
-                ],
-                gast.BinOp(
-                    gast.Name('a', gast.Load(), None),
-                    gast.Add(),
-                    gast.Num(1))),
+            gast.Assign([
+                gast.Name('a', None, None)
+            ], gast.BinOp(gast.Name('a', None, None), gast.Add(), gast.Num(1))),
         ] * 2)[0]
     result = compiler.ast_to_object(node)
     self.assertEquals(3, result.test_fn(1))
diff --git a/tensorflow/contrib/py2tf/pyct/transformer.py b/tensorflow/contrib/py2tf/pyct/transformer.py
index 1658a1b69456cfda3828e51c52a4063a5bf77037..8a836b7c1bf1dfdcfda84588be8fd904d970579e 100644
--- a/tensorflow/contrib/py2tf/pyct/transformer.py
+++ b/tensorflow/contrib/py2tf/pyct/transformer.py
@@ -18,7 +18,10 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import sys
+
 import gast
+import six
 
 from tensorflow.contrib.py2tf.pyct import pretty_printer
 
@@ -30,23 +33,33 @@ class PyFlowParseError(SyntaxError):
 class Base(gast.NodeTransformer):
   """Base class for specialized transformers."""
 
-  def __init__(self, source, f):
+  def __init__(self, context):
+    """Initialize the transformer. Subclasses should call this.
+
+    Args:
+      context: An EntityContext.
+    """
     self._lineno = 0
     self._col_offset = 0
-    self._source = source
-    self._file = f
+    self.context = context
 
   def visit(self, node):
     try:
-      if self._source and hasattr(node, 'lineno'):
+      source_code = self.context.source_code
+      source_file = self.context.source_file
+      if source_code and hasattr(node, 'lineno'):
         self._lineno = node.lineno
         self._col_offset = node.col_offset
       return super(Base, self).visit(node)
-    except ValueError as e:
-      msg = '%s\nOccurred at node:\n%s' % (str(e), pretty_printer.fmt(node))
-      if self._source:
-        line = self._source.splitlines()[self._lineno - 1]
+    except (ValueError, AttributeError, NotImplementedError) as e:
+      msg = '%s: %s\nOccurred at node:\n%s' % (e.__class__.__name__, str(e),
+                                               pretty_printer.fmt(node))
+      if source_code:
+        line = source_code.splitlines()[self._lineno - 1]
       else:
         line = '<no source available>'
-      raise PyFlowParseError(
-          msg, (self._file, self._lineno, self._col_offset + 1, line))
+      six.reraise(PyFlowParseError,
+                  PyFlowParseError(
+                      msg,
+                      (source_file, self._lineno, self._col_offset + 1, line)),
+                  sys.exc_info()[2])
diff --git a/tensorflow/contrib/py2tf/utils/BUILD b/tensorflow/contrib/py2tf/utils/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..01804aa8834f23851dbc7af3ae9082645639ffbc
--- /dev/null
+++ b/tensorflow/contrib/py2tf/utils/BUILD
@@ -0,0 +1,37 @@
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "py_test")
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
+
+py_library(
+    name = "utils",
+    srcs = [
+        "__init__.py",
+        "context_managers.py",
+    ],
+    srcs_version = "PY2AND3",
+    visibility = ["//tensorflow:__subpackages__"],
+    deps = [
+    ],
+)
+
+py_test(
+    name = "context_managers_test",
+    srcs = ["context_managers_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":utils",
+        "//tensorflow/python:client_testlib",
+    ],
+)
diff --git a/tensorflow/contrib/py2tf/utils/__init__.py b/tensorflow/contrib/py2tf/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..bca33e89e99cd5939683ad10a2eb17db243af2ef
--- /dev/null
+++ b/tensorflow/contrib/py2tf/utils/__init__.py
@@ -0,0 +1,21 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utility module that contains APIs usable in the generated code."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.py2tf.utils.context_managers import control_dependency_on_returns
diff --git a/tensorflow/contrib/py2tf/utils/context_managers.py b/tensorflow/contrib/py2tf/utils/context_managers.py
new file mode 100644
index 0000000000000000000000000000000000000000..47d98399971da039a8987ea17039c8b44bfa3b61
--- /dev/null
+++ b/tensorflow/contrib/py2tf/utils/context_managers.py
@@ -0,0 +1,41 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Various context managers."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import contextlib
+
+
+def control_dependency_on_returns(tf, return_value):
+  """Create a TF control dependency on the return values of a function.
+
+  If the function had no return value, a no-op context is returned.
+
+  Args:
+    tf: The TensorFlow module.
+    return_value: The return value to set as control dependency.
+
+  Returns:
+    A context manager.
+  """
+  if return_value is None:
+    return contextlib.contextmanager(lambda: (yield))()
+  # TODO(mdan): Filter to tensor objects.
+  if not isinstance(return_value, (list, tuple)):
+    return_value = (return_value,)
+  return tf.control_dependencies(return_value)
diff --git a/tensorflow/contrib/py2tf/utils/context_managers_test.py b/tensorflow/contrib/py2tf/utils/context_managers_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..c903f082528118aba2d0163b25a38178b99a17e7
--- /dev/null
+++ b/tensorflow/contrib/py2tf/utils/context_managers_test.py
@@ -0,0 +1,43 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for context_managers module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.py2tf.utils import context_managers
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.platform import test
+
+
+class ContextManagersTest(test.TestCase):
+
+  def test_control_dependency_on_returns(self):
+    # Just dry run them.
+    with context_managers.control_dependency_on_returns(ops, None):
+      pass
+    with context_managers.control_dependency_on_returns(
+        ops, constant_op.constant(1)):
+      pass
+    with context_managers.control_dependency_on_returns(
+        ops, [constant_op.constant(1),
+              constant_op.constant(2)]):
+      pass
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/quantize/BUILD b/tensorflow/contrib/quantize/BUILD
index 3c5b34a0a6adb2f4e340a8e378c1eb51a2e2b534..b7d525a1fa203fd150642c18304759e1a9c48c4b 100644
--- a/tensorflow/contrib/quantize/BUILD
+++ b/tensorflow/contrib/quantize/BUILD
@@ -77,9 +77,13 @@ py_library(
         "//tensorflow/contrib/graph_editor:graph_editor_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:framework_ops",
+        "//tensorflow/python:layers",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:nn",
         "//tensorflow/python:nn_ops",
+        "//tensorflow/python:ops",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variables",
     ],
 )
 
diff --git a/tensorflow/contrib/quantize/python/fold_batch_norms.py b/tensorflow/contrib/quantize/python/fold_batch_norms.py
index aa605e6caadf4d1e69a4a331b1e580797e4fdef8..8ec5334a39397330dda6b576302ffeb0009ce389 100644
--- a/tensorflow/contrib/quantize/python/fold_batch_norms.py
+++ b/tensorflow/contrib/quantize/python/fold_batch_norms.py
@@ -17,7 +17,6 @@
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
-
 import re
 from tensorflow.contrib import graph_editor
 from tensorflow.contrib.quantize.python import common
@@ -26,14 +25,16 @@ from tensorflow.contrib.quantize.python import input_to_ops
 from tensorflow.core.framework import attr_value_pb2
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.layers import utils
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import nn_ops
+from tensorflow.python.training import training_util
 from tensorflow.python.util import compat
 
 
-def FoldBatchNorms(graph):
+def FoldBatchNorms(graph, freeze_batch_norm_delay=None, is_training=True):
   """Finds batch norm layers and folds them into preceding layers.
 
   Folding only affects the following layers: Conv2D, fully connected, depthwise
@@ -41,15 +42,25 @@ def FoldBatchNorms(graph):
 
   Args:
     graph: Graph to walk and modify.
+    freeze_batch_norm_delay: How many steps to wait before freezing
+    moving mean and variance and using them for batch normalization. This value
+    is used only when is_training is True.
+    is_training: Bool, true if training
 
   Raises:
     ValueError: When batch norm folding fails.
   """
-  _FoldFusedBatchNorms(graph)
-  _FoldUnfusedBatchNorms(graph)
+  _FoldFusedBatchNorms(
+      graph,
+      freeze_batch_norm_delay=freeze_batch_norm_delay,
+      is_training=is_training)
+  _FoldUnfusedBatchNorms(
+      graph,
+      freeze_batch_norm_delay=freeze_batch_norm_delay,
+      is_training=is_training)
 
 
-def _FoldFusedBatchNorms(graph):
+def _FoldFusedBatchNorms(graph, freeze_batch_norm_delay, is_training):
   """Finds fused batch norm layers and folds them into preceding layers.
 
   Folding only affects the following layers: Conv2D, fully connected, depthwise
@@ -57,6 +68,9 @@ def _FoldFusedBatchNorms(graph):
 
   Args:
     graph: Graph to walk and modify.
+    freeze_batch_norm_delay: How many steps to wait before freezing
+    moving mean and variance and using them for batch normalization
+    is_training: Bool, true if training
 
   Raises:
     ValueError: When batch norm folding fails.
@@ -67,8 +81,7 @@ def _FoldFusedBatchNorms(graph):
     # `bn_op`. The '/' (i.e. `sep`) ensures that we reuse the existing scope
     # named `scope`. Otherwise, TF creates a unique scope whose name starts with
     # `scope`.
-    with graph.as_default(), graph.name_scope(scope + sep), ops.device(
-        match.bn_op.device):
+    with graph.as_default(), graph.name_scope(scope + sep):
       with graph.name_scope(scope + sep + 'BatchNorm_Fold' + sep):
         # new weights = old weights * gamma / sqrt(variance + epsilon)
         # new biases = -mean * gamma / sqrt(variance + epsilon) + beta
@@ -79,9 +92,18 @@ def _FoldFusedBatchNorms(graph):
             match.mean_tensor * multiplier_tensor,
             name='bias')
 
+        correction_scale, correction_recip, correction_offset = None, None, None
+        if is_training:
+          correction_scale, correction_recip, correction_offset = (
+              _ComputeBatchNormCorrections(
+                  context='',
+                  match=match,
+                  freeze_batch_norm_delay=freeze_batch_norm_delay,
+                  fused_batch_norm=True))
         # The shape of depthwise weights is different, so we need to reshape the
         # multiplier_tensor to ensure that the scaled_weight_tensor has the
         # expected shape.
+        weights = match.weight_tensor
         if match.layer_op.type == 'DepthwiseConv2dNative':
           new_shape = [
               match.weight_tensor.get_shape().as_list()[2],
@@ -90,15 +112,29 @@ def _FoldFusedBatchNorms(graph):
           multiplier_tensor = array_ops.reshape(
               multiplier_tensor, new_shape, name='scale_reshape')
 
+          if correction_scale is not None:
+            correction_scale = array_ops.reshape(
+                correction_scale, new_shape, name='correction_reshape')
+
+      if correction_scale is not None:
+        weights = math_ops.multiply(
+            correction_scale, weights, name='correction_mult')
+
       # TODO(suharshs): This naming of the following ops needs to carefully
       # follow the naming expected by quantize.py. Generalize the quantize code
       # to not require these delicate naming conventions.
       scaled_weight_tensor = math_ops.multiply(
-          match.weight_tensor, multiplier_tensor, name='mul_fold')
+          weights, multiplier_tensor, name='mul_fold')
 
       new_layer_tensor = _CloneWithNewOperands(
           match.layer_op, match.input_tensor, scaled_weight_tensor)
 
+      if correction_recip is not None:
+        new_layer_tensor = math_ops.multiply(
+            correction_recip, new_layer_tensor, name='post_conv_mul')
+        new_layer_tensor = math_ops.add(new_layer_tensor, (correction_offset),
+                                        'correction_add')
+
       bias_add_tensor = math_ops.add(
           new_layer_tensor, bias_tensor, name='add_fold')
 
@@ -165,6 +201,8 @@ def _FindFusedBatchNorms(graph):
   mean_pattern = graph_matcher.OpTypePattern('*')
   variance_pattern = graph_matcher.OpTypePattern('*')
 
+  moving_average_pattern = graph_matcher.OpTypePattern('*')
+  bn_decay_pattern = graph_matcher.OpTypePattern('*')
   conv_pattern = graph_matcher.OpTypePattern(
       'Conv2D|DepthwiseConv2dNative', inputs=[input_pattern, weight_pattern])
   # MatMul has a Reshape between it and FusedBatchNorm.
@@ -180,6 +218,11 @@ def _FindFusedBatchNorms(graph):
           conv_pattern, gamma_pattern, beta_pattern, mean_pattern,
           variance_pattern
       ])
+  conv_moving_average_sub_pattern = graph_matcher.OpTypePattern(
+      'Sub', inputs=[moving_average_pattern, conv_batch_norm_pattern])
+  # TODO(suharshs): Use a OneofPattern here when available
+  conv_moving_average_mul_pattern = graph_matcher.OpTypePattern(
+      'Mul', inputs=[conv_moving_average_sub_pattern, bn_decay_pattern])
   matmul_batch_norm_pattern = graph_matcher.OpTypePattern(
       'FusedBatchNorm',
       inputs=[
@@ -191,8 +234,34 @@ def _FindFusedBatchNorms(graph):
       inputs=[matmul_batch_norm_pattern,
               graph_matcher.OpTypePattern('*')])
 
+  matmul_moving_average_sub_pattern = graph_matcher.OpTypePattern(
+      'Sub', inputs=[moving_average_pattern, matmul_batch_norm_pattern])
+  matmul_moving_average_mul_pattern = graph_matcher.OpTypePattern(
+      'Mul', inputs=[matmul_moving_average_sub_pattern, bn_decay_pattern])
+
   conv_matcher = graph_matcher.GraphMatcher(conv_batch_norm_pattern)
   matmul_matcher = graph_matcher.GraphMatcher(matmul_bn_output_reshape_pattern)
+  conv_moving_average_mul_matcher = graph_matcher.GraphMatcher(
+      conv_moving_average_mul_pattern)
+  matmul_moving_average_mul_matcher = graph_matcher.GraphMatcher(
+      matmul_moving_average_mul_pattern)
+
+  def _GetMovingAverageTensors(graph, moving_avg_mul_matcher,
+                               moving_avg_sub_pattern, bn_op):
+    """Gets the moving mean and variance tensors and the batch norm momentum."""
+    for mul_match_result in moving_avg_mul_matcher.match_graph(graph):
+      sub_op = mul_match_result.get_op(moving_avg_sub_pattern)
+
+      if sub_op.inputs[1].name == bn_op.outputs[1].name:
+        # During training: Batch Mean is bn_op.outputs[1]
+        moving_mean_tensor = sub_op.inputs[0]
+        bn_decay_mean_tensor = mul_match_result.get_tensor(bn_decay_pattern)
+      if sub_op.inputs[1].name == bn_op.outputs[2].name:
+        # During training: Batch Var is bn_op.outputs[2]
+        moving_variance_tensor = sub_op.inputs[0]
+        bn_decay_var_tensor = mul_match_result.get_tensor(bn_decay_pattern)
+    return (moving_mean_tensor, bn_decay_mean_tensor, moving_variance_tensor,
+            bn_decay_var_tensor)
 
   def _GetCommonTensors(match_result, bn_op, bn_input_tensor):
     """Gets tensors needed for FusedBatchNormMatch from match_result."""
@@ -222,10 +291,14 @@ def _FindFusedBatchNorms(graph):
       # calculation, the variance is corrected by the term N/N-1 (Bessel's
       # correction). The variance tensor read from FuseBatchNorm has bessel's
       # correction applied, so we undo it here.
-      n = math_ops.cast(
-          array_ops.size(bn_input_tensor) / array_ops.size(mean_tensor),
-          dtypes.float32)
-      variance_tensor = bn_op.outputs[2] * (n - 1) / n
+      scope, sep, _ = bn_op.name.rpartition('/')
+      g = ops.get_default_graph()
+      with g.as_default(), g.name_scope(scope + sep):
+        n = math_ops.cast(
+            array_ops.size(bn_input_tensor) / array_ops.size(mean_tensor),
+            dtypes.float32)
+        variance_tensor = math_ops.multiply(
+            bn_op.outputs[2], (n - 1) / n, name='Undo_Bessel_Correction')
     else:
       mean_tensor = match_result.get_tensor(mean_pattern)
       variance_tensor = match_result.get_tensor(variance_pattern)
@@ -233,15 +306,30 @@ def _FindFusedBatchNorms(graph):
             variance_tensor)
 
   for match_result in conv_matcher.match_graph(graph):
+    moving_mean_tensor = None
+    moving_variance_tensor = None
+    bn_decay_mean_tensor = None
+    bn_decay_var_tensor = None
     layer_op = match_result.get_op(conv_pattern)
     layer_tensor = match_result.get_tensor(conv_pattern)
     bn_op = match_result.get_op(conv_batch_norm_pattern)
-    # In the case of convolution the output_tensor is the output of bn_op.
-    output_tensor = bn_op.outputs[0]
+    if bn_op.get_attr('is_training'):
+      (moving_mean_tensor, bn_decay_mean_tensor, moving_variance_tensor,
+       bn_decay_var_tensor) = _GetMovingAverageTensors(
+           graph,
+           moving_avg_mul_matcher=conv_moving_average_mul_matcher,
+           moving_avg_sub_pattern=conv_moving_average_sub_pattern,
+           bn_op=bn_op)
 
+    output_tensor = bn_op.outputs[0]
+    batch_epsilon_tensor = bn_op.get_attr('epsilon')
     (input_tensor, weight_tensor, gamma_tensor, beta_tensor, mean_tensor,
-     variance_tensor) = _GetCommonTensors(match_result, bn_op, layer_tensor)
-    yield _FusedBatchNormMatch(
+     variance_tensor) = _GetCommonTensors(
+         match_result,
+         bn_op,
+         layer_tensor,
+     )
+    yield _BatchNormMatch(
         layer_op=layer_op,
         bn_op=bn_op,
         output_tensor=output_tensor,
@@ -250,20 +338,38 @@ def _FindFusedBatchNorms(graph):
         gamma_tensor=gamma_tensor,
         beta_tensor=beta_tensor,
         mean_tensor=mean_tensor,
-        variance_tensor=variance_tensor)
+        variance_tensor=variance_tensor,
+        moving_mean_tensor=moving_mean_tensor,
+        moving_variance_tensor=moving_variance_tensor,
+        bn_decay_mean_tensor=bn_decay_mean_tensor,
+        bn_decay_var_tensor=bn_decay_var_tensor,
+        batch_epsilon_tensor=batch_epsilon_tensor)
 
   for match_result in matmul_matcher.match_graph(graph):
+    moving_mean_tensor = None
+    moving_variance_tensor = None
+    bn_decay_mean_tensor = None
+    bn_decay_var_tensor = None
     layer_op = match_result.get_op(matmul_pattern)
     layer_tensor = match_result.get_tensor(matmul_pattern)
     bn_op = match_result.get_op(matmul_batch_norm_pattern)
+    if bn_op.get_attr('is_training'):
+      (moving_mean_tensor, bn_decay_mean_tensor, moving_variance_tensor,
+       bn_decay_var_tensor) = _GetMovingAverageTensors(
+           graph,
+           moving_avg_mul_matcher=matmul_moving_average_mul_matcher,
+           moving_avg_sub_pattern=matmul_moving_average_sub_pattern,
+           bn_op=bn_op)
+
     # In the MatMul case, the output of batch norm is reshaped back into a
     # 2D tensor, so the output_tensor is the output of the Reshape op.
     output_reshape_op = match_result.get_op(matmul_bn_output_reshape_pattern)
     output_tensor = output_reshape_op.outputs[0]
+    batch_epsilon_tensor = bn_op.get_attr('epsilon')
 
     (input_tensor, weight_tensor, gamma_tensor, beta_tensor, mean_tensor,
      variance_tensor) = _GetCommonTensors(match_result, bn_op, layer_tensor)
-    yield _FusedBatchNormMatch(
+    yield _BatchNormMatch(
         layer_op=layer_op,
         bn_op=bn_op,
         output_tensor=output_tensor,
@@ -272,15 +378,21 @@ def _FindFusedBatchNorms(graph):
         gamma_tensor=gamma_tensor,
         beta_tensor=beta_tensor,
         mean_tensor=mean_tensor,
-        variance_tensor=variance_tensor)
+        variance_tensor=variance_tensor,
+        moving_mean_tensor=moving_mean_tensor,
+        moving_variance_tensor=moving_variance_tensor,
+        bn_decay_mean_tensor=bn_decay_mean_tensor,
+        bn_decay_var_tensor=bn_decay_var_tensor,
+        batch_epsilon_tensor=batch_epsilon_tensor)
 
 
-class _FusedBatchNormMatch(object):
-  """Contains all information related to a found FusedBatchNorm."""
+class _BatchNormMatch(object):
+  """Contains all information related to a found Fused/UnfusedBatchNorm."""
 
   def __init__(self, layer_op, bn_op, output_tensor, input_tensor,
                weight_tensor, gamma_tensor, beta_tensor, mean_tensor,
-               variance_tensor):
+               variance_tensor, moving_mean_tensor, moving_variance_tensor,
+               bn_decay_mean_tensor, bn_decay_var_tensor, batch_epsilon_tensor):
     self._layer_op = layer_op
     self._bn_op = bn_op
     self._output_tensor = output_tensor
@@ -290,6 +402,11 @@ class _FusedBatchNormMatch(object):
     self._beta_tensor = beta_tensor
     self._mean_tensor = mean_tensor
     self._variance_tensor = variance_tensor
+    self._moving_mean_tensor = moving_mean_tensor
+    self._moving_variance_tensor = moving_variance_tensor
+    self._bn_decay_mean_tensor = bn_decay_mean_tensor
+    self._bn_decay_var_tensor = bn_decay_var_tensor
+    self._batch_epsilon_tensor = batch_epsilon_tensor
 
   @property
   def layer_op(self):
@@ -327,8 +444,28 @@ class _FusedBatchNormMatch(object):
   def variance_tensor(self):
     return self._variance_tensor
 
+  @property
+  def moving_mean_tensor(self):
+    return self._moving_mean_tensor
+
+  @property
+  def moving_variance_tensor(self):
+    return self._moving_variance_tensor
+
+  @property
+  def batch_epsilon_tensor(self):
+    return self._batch_epsilon_tensor
+
+  @property
+  def bn_decay_mean_tensor(self):
+    return self._bn_decay_mean_tensor
+
+  @property
+  def bn_decay_var_tensor(self):
+    return self._bn_decay_var_tensor
+
 
-def _FoldUnfusedBatchNorms(graph):
+def _FoldUnfusedBatchNorms(graph, freeze_batch_norm_delay, is_training):
   """Finds unfused batch norm layers and folds them into preceding layers.
 
   Folding only affects the following layers: Conv2D, fully connected, depthwise
@@ -336,6 +473,9 @@ def _FoldUnfusedBatchNorms(graph):
 
   Args:
     graph: Graph to walk and modify.
+    freeze_batch_norm_delay: How many steps to wait before freezing
+    moving mean and variance and using them for batch normalization
+    is_training: Bool, True if training
 
   Raises:
     ValueError: When batch norm folding fails.
@@ -346,7 +486,12 @@ def _FoldUnfusedBatchNorms(graph):
     has_scaling = _HasScaling(graph, input_to_ops_map, bn)
 
     # The mangling code intimately depends on BatchNorm node's internals.
-    original_op, folded_op = _CreateFoldedOp(graph, bn, has_scaling=has_scaling)
+    original_op, folded_op = _CreateFoldedOp(
+        graph,
+        bn,
+        has_scaling=has_scaling,
+        freeze_batch_norm_delay=freeze_batch_norm_delay,
+        is_training=is_training)
 
     activation = common.GetEndpointActivationOp(graph, bn)
     if activation:
@@ -407,7 +552,186 @@ def _HasScaling(graph, input_to_ops_map, bn):
   return sum(1 for op in rsqrt_consumers if op.type == 'Mul') == 1
 
 
-def _CreateFoldedOp(graph, context, has_scaling):
+def _GetBatchNormParams(graph, context, has_scaling):
+  """Extracts relevant tensors for folding batch norms.
+
+  Args:
+    graph: Graph to inspect.
+    context: The scope under which we look for batch norm params
+    has_scaling: Bool that specifies if scaling is done as part of batch
+    norm
+
+  Returns:
+   _BatchNormMatch containing all required batch norm parameters
+  """
+  gamma_tensor = None
+  batch_mean_tensor = None
+  batch_variance_tensor = None
+  moving_mean_tensor = None
+  moving_variance_tensor = None
+  batch_epsilon_tensor = None
+  bn_decay_mean_tensor = None
+  bn_decay_var_tensor = None
+
+  split_context = context.split('/')
+  base_context = split_context[-1]
+
+  oplist = graph.get_operations()
+  op_suffix_gamma = base_context + '/BatchNorm/gamma'
+  op_suffix_mean = base_context + '/BatchNorm/moments/Squeeze'
+  op_suffix_variance = base_context + '/BatchNorm/moments/Squeeze_1'
+  op_suffix_moving_variance = base_context + '/BatchNorm/moving_variance/read'
+  op_suffix_moving_mean = base_context + '/BatchNorm/moving_mean/read'
+  op_suffix_epsilon = base_context + '/BatchNorm/batchnorm/add/y'
+  op_suffix_bn_decay_mean = base_context + '/BatchNorm/AssignMovingAvg/decay'
+  op_suffix_bn_decay_var = base_context + '/BatchNorm/AssignMovingAvg_1/decay'
+
+  # Parse through list of ops to find relevant ops
+  for op in oplist:
+    if op.name.endswith(op_suffix_mean):
+      # This is an efficient way to check for two things:
+      # Is batch norm present and is it training mode?
+      # Batch statistics are computed only during batch norm in training
+      batch_mean_tensor = graph.get_tensor_by_name(op.name + ':0')
+    if op.name.endswith(op_suffix_variance):
+      batch_variance_tensor = graph.get_tensor_by_name(op.name + ':0')
+    if op.name.endswith(op_suffix_moving_mean):
+      moving_mean_tensor = graph.get_tensor_by_name(op.name + ':0')
+    if op.name.endswith(op_suffix_moving_variance):
+      moving_variance_tensor = graph.get_tensor_by_name(op.name + ':0')
+    if op.name.endswith(op_suffix_epsilon):
+      batch_epsilon_tensor = graph.get_tensor_by_name(op.name + ':0')
+    if op.name.endswith(op_suffix_bn_decay_mean):
+      bn_decay_mean_tensor = graph.get_tensor_by_name(op.name + ':0')
+    if op.name.endswith(op_suffix_bn_decay_var):
+      bn_decay_var_tensor = graph.get_tensor_by_name(op.name + ':0')
+    if has_scaling:
+      if op.name.endswith(op_suffix_gamma):
+        gamma_tensor = graph.get_tensor_by_name(op.name + ':0')
+
+  if not has_scaling:
+    gamma_tensor = array_ops.ones(batch_mean_tensor.shape)
+
+  return _BatchNormMatch(
+      layer_op=None,
+      bn_op=None,
+      output_tensor=None,
+      input_tensor=None,
+      weight_tensor=None,
+      gamma_tensor=gamma_tensor,
+      beta_tensor=None,
+      mean_tensor=batch_mean_tensor,
+      variance_tensor=batch_variance_tensor,
+      moving_mean_tensor=moving_mean_tensor,
+      moving_variance_tensor=moving_variance_tensor,
+      bn_decay_mean_tensor=bn_decay_mean_tensor,
+      bn_decay_var_tensor=bn_decay_var_tensor,
+      batch_epsilon_tensor=batch_epsilon_tensor)
+
+
+def _ComputeBatchNormCorrections(context, match, freeze_batch_norm_delay,
+                                 fused_batch_norm):
+  """Computes batch norm correction params.
+
+     Before batch normalization is frozen:
+     We use batch statistics for batch norm.
+       correction_scale = sigma_b/sigma_mv
+       correction_recip = 1/correction_scale
+       correction_offset = 0
+
+     After batch normalization is frozen:
+      correction_scale = sigma_b/sigma_mv
+      correction_recip = 1
+      correction_offset =  gamma*(mu_b/sigma_b-mu_mv/sigma_mv).
+
+     Batch norm is frozen if global_step > bn_freeze_delay.
+     The corrections ensure that:
+     a) The weights are quantized after scaling by gamma/sigma_mv. This enables
+     smoother training as the scaling on the weights changes slowly, rather than
+     jump across mini-batches
+     b) Changing the values of the corrections allows for one to switch between
+     using batch statistics to using moving mean and average, without requiring
+     changes to batch_norm
+
+
+  Args:
+    context: The scope under which we look for batch norm params
+    match: Object containg required batch norm tensors for correction
+      computation
+    freeze_batch_norm_delay: Delay in steps at which computation switches
+      from regular batch norm to frozen mean and variance.
+    fused_batch_norm: Bool, true if fused batch norm is used
+
+  Returns:
+    A tuple of correction_scale, correction_recip, correction_offset
+  """
+
+  g = ops.get_default_graph()
+  with g.name_scope(context + 'batch_norm_correction'):
+    recip_sigma_mv = math_ops.rsqrt(
+        match.moving_variance_tensor + match.batch_epsilon_tensor)
+    recip_sigma = math_ops.rsqrt(
+        match.variance_tensor + match.batch_epsilon_tensor)
+    correction_scale = math_ops.divide(
+        recip_sigma_mv, recip_sigma, name='scale_compute')
+    correction_scale = array_ops.identity(
+        correction_scale, name='correction_scale')
+    correction_recip = math_ops.reciprocal(
+        correction_scale, name='reciprocal_compute')
+    correction_offset = math_ops.multiply(
+        match.gamma_tensor,
+        match.mean_tensor * recip_sigma -
+        match.moving_mean_tensor * recip_sigma_mv,
+        name='offset_compute')
+
+    if freeze_batch_norm_delay is not None:
+      use_mv_avg = math_ops.greater_equal(
+          training_util.get_or_create_global_step(),
+          freeze_batch_norm_delay,
+          name='use_moving_average')
+    else:
+      use_mv_avg = False
+
+    bn_decay_zero = 0.0
+    bn_decay_mean_consumers = list(match.bn_decay_mean_tensor.consumers())
+    bn_decay_var_consumers = list(match.bn_decay_mean_tensor.consumers())
+
+    bn_decay_mean_out = utils.smart_cond(
+        use_mv_avg,
+        lambda: bn_decay_zero,
+        lambda: match.bn_decay_mean_tensor,
+        name='freeze_moving_mean')
+    graph_editor.reroute_ts(
+        [bn_decay_mean_out], [match.bn_decay_mean_tensor],
+        can_modify=bn_decay_mean_consumers)
+
+    if fused_batch_norm is False:
+      bn_decay_var_consumers = list(match.bn_decay_var_tensor.consumers())
+      bn_decay_var_out = utils.smart_cond(
+          use_mv_avg,
+          lambda: bn_decay_zero,
+          lambda: match.bn_decay_var_tensor,
+          name='freeze_moving_var')
+      graph_editor.reroute_ts(
+          [bn_decay_var_out], [match.bn_decay_var_tensor],
+          can_modify=bn_decay_var_consumers)
+
+    correction_recip = utils.smart_cond(
+        use_mv_avg,
+        lambda: array_ops.ones(correction_scale.shape),
+        lambda: correction_recip,
+        name='correction_recip')
+
+    correction_offset = utils.smart_cond(
+        use_mv_avg,
+        lambda: correction_offset,
+        lambda: array_ops.zeros(correction_offset.shape),
+        name='correction_offset')
+  return correction_scale, correction_recip, correction_offset
+
+
+def _CreateFoldedOp(graph, context, has_scaling, freeze_batch_norm_delay,
+                    is_training):
   """Folds in batch norm layer into preceding convolution or FC layer.
 
   Creates 3 new nodes, connects their inputs and adds them to the graph:
@@ -419,6 +743,9 @@ def _CreateFoldedOp(graph, context, has_scaling):
     context: String, batch norm context, i.e. node into which BatchNorm is
         nested.
     has_scaling: Whether the batch norm has scaling enabled.
+    freeze_batch_norm_delay: How many steps to wait before freezing
+    moving mean and variance and using them for batch normalization
+    is_training: Bool, true if training
 
   Raises:
     ValueError: When operation type is not supported, or input and output tensor
@@ -435,19 +762,43 @@ def _CreateFoldedOp(graph, context, has_scaling):
                                           mul_scale_name)
   op_below = mul_scale.inputs[0].op
   weights = op_below.inputs[1]
-
+  match = _GetBatchNormParams(
+      graph=graph, context=context, has_scaling=has_scaling)
+  correction_scale, correction_recip, correction_offset = None, None, None
+  if is_training:
+    correction_scale, correction_recip, correction_offset = (
+        _ComputeBatchNormCorrections(
+            context=context,
+            match=match,
+            freeze_batch_norm_delay=freeze_batch_norm_delay,
+            fused_batch_norm=False))
   # Special handling for weights of depthwise convolution.
   if op_below.type == 'DepthwiseConv2dNative':
-    new_shape = [weights.get_shape().as_list()[2],
-                 weights.get_shape().as_list()[3]]
+    new_shape = [
+        weights.get_shape().as_list()[2],
+        weights.get_shape().as_list()[3]
+    ]
     scale_name = 'mul' if has_scaling else 'Rsqrt'
-    scale = graph.get_operation_by_name(context + '/BatchNorm/batchnorm/' +
-                                        scale_name)
+    scale = graph.get_operation_by_name(
+        context + '/BatchNorm/batchnorm/' + scale_name)
     scale = array_ops.reshape(scale.outputs[0], new_shape,
                               context + '/scale_reshape')
-    mul_fold = _CloneOp(mul_scale, context + '/mul_fold',
-                        [(0, weights), (1, scale)])
+
+    if correction_scale is not None:
+      correction_scale = array_ops.reshape(correction_scale, new_shape,
+                                           context + '/correction_reshape')
+      with ops.device(mul_scale.device):
+        weights = math_ops.multiply(correction_scale, weights,
+                                    context + '/correction_mult')
+
+    mul_fold = _CloneOp(mul_scale, context + '/mul_fold', [(0, weights),
+                                                           (1, scale)])
   elif op_below.type in ['Conv2D', 'MatMul']:
+
+    if correction_scale is not None:
+      with ops.device(mul_scale.device):
+        weights = math_ops.multiply(correction_scale, weights,
+                                    context + '/correction_mult')
     mul_fold = _CloneOp(mul_scale, context + '/mul_fold', [(0, weights)])
   else:
     raise ValueError('Cannot handle operation of type: %s' % op_below.op)
@@ -456,10 +807,17 @@ def _CreateFoldedOp(graph, context, has_scaling):
   conv_or_fc_folded = _CloneOp(op_below, op_below.name + '_Fold',
                                [(1, mul_fold.outputs[0])])
 
-  add_shift = graph.get_operation_by_name(context +
-                                          '/BatchNorm/batchnorm/add_1')
-  add_fold = _CloneOp(add_shift, context + '/add_fold',
-                      [(0, conv_or_fc_folded.outputs[0])])
+  add_shift = graph.get_operation_by_name(
+      context + '/BatchNorm/batchnorm/add_1')
+
+  corrected_output = conv_or_fc_folded.outputs[0]
+  if correction_offset is not None:
+    with ops.device(conv_or_fc_folded.device):
+      corrected_output = math_ops.multiply(correction_recip, corrected_output,
+                                           context + '/post_conv_mul')
+      corrected_output = math_ops.add(corrected_output, (correction_offset),
+                                      context + '/correction_add')
+  add_fold = _CloneOp(add_shift, context + '/add_fold', [(0, corrected_output)])
   _AssertShapesMatch('add_fold', add_fold.inputs[0], add_fold.outputs[0])
   return add_shift, add_fold
 
diff --git a/tensorflow/contrib/quantize/python/fold_batch_norms_test.py b/tensorflow/contrib/quantize/python/fold_batch_norms_test.py
index ecf321ff573181c7a2e325770a8dde223bf0c021..330bd8a6474c18b236b635d930e7a1df9594d84f 100644
--- a/tensorflow/contrib/quantize/python/fold_batch_norms_test.py
+++ b/tensorflow/contrib/quantize/python/fold_batch_norms_test.py
@@ -46,26 +46,27 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
 
   def _RunTestOverParameters(self, test_fn):
     parameters_list = [
-        # (relu, relu_op_name, with_bypass, has_scaling, fused_batch_norm)
-        (nn_ops.relu6, 'Relu6', False, False, False),
-        (nn_ops.relu, 'Relu', False, False, False),
-        (nn_ops.relu6, 'Relu6', True, False, False),
-        (nn_ops.relu, 'Relu', True, False, False),
-        (nn_ops.relu6, 'Relu6', False, True, False),
-        (nn_ops.relu, 'Relu', False, True, False),
-        (nn_ops.relu6, 'Relu6', True, True, False),
-        (nn_ops.relu, 'Relu', True, True, False),
+        # (relu, relu_op_name, with_bypass, has_scaling, fused_batch_norm,
+        # freeze_batch_norm_delay)
+        (nn_ops.relu6, 'Relu6', False, False, False, 100),
+        (nn_ops.relu, 'Relu', False, False, False, None),
+        (nn_ops.relu6, 'Relu6', True, False, False, 100),
+        (nn_ops.relu, 'Relu', True, False, False, None),
+        (nn_ops.relu6, 'Relu6', False, True, False, 100),
+        (nn_ops.relu, 'Relu', False, True, False, None),
+        (nn_ops.relu6, 'Relu6', True, True, False, 100),
+        (nn_ops.relu, 'Relu', True, True, False, None),
         # Fused batch norm always has scaling enabled.
-        (nn_ops.relu6, 'Relu6', False, True, True),
-        (nn_ops.relu, 'Relu', False, True, True),
-        (nn_ops.relu6, 'Relu6', True, True, True),
-        (nn_ops.relu, 'Relu', True, True, True),
+        (nn_ops.relu6, 'Relu6', False, True, True, None),
+        (nn_ops.relu, 'Relu', False, True, True, 100),
+        (nn_ops.relu6, 'Relu6', True, True, True, None),
+        (nn_ops.relu, 'Relu', True, True, True, 100),
     ]
     for params in parameters_list:
-      test_fn(params[0], params[1], params[2], params[3], params[4])
+      test_fn(params[0], params[1], params[2], params[3], params[4], params[5])
 
   def _TestFoldConv2d(self, relu, relu_op_name, with_bypass, has_scaling,
-                      fused_batch_norm):
+                      fused_batch_norm, freeze_batch_norm_delay):
     """Tests folding cases: inputs -> Conv2d with batch norm -> Relu*.
 
     Args:
@@ -75,6 +76,8 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
         inputs to just before Relu*.
       has_scaling: Bool, when true the batch norm has scaling.
       fused_batch_norm: Bool, when true the batch norm is fused.
+      freeze_batch_norm_delay: None or the number of steps after which training
+      switches to using frozen mean and variance
     """
     g = ops.Graph()
     with g.as_default():
@@ -99,12 +102,13 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
         node = math_ops.add(inputs, node, name='test/Add')
         relu(node, name='test/' + relu_op_name)
 
-      fold_batch_norms.FoldBatchNorms(g)
+      fold_batch_norms.FoldBatchNorms(
+          g, is_training=True, freeze_batch_norm_delay=freeze_batch_norm_delay)
 
     folded_mul = g.get_operation_by_name(scope + '/mul_fold')
     self.assertEqual(folded_mul.type, 'Mul')
     self._AssertInputOpsAre(folded_mul, [
-        scope + '/weights/read',
+        scope + '/correction_mult',
         self._BatchNormMultiplierName(scope, has_scaling, fused_batch_norm)
     ])
     self._AssertOutputGoesToOps(folded_mul, g, [scope + '/Conv2D_Fold'])
@@ -113,12 +117,12 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
     self.assertEqual(folded_conv.type, 'Conv2D')
     self._AssertInputOpsAre(folded_conv,
                             [scope + '/mul_fold', inputs.op.name])
-    self._AssertOutputGoesToOps(folded_conv, g, [scope + '/add_fold'])
+    self._AssertOutputGoesToOps(folded_conv, g, [scope + '/post_conv_mul'])
 
     folded_add = g.get_operation_by_name(scope + '/add_fold')
     self.assertEqual(folded_add.type, 'Add')
     self._AssertInputOpsAre(folded_add, [
-        scope + '/Conv2D_Fold',
+        scope + '/correction_add',
         self._BathNormBiasName(scope, fused_batch_norm)
     ])
     output_op_names = ['test/Add' if with_bypass else 'test/' + relu_op_name]
@@ -128,7 +132,8 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
     self._RunTestOverParameters(self._TestFoldConv2d)
 
   def _TestFoldConv2dUnknownShape(self, relu, relu_op_name, with_bypass,
-                                  has_scaling, fused_batch_norm):
+                                  has_scaling, fused_batch_norm,
+                                  freeze_batch_norm_delay):
     """Tests folding cases: inputs -> Conv2d with batch norm -> Relu*.
 
     Tests that folding works even with an input shape where some dimensions are
@@ -141,6 +146,8 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
         inputs to just before Relu*.
       has_scaling: Bool, when true the batch norm has scaling.
       fused_batch_norm: Bool, when true the batch norm is fused.
+      freeze_batch_norm_delay: None or the number of steps after which training
+      switches to using frozen mean and variance
     """
     g = ops.Graph()
     with g.as_default():
@@ -164,12 +171,13 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
         node = math_ops.add(inputs, node, name='test/Add')
         relu(node, name='test/' + relu_op_name)
 
-      fold_batch_norms.FoldBatchNorms(g)
+      fold_batch_norms.FoldBatchNorms(
+          g, is_training=True, freeze_batch_norm_delay=freeze_batch_norm_delay)
 
     folded_mul = g.get_operation_by_name(scope + '/mul_fold')
     self.assertEqual(folded_mul.type, 'Mul')
     self._AssertInputOpsAre(folded_mul, [
-        scope + '/weights/read',
+        scope + '/correction_mult',
         self._BatchNormMultiplierName(scope, has_scaling, fused_batch_norm)
     ])
     self._AssertOutputGoesToOps(folded_mul, g, [scope + '/Conv2D_Fold'])
@@ -177,12 +185,12 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
     folded_conv = g.get_operation_by_name(scope + '/Conv2D_Fold')
     self.assertEqual(folded_conv.type, 'Conv2D')
     self._AssertInputOpsAre(folded_conv, [scope + '/mul_fold', inputs.op.name])
-    self._AssertOutputGoesToOps(folded_conv, g, [scope + '/add_fold'])
+    self._AssertOutputGoesToOps(folded_conv, g, [scope + '/post_conv_mul'])
 
     folded_add = g.get_operation_by_name(scope + '/add_fold')
     self.assertEqual(folded_add.type, 'Add')
     self._AssertInputOpsAre(folded_add, [
-        scope + '/Conv2D_Fold',
+        scope + '/correction_add',
         self._BathNormBiasName(scope, fused_batch_norm)
     ])
     output_op_names = ['test/Add' if with_bypass else 'test/' + relu_op_name]
@@ -192,7 +200,8 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
     self._RunTestOverParameters(self._TestFoldConv2dUnknownShape)
 
   def _TestFoldFullyConnectedLayer(self, relu, relu_op_name, with_bypass,
-                                   has_scaling, fused_batch_norm):
+                                   has_scaling, fused_batch_norm,
+                                   freeze_batch_norm_delay):
     """Tests folding cases: inputs -> FC with batch norm -> Relu*.
 
     Args:
@@ -202,6 +211,8 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
         inputs to just before Relu*.
       has_scaling: Bool, when true the batch norm has scaling.
       fused_batch_norm: Bool, when true the batch norm is fused.
+      freeze_batch_norm_delay: None or the number of steps after which training
+      switches to using frozen mean and variance
     """
     g = ops.Graph()
     with g.as_default():
@@ -223,12 +234,13 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
         node = math_ops.add(inputs, node, name='test/Add')
         relu(node, name='test/' + relu_op_name)
 
-      fold_batch_norms.FoldBatchNorms(g)
+      fold_batch_norms.FoldBatchNorms(
+          g, is_training=True, freeze_batch_norm_delay=freeze_batch_norm_delay)
 
     folded_mul = g.get_operation_by_name(scope + '/mul_fold')
     self.assertEqual(folded_mul.type, 'Mul')
     self._AssertInputOpsAre(folded_mul, [
-        scope + '/weights/read',
+        scope + '/correction_mult',
         self._BatchNormMultiplierName(scope, has_scaling, fused_batch_norm)
     ])
     self._AssertOutputGoesToOps(folded_mul, g, [scope + '/MatMul_Fold'])
@@ -237,12 +249,12 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
     self.assertEqual(folded_conv.type, 'MatMul')
     self._AssertInputOpsAre(folded_conv,
                             [scope + '/mul_fold', inputs.op.name])
-    self._AssertOutputGoesToOps(folded_conv, g, [scope + '/add_fold'])
+    self._AssertOutputGoesToOps(folded_conv, g, [scope + '/post_conv_mul'])
 
     folded_add = g.get_operation_by_name(scope + '/add_fold')
     self.assertEqual(folded_add.type, 'Add')
     self._AssertInputOpsAre(folded_add, [
-        scope + '/MatMul_Fold',
+        scope + '/correction_add',
         self._BathNormBiasName(scope, fused_batch_norm)
     ])
     output_op_names = ['test/Add' if with_bypass else 'test/' + relu_op_name]
@@ -252,7 +264,8 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
     self._RunTestOverParameters(self._TestFoldFullyConnectedLayer)
 
   def _TestFoldDepthwiseConv2d(self, relu, relu_op_name, with_bypass,
-                               has_scaling, fused_batch_norm):
+                               has_scaling, fused_batch_norm,
+                               freeze_batch_norm_delay):
     """Tests folding: inputs -> DepthwiseConv2d with batch norm -> Relu*.
 
     Args:
@@ -262,6 +275,8 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
         inputs to just before Relu*.
       has_scaling: Bool, when true the batch norm has scaling.
       fused_batch_norm: Bool, when true the batch norm is fused.
+      freeze_batch_norm_delay: None or the number of steps after which training
+      switches to using frozen mean and variance
     """
     g = ops.Graph()
     with g.as_default():
@@ -286,7 +301,8 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
         node = math_ops.add(inputs, node, name='test/Add')
         relu(node, name='test/' + relu_op_name)
 
-      fold_batch_norms.FoldBatchNorms(g)
+      fold_batch_norms.FoldBatchNorms(
+          g, is_training=True, freeze_batch_norm_delay=freeze_batch_norm_delay)
 
     folded_mul = g.get_operation_by_name(scope + '/mul_fold')
     self.assertEqual(folded_mul.type, 'Mul')
@@ -295,8 +311,7 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
     else:
       scale_reshape_op_name = scope + '/scale_reshape'
     self._AssertInputOpsAre(folded_mul,
-                            [scope + '/depthwise_weights/read',
-                             scale_reshape_op_name])
+                            [scope + '/correction_mult', scale_reshape_op_name])
     self._AssertOutputGoesToOps(folded_mul, g, [scope + '/depthwise_Fold'])
 
     scale_reshape = g.get_operation_by_name(scale_reshape_op_name)
@@ -311,12 +326,12 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
     self.assertEqual(folded_conv.type, 'DepthwiseConv2dNative')
     self._AssertInputOpsAre(folded_conv,
                             [scope + '/mul_fold', inputs.op.name])
-    self._AssertOutputGoesToOps(folded_conv, g, [scope + '/add_fold'])
+    self._AssertOutputGoesToOps(folded_conv, g, [scope + '/post_conv_mul'])
 
     folded_add = g.get_operation_by_name(scope + '/add_fold')
     self.assertEqual(folded_add.type, 'Add')
     self._AssertInputOpsAre(folded_add, [
-        scope + '/depthwise_Fold',
+        scope + '/correction_add',
         self._BathNormBiasName(scope, fused_batch_norm)
     ])
     output_op_names = ['test/Add' if with_bypass else 'test/' + relu_op_name]
@@ -326,7 +341,8 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
     self._RunTestOverParameters(self._TestFoldDepthwiseConv2d)
 
   def _TestCompareFoldAndUnfolded(self, relu, relu_op_name, with_bypass,
-                                  has_scaling, fused_batch_norm):
+                                  has_scaling, fused_batch_norm,
+                                  freeze_batch_norm_delay):
     """Tests that running folded and unfolded BN returns the same results.
 
     Args:
@@ -336,6 +352,8 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
         inputs to just before Relu*.
       has_scaling: Bool, when true the batch norm has scaling.
       fused_batch_norm: Bool, when true the batch norm is fused.
+      freeze_batch_norm_delay: None or the number of steps after which training
+      switches to using frozen mean and variance
     """
     random_seed.set_random_seed(1234)
     unfolded_g = ops.Graph()
@@ -361,11 +379,12 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
       if with_bypass:
         node = math_ops.add(inputs, node, name='test/Add')
       relu_node = relu(node, name='test/' + relu_op_name)
-
     folded_g = copy_graph.CopyGraph(unfolded_g)
     with folded_g.as_default():
-      fold_batch_norms.FoldBatchNorms(folded_g)
-
+      fold_batch_norms.FoldBatchNorms(
+          folded_g,
+          is_training=True,
+          freeze_batch_norm_delay=freeze_batch_norm_delay)
     with session.Session(graph=unfolded_g) as sess:
       sess.run(variables.global_variables_initializer())
       grad_node = gradients.gradients(relu_node, inputs)
diff --git a/tensorflow/contrib/quantize/python/graph_matcher.py b/tensorflow/contrib/quantize/python/graph_matcher.py
index e3581cc55905a0af7d0464bc0ec673d3ed7f0363..b458f039df0523b5b8b07cff7d14643154124b95 100644
--- a/tensorflow/contrib/quantize/python/graph_matcher.py
+++ b/tensorflow/contrib/quantize/python/graph_matcher.py
@@ -18,8 +18,19 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import abc
 
-class OpTypePattern(object):
+
+class Pattern(object):
+  """The parent class of all patterns (e.g. OpTypePattern and OneofPattern)."""
+
+  @abc.abstractmethod
+  def match(self, op, tensor):
+    """Returns the result of matching op/tensor against this pattern."""
+    raise NotImplementedError('Method "match" not implemented.')
+
+
+class OpTypePattern(Pattern):
   """A tree pattern that matches TF expressions with certain op types."""
 
   def __init__(self, op_type, name=None, inputs=None):
@@ -34,7 +45,7 @@ class OpTypePattern(object):
         similar TF op types.
       name: Optional string. The name of the pattern that can be looked up in
         MatchResult.
-      inputs: Optional list of `OpTypePattern`s or strings that specify the
+      inputs: Optional list of `Pattern`s or strings that specify the
         patterns for the inputs of a matching op. If None, this pattern accepts
         any inputs of a matching op.
     """
@@ -43,22 +54,51 @@ class OpTypePattern(object):
     if inputs is None:
       inputs = []
     self._inputs = [
-        input_pattern if isinstance(input_pattern, OpTypePattern) else
-        OpTypePattern(input_pattern) for input_pattern in inputs
+        input_pattern
+        if isinstance(input_pattern, Pattern) else OpTypePattern(input_pattern)
+        for input_pattern in inputs
     ]
 
-  @property
-  def op_type(self):
-    return self._op_type
-
-  @property
-  def inputs(self):
-    return self._inputs
-
   @property
   def name(self):
     return self._name
 
+  def match(self, op, tensor):
+    if self._op_type != '*':
+      if op.type not in self._op_type.split('|'):
+        return None
+
+    match_result = MatchResult()
+    match_result.add(self, op, tensor)
+
+    if not self._inputs:
+      # If pattern.inputs is empty, skips the rest and accepts all the inputs.
+      return match_result
+
+    if len(op.inputs) != len(self._inputs):
+      return None
+
+    for input_tensor, input_pattern in zip(op.inputs, self._inputs):
+      input_match_result = input_pattern.match(input_tensor.op, input_tensor)
+      if input_match_result is None:
+        return None
+      match_result.merge_from(input_match_result)
+    return match_result
+
+
+class OneofPattern(Pattern):
+  """Matches one of the given sub-patterns."""
+
+  def __init__(self, sub_patterns):
+    self._sub_patterns = sub_patterns
+
+  def match(self, op, tensor):
+    for sub_pattern in self._sub_patterns:
+      match_result = sub_pattern.match(op, tensor)
+      if match_result is not None:
+        return match_result
+    return None
+
 
 class MatchResult(object):
   r"""Encapsulates the result of a match done by GraphMatcher.
@@ -102,16 +142,36 @@ class MatchResult(object):
       return pattern_or_name
 
     if isinstance(pattern_or_name, str):
+      if pattern_or_name not in self._name_to_pattern:
+        return None
       return self._name_to_pattern[pattern_or_name]
 
     raise ValueError('pattern_or_name has type %s. Expect OpTypePattern or str.'
                      % type(pattern_or_name))
 
+  def _get_op_tensor(self, pattern_or_name):
+    pattern = self._to_pattern(pattern_or_name)
+    if pattern is None:
+      return None
+
+    if pattern not in self._pattern_to_op_tensor:
+      return None
+
+    return self._pattern_to_op_tensor[pattern]
+
   def get_op(self, pattern_or_name):
-    return self._pattern_to_op_tensor[self._to_pattern(pattern_or_name)][0]
+    op_tensor = self._get_op_tensor(pattern_or_name)
+    return op_tensor[0] if op_tensor else None
 
   def get_tensor(self, pattern_or_name):
-    return self._pattern_to_op_tensor[self._to_pattern(pattern_or_name)][1]
+    op_tensor = self._get_op_tensor(pattern_or_name)
+    return op_tensor[1] if op_tensor else None
+
+  def merge_from(self, other_match_result):
+    # pylint: disable=protected-access
+    self._pattern_to_op_tensor.update(other_match_result._pattern_to_op_tensor)
+    self._name_to_pattern.update(other_match_result._name_to_pattern)
+    # pylint: enable=protected-access
 
 
 class GraphMatcher(object):
@@ -121,7 +181,7 @@ class GraphMatcher(object):
     """Initializes a GraphMatcher.
 
     Args:
-      pattern: The `OpTypePattern` against which `GraphMatcher` matches
+      pattern: The `Pattern` against which `GraphMatcher` matches
         subgraphs.
     """
     self._pattern = pattern
@@ -133,7 +193,7 @@ class GraphMatcher(object):
     with key `pattern`.
 
     Args:
-      pattern: An `OpTypePattern`.
+      pattern: An `Pattern`.
       op: A `tf.Operation` to match against the pattern.
       tensor: the output `tf.Tensor` of `op` that is used by the matching op of
         `pattern`'s parent. Can be None if `pattern` is already the root of the
@@ -142,20 +202,11 @@ class GraphMatcher(object):
     Returns:
       True if an TF expression rooted at `op` matches `pattern`.
     """
-    if pattern.op_type != '*':
-      if op.type not in pattern.op_type.split('|'):
-        return False
-
-    self._match_result.add(pattern, op, tensor)
-
-    if not pattern.inputs:
-      # If pattern.inputs is empty, skips the rest and accepts all the inputs.
-      return True
-
-    return len(op.inputs) == len(pattern.inputs) and all([
-        self._match_pattern(input_pattern, input_tensor.op, input_tensor)
-        for input_tensor, input_pattern in zip(op.inputs, pattern.inputs)
-    ])
+    match_result = pattern.match(op, tensor)
+    if match_result is None:
+      return False
+    self._match_result.merge_from(match_result)
+    return True
 
   def match_op(self, op):
     """Matches `op` against `self._pattern`.
diff --git a/tensorflow/contrib/quantize/python/graph_matcher_test.py b/tensorflow/contrib/quantize/python/graph_matcher_test.py
index e1572865e423e569ee3b280036c0e02b71b70648..6d587572181c125faa02d36fb54933cff24f11c6 100644
--- a/tensorflow/contrib/quantize/python/graph_matcher_test.py
+++ b/tensorflow/contrib/quantize/python/graph_matcher_test.py
@@ -105,7 +105,7 @@ class GraphMatcherTest(test_util.TensorFlowTestCase):
     self.assertEqual(match_result.get_op(y1_pattern), y1.op)
     self.assertEqual(match_result.get_tensor(y1_pattern), y1)
 
-  def test_oneof_pattern(self):
+  def test_oneof_type_pattern(self):
     #   -   +
     #  / \ / \
     # x   y   z
@@ -125,6 +125,44 @@ class GraphMatcherTest(test_util.TensorFlowTestCase):
         for match_result in matcher.match_graph(g)
     ], [plus.op, minus.op])
 
+  def test_oneof_pattern(self):
+    reshape_pattern = graph_matcher.OpTypePattern('Reshape')
+    transpose_pattern = graph_matcher.OneofPattern([
+        graph_matcher.OpTypePattern(
+            'Transpose',
+            name='transpose',
+            inputs=[
+                graph_matcher.OpTypePattern(
+                    'Slice', name='slice', inputs=[reshape_pattern, '*', '*']),
+                '*'
+            ]),
+        graph_matcher.OpTypePattern(
+            'Transpose', name='transpose', inputs=[reshape_pattern, '*'])
+    ])
+
+    matcher = graph_matcher.GraphMatcher(transpose_pattern)
+
+    g = ops.Graph()
+    with g.as_default():
+      inputs = array_ops.placeholder(dtypes.float32, shape=[6])
+      reshape = array_ops.reshape(inputs, [2, 3])
+      transpose = array_ops.transpose(reshape)
+      [match_result] = list(matcher.match_graph(g))
+      self.assertEqual(match_result.get_tensor(reshape_pattern), reshape)
+      self.assertEqual(match_result.get_tensor('slice'), None)
+      self.assertEqual(match_result.get_op('transpose'), transpose.op)
+
+    g = ops.Graph()
+    with g.as_default():
+      inputs = array_ops.placeholder(dtypes.float32, shape=[6])
+      reshape = array_ops.reshape(inputs, [2, 3])
+      slicing = array_ops.slice(reshape, [0, 0], [-1, -1])
+      transpose = array_ops.transpose(slicing)
+      [match_result] = list(matcher.match_graph(g))
+      self.assertEqual(match_result.get_tensor(reshape_pattern), reshape)
+      self.assertEqual(match_result.get_tensor('slice'), slicing)
+      self.assertEqual(match_result.get_op('transpose'), transpose.op)
+
 
 if __name__ == '__main__':
   googletest.main()
diff --git a/tensorflow/contrib/quantize/python/quantize_graph.py b/tensorflow/contrib/quantize/python/quantize_graph.py
index bbd9743d8014ce495a4967e7484981f7e60ae4a3..89b744c559170e7d9e502d3d8610afaca2c549b7 100644
--- a/tensorflow/contrib/quantize/python/quantize_graph.py
+++ b/tensorflow/contrib/quantize/python/quantize_graph.py
@@ -52,9 +52,19 @@ def _create_graph(input_graph,
   """
   # TODO(suharshs): Describe the process in more detail in the doc string.
   g = copy_graph.CopyGraph(input_graph)
+  if is_training:
+    # TODO(raghuramank): Need to make freeze_batch_norm_delay
+    # a function of the batch size. For now setting this to 250 epochs
+    # This corresponds to 5 million steps at a batch size of 64.
+    freeze_batch_norm_delay = 5000000
+  else:
+    freeze_batch_norm_delay = None
   with g.as_default():
     with ops.device(device_name_or_function):
-      fold_batch_norms.FoldBatchNorms(g)
+      fold_batch_norms.FoldBatchNorms(
+          g,
+          freeze_batch_norm_delay=freeze_batch_norm_delay,
+          is_training=is_training)
       quantize.Quantize(g, is_training=is_training)
   if elements is None:
     return g
diff --git a/tensorflow/contrib/receptive_field/python/util/parse_layer_parameters.py b/tensorflow/contrib/receptive_field/python/util/parse_layer_parameters.py
index 44998b3b6591221fde55d8d2d406d5141b1647f2..bc383a803496380aaba4d0248d2b7f93253b2b50 100644
--- a/tensorflow/contrib/receptive_field/python/util/parse_layer_parameters.py
+++ b/tensorflow/contrib/receptive_field/python/util/parse_layer_parameters.py
@@ -35,20 +35,34 @@ _VALID_PADDING = ["VALID", b"VALID"]
 _SAME_PADDING = ["SAME", b"SAME"]
 
 
-def _stride_size(node):
+def _stride_size(node, name_to_node):
   """Computes stride size given a TF node.
 
   Args:
     node: Tensorflow node (NodeDef proto).
+    name_to_node: For MaxPoolV2, mapping from variable name Tensorflow node.
 
   Returns:
     stride_x: Stride size for horizontal direction (integer).
     stride_y: Stride size for vertical direction (integer).
+
+  Raises:
+    ValueError: If stride input cannot be found in `name_to_node`.
   """
-  strides_attr = node.attr["strides"]
-  logging.vlog(4, "strides_attr = %s", strides_attr)
-  stride_y = strides_attr.list.i[1]
-  stride_x = strides_attr.list.i[2]
+  if node.op == "MaxPoolV2":
+    strides_input_name = node.input[2]
+    if not strides_input_name.endswith("/strides"):
+      raise ValueError("Strides name does not end with '/strides'")
+    strides_node = name_to_node[strides_input_name]
+    value = strides_node.attr["value"]
+    t = make_ndarray(value.tensor)
+    stride_y = t[1]
+    stride_x = t[2]
+  else:
+    strides_attr = node.attr["strides"]
+    logging.vlog(4, "strides_attr = %s", strides_attr)
+    stride_y = strides_attr.list.i[1]
+    stride_x = strides_attr.list.i[2]
   return stride_x, stride_y
 
 
@@ -144,11 +158,12 @@ def _padding_size_conv_pool(node, kernel_size, stride, input_resolution=None):
   return total_padding, padding
 
 
-def _pool_kernel_size(node):
+def _pool_kernel_size(node, name_to_node):
   """Computes kernel size given a TF pooling node.
 
   Args:
     node: Tensorflow node (NodeDef proto).
+    name_to_node: For MaxPoolV2, mapping from node name to NodeDef.
 
   Returns:
     kernel_size_x: Kernel size for horizontal direction (integer).
@@ -157,13 +172,27 @@ def _pool_kernel_size(node):
   Raises:
     ValueError: If pooling is invalid.
   """
-  ksize = node.attr["ksize"]
-  kernel_size_y = ksize.list.i[1]
-  kernel_size_x = ksize.list.i[2]
-  if ksize.list.i[0] != 1:
-    raise ValueError("pool ksize for first dim is not 1")
-  if ksize.list.i[3] != 1:
-    raise ValueError("pool ksize for last dim is not 1")
+  if node.op == "MaxPoolV2":
+    ksize_input_name = node.input[1]
+    if not ksize_input_name.endswith("/ksize"):
+      raise ValueError("Kernel size name does not end with '/ksize'")
+    ksize_node = name_to_node[ksize_input_name]
+    value = ksize_node.attr["value"]
+    t = make_ndarray(value.tensor)
+    kernel_size_y = t[1]
+    kernel_size_x = t[2]
+    if t[0] != 1:
+      raise ValueError("pool ksize for first dim is not 1")
+    if t[3] != 1:
+      raise ValueError("pool ksize for last dim is not 1")
+  else:
+    ksize = node.attr["ksize"]
+    kernel_size_y = ksize.list.i[1]
+    kernel_size_x = ksize.list.i[2]
+    if ksize.list.i[0] != 1:
+      raise ValueError("pool ksize for first dim is not 1")
+    if ksize.list.i[3] != 1:
+      raise ValueError("pool ksize for last dim is not 1")
   return kernel_size_x, kernel_size_y
 
 
@@ -243,7 +272,7 @@ def get_layer_params(node, name_to_node, input_resolution=None, force=False):
   logging.vlog(3, "node.op = %s", node.op)
   logging.vlog(4, "node = %s", node)
   if node.op == "Conv2D" or node.op == "DepthwiseConv2dNative":
-    stride_x, stride_y = _stride_size(node)
+    stride_x, stride_y = _stride_size(node, name_to_node)
     kernel_size_x, kernel_size_y = _conv_kernel_size(node, name_to_node)
     # Compute the padding for this node separately for each direction.
     total_padding_x, padding_x = _padding_size_conv_pool(
@@ -260,9 +289,9 @@ def get_layer_params(node, name_to_node, input_resolution=None, force=False):
     stride_y = 1
     total_padding_x, padding_x, total_padding_y, padding_y = (
         _padding_size_pad_layer(node, name_to_node))
-  elif node.op == "MaxPool" or node.op == "AvgPool":
-    stride_x, stride_y = _stride_size(node)
-    kernel_size_x, kernel_size_y = _pool_kernel_size(node)
+  elif node.op == "MaxPool" or node.op == "MaxPoolV2" or node.op == "AvgPool":
+    stride_x, stride_y = _stride_size(node, name_to_node)
+    kernel_size_x, kernel_size_y = _pool_kernel_size(node, name_to_node)
     # Compute the padding for this node separately for each direction.
     total_padding_x, padding_x = _padding_size_conv_pool(
         node, kernel_size_x, stride_x, input_resolution[1]
diff --git a/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops.cc b/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops.cc
index c33804906fc21cf2573b79091a76ab1ea86f5966..2def4f3f176b8d4d26c2c94168e9698f14649d94 100644
--- a/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops.cc
+++ b/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops.cc
@@ -15,8 +15,8 @@ limitations under the License.
 
 #define EIGEN_USE_THREADS
 
-#include <algorithm>
 #include "tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops.h"
+#include <algorithm>
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
diff --git a/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops.h b/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops.h
index 9bb1724a2c0b70ee7ce7238cc179aded95935b26..d8c0a0631d38e55ef9653e0e88e90604ec0f0329 100644
--- a/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops.h
+++ b/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops.h
@@ -16,10 +16,10 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_KERNELS_PARTIAL_REDUCTION_OPS_H_
 #define TENSORFLOW_CORE_KERNELS_PARTIAL_REDUCTION_OPS_H_
 
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_types.h"
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
 #define Sum(a, b) ((a) + (b))
 #define Prod(a, b) ((a) * (b))
@@ -58,11 +58,11 @@ inline T negative_infinity() {
 
 }  // namespace reduce_functions
 
-#define CALL_ALL_REDUCEOPS(func, ...)                                          \
-  func(Sum, functor::reduce_functions::zero, ##__VA_ARGS__)                    \
-  func(Prod, functor::reduce_functions::one, ##__VA_ARGS__)                    \
-  func(Max, functor::reduce_functions::negative_infinity, ##__VA_ARGS__)       \
-  func(Min, functor::reduce_functions::infinity, ##__VA_ARGS__)
+#define CALL_ALL_REDUCEOPS(func, ...)                                       \
+  func(Sum, functor::reduce_functions::zero, ##__VA_ARGS__)                 \
+      func(Prod, functor::reduce_functions::one, ##__VA_ARGS__) func(       \
+          Max, functor::reduce_functions::negative_infinity, ##__VA_ARGS__) \
+          func(Min, functor::reduce_functions::infinity, ##__VA_ARGS__)
 
 #define ReduceSliceFunctorReduceop(reduceop, dummy)                         \
   template <typename Device, typename T, typename Index>                    \
diff --git a/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops_gpu.cu.cc b/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops_gpu.cu.cc
index 8e6870fadd428ae8a1937a5c0cb43b6763f6be28..9f2be03d718364058da6b63add8752c046798c5b 100644
--- a/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops_gpu.cu.cc
+++ b/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops_gpu.cu.cc
@@ -17,10 +17,10 @@ limitations under the License.
 
 #define EIGEN_USE_GPU
 
+#include "tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
-#include "tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops.h"
 #include "tensorflow/core/util/cuda_kernel_helper.h"
 
 namespace tensorflow {
@@ -34,9 +34,9 @@ namespace functor {
   __global__ void ReduceSliceDeviceKernel##reduceop(                           \
       Cuda3DLaunchConfig config, Index indices_width, Index bound,             \
       const T begin, const Index *indices, const T *input, T *out) {           \
-    CUDA_AXIS_KERNEL_LOOP(x, config.virtual_thread_count, x) {                 \
-      CUDA_AXIS_KERNEL_LOOP(y, config.virtual_thread_count, y) {               \
-        CUDA_AXIS_KERNEL_LOOP(z, config.virtual_thread_count, z) {             \
+    CUDA_AXIS_KERNEL_LOOP(x, config.virtual_thread_count.x, X) {               \
+      CUDA_AXIS_KERNEL_LOOP(y, config.virtual_thread_count.y, Y) {             \
+        CUDA_AXIS_KERNEL_LOOP(z, config.virtual_thread_count.z, Z) {           \
           Index outidx = x * config.virtual_thread_count.y *                   \
                              config.virtual_thread_count.z +                   \
                          y * config.virtual_thread_count.z + z;                \
@@ -68,8 +68,9 @@ namespace functor {
       if (sizex * sizey * sizez == 0) {                                        \
         return;                                                                \
       }                                                                        \
-      Cuda3DLaunchConfig config = GetCuda3DLaunchConfig(sizex, sizey, sizez, d,\
-          ReduceSliceDeviceKernel##reduceop<T, Index>, 0, 0);                  \
+      Cuda3DLaunchConfig config = GetCuda3DLaunchConfig(                       \
+          sizex, sizey, sizez, d, ReduceSliceDeviceKernel##reduceop<T, Index>, \
+          0, 0);                                                               \
                                                                                \
       ReduceSliceDeviceKernel##reduceop<T, Index>                              \
           <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(    \
diff --git a/tensorflow/contrib/reduce_slice_ops/ops/reduce_slice_ops.cc b/tensorflow/contrib/reduce_slice_ops/ops/reduce_slice_ops.cc
index b8b56c0e229563a4e9bc930512c9fe49bd636e31..92879ab5356623dfa82fce8dff8db4d3036ae46c 100644
--- a/tensorflow/contrib/reduce_slice_ops/ops/reduce_slice_ops.cc
+++ b/tensorflow/contrib/reduce_slice_ops/ops/reduce_slice_ops.cc
@@ -87,9 +87,9 @@ and 'indices' is [[0,1]
                   [1,1]
                   [0,2]],
 
-the the output will be [[ 1, 2, 3]
-                        [ 0, 0, 0]
-                        [41,52,63]].
+the output will be [[ 1, 2, 3]
+                    [ 0, 0, 0]
+                    [41,52,63]].
 ```
 
 The data must be at least rank 1. The indices must be of shape (?,2) where the
@@ -132,9 +132,9 @@ and 'indices' is [[0,1]
                   [1,1]
                   [0,2]],
 
-the the output will be [[ 1,  2,  3]
-                        [ 1,  1,  1]
-                        [40,100,180]].
+the output will be [[ 1,  2,  3]
+                    [ 1,  1,  1]
+                    [40,100,180]].
 ```
 
 The data must be at least rank 1. The indices can be of shape (?,2) where the
@@ -189,9 +189,9 @@ and 'indices' is [[0,1]
                   [1,1]
                   [0,2]],
 
-the the output will be [[          1,         20,          3]
-                        [ -BIG_VALUE, -BIG_VALUE, -BIG_VALUE]
-                        [        400,         20,         60]].
+the output will be [[          1,         20,          3]
+                    [ -BIG_VALUE, -BIG_VALUE, -BIG_VALUE]
+                    [        400,         20,         60]].
 ```
 
 The data must be at least rank 1. The indices can be of shape (?,2) where the
@@ -246,9 +246,9 @@ and 'indices' is [[0,1]
                   [1,1]
                   [0,2]],
 
-the the output will be [[          1,         20,          3]
-                        [ +BIG_VALUE, +BIG_VALUE, +BIG_VALUE]
-                        [          1,          5,          3]].
+the output will be [[          1,         20,          3]
+                    [ +BIG_VALUE, +BIG_VALUE, +BIG_VALUE]
+                    [          1,          5,          3]].
 ```
 
 The data must be at least rank 1. The indices can be of shape (?,2) where the
diff --git a/tensorflow/contrib/resampler/kernels/resampler_ops.cc b/tensorflow/contrib/resampler/kernels/resampler_ops.cc
index e02c1b6a2bd9daf9e1f81059f7c1f92106cebc8f..63c72836d793a3df4e96a0134f3a1534c288c8c8 100644
--- a/tensorflow/contrib/resampler/kernels/resampler_ops.cc
+++ b/tensorflow/contrib/resampler/kernels/resampler_ops.cc
@@ -36,17 +36,12 @@ using GPUDevice = Eigen::GpuDevice;
 namespace functor {
 
 template <typename T>
-struct Resampler2DFunctor<CPUDevice, T>{
-  void operator ()(::tensorflow::OpKernelContext* ctx,
-                   const CPUDevice& d,
-                   const T* __restrict__ data,
-                   const T* __restrict__ warp,
-                   T* __restrict__ output,
-                   const int batch_size,
-                   const int data_height,
-                   const int data_width,
-                   const int data_channels,
-                   const int num_sampling_points){
+struct Resampler2DFunctor<CPUDevice, T> {
+  void operator()(::tensorflow::OpKernelContext* ctx, const CPUDevice& d,
+                  const T* __restrict__ data, const T* __restrict__ warp,
+                  T* __restrict__ output, const int batch_size,
+                  const int data_height, const int data_width,
+                  const int data_channels, const int num_sampling_points) {
     const int warp_batch_stride = num_sampling_points * 2;
     const int data_batch_stride = data_height * data_width * data_channels;
     const int output_batch_stride = num_sampling_points * data_channels;
@@ -59,24 +54,19 @@ struct Resampler2DFunctor<CPUDevice, T>{
         // The functions take care of performing the relevant pointer
         // arithmetics abstracting away the low level details in the
         // main loop over samples. Note that data is stored in NHWC format.
-        auto set_output = [&](const int sample_id,
-                              const int channel,
+        auto set_output = [&](const int sample_id, const int channel,
                               const T value) {
-          output[batch_id * output_batch_stride +
-                 sample_id * data_channels +
+          output[batch_id * output_batch_stride + sample_id * data_channels +
                  channel] = value;
         };
 
-        auto get_data_point = [&](const int x,
-                                  const int y,
-                                  const int chan) {
+        auto get_data_point = [&](const int x, const int y, const int chan) {
           const bool point_is_in_range =
               (x >= 0 && y >= 0 && x <= data_width - 1 && y <= data_height - 1);
           return point_is_in_range
-                 ? data[batch_id * data_batch_stride +
-                        data_channels * (y * data_width + x) +
-                        chan]
-                 : zero;
+                     ? data[batch_id * data_batch_stride +
+                            data_channels * (y * data_width + x) + chan]
+                     : zero;
         };
 
         for (int sample_id = 0; sample_id < num_sampling_points; ++sample_id) {
@@ -89,8 +79,7 @@ struct Resampler2DFunctor<CPUDevice, T>{
           // The effect is that the sampled signal smoothly goes to 0 outside
           // the original input domain, rather than presenting a jump
           // discontinuity at the image boundaries.
-          if (x > static_cast<T>(-1.0) &&
-              y > static_cast<T>(-1.0) &&
+          if (x > static_cast<T>(-1.0) && y > static_cast<T>(-1.0) &&
               x < static_cast<T>(data_width) &&
               y < static_cast<T>(data_height)) {
             // Precompute floor (f) and ceil (c) values for x and y.
@@ -103,12 +92,10 @@ struct Resampler2DFunctor<CPUDevice, T>{
 
             for (int chan = 0; chan < data_channels; ++chan) {
               const T img_fxfy = dx * dy * get_data_point(fx, fy, chan);
-              const T img_cxcy = (one - dx) * (one - dy) *
-                                   get_data_point(cx, cy, chan);
-              const T img_fxcy = dx * (one - dy) *
-                                   get_data_point(fx, cy, chan);
-              const T img_cxfy = (one - dx) * dy *
-                                   get_data_point(cx, fy, chan);
+              const T img_cxcy =
+                  (one - dx) * (one - dy) * get_data_point(cx, cy, chan);
+              const T img_fxcy = dx * (one - dy) * get_data_point(fx, cy, chan);
+              const T img_cxfy = (one - dx) * dy * get_data_point(cx, fy, chan);
               set_output(sample_id, chan,
                          img_fxfy + img_cxcy + img_fxcy + img_cxfy);
             }
@@ -125,8 +112,8 @@ struct Resampler2DFunctor<CPUDevice, T>{
     // estimate of the cost of each work unit is needed to correctly shard the
     // workload. Shard assumes each cost unit is 1ns, minimum cost per shard
     // being 10us.
-    const int64 cost =  static_cast<int64>(num_sampling_points) *
-        data_channels * 1000;
+    const int64 cost =
+        static_cast<int64>(num_sampling_points) * data_channels * 1000;
     auto worker_threads = *(ctx->device()->tensorflow_cpu_worker_threads());
     ::tensorflow::Shard(worker_threads.num_threads, worker_threads.workers,
                         batch_size, cost, resample_batches);
@@ -138,8 +125,8 @@ struct Resampler2DFunctor<CPUDevice, T>{
 template <typename Device, typename T>
 class ResamplerOp : public ::tensorflow::OpKernel {
  public:
-  explicit ResamplerOp(::tensorflow::OpKernelConstruction* context) :
-      ::tensorflow::OpKernel(context) {}
+  explicit ResamplerOp(::tensorflow::OpKernelConstruction* context)
+      : ::tensorflow::OpKernel(context) {}
 
   void Compute(::tensorflow::OpKernelContext* ctx) override {
     const ::tensorflow::Tensor& data = ctx->input(0);
@@ -158,16 +145,17 @@ class ResamplerOp : public ::tensorflow::OpKernel {
                 ::tensorflow::errors::InvalidArgument(
                     "warp should be at least a matrix, got shape ",
                     warp_shape.DebugString()));
-    OP_REQUIRES(ctx, warp_shape.dim_size(warp_shape.dims()-1) == 2,
+    OP_REQUIRES(ctx, warp_shape.dim_size(warp_shape.dims() - 1) == 2,
                 ::tensorflow::errors::Unimplemented(
                     "Only bilinear interpolation is supported, warping "
                     "coordinates must be 2D; warp shape last entry should be "
-                    "2, but shape vector is: ", warp_shape.DebugString()));
+                    "2, but shape vector is: ",
+                    warp_shape.DebugString()));
     OP_REQUIRES(ctx, data_shape.dim_size(0) == warp_shape.dim_size(0),
                 ::tensorflow::errors::InvalidArgument(
                     "Batch size of data and warp tensor must be the same, but "
-                    "input shapes are: ", data_shape.DebugString(), ", ",
-                    warp_shape.DebugString()));
+                    "input shapes are: ",
+                    data_shape.DebugString(), ", ", warp_shape.DebugString()));
     const int batch_size = data_shape.dim_size(0);
     const int data_height = data_shape.dim_size(1);
     const int data_width = data_shape.dim_size(2);
@@ -180,16 +168,10 @@ class ResamplerOp : public ::tensorflow::OpKernel {
 
     // Execute kernel only for nonempty output; otherwise Eigen crashes on GPU.
     if (num_sampling_points > 0) {
-      functor::Resampler2DFunctor<Device, T>()(ctx,
-                                               ctx->eigen_device<Device>(),
-                                               data.flat<T>().data(),
-                                               warp.flat<T>().data(),
-                                               output->flat<T>().data(),
-                                               batch_size,
-                                               data_height,
-                                               data_width,
-                                               data_channels,
-                                               num_sampling_points);
+      functor::Resampler2DFunctor<Device, T>()(
+          ctx, ctx->eigen_device<Device>(), data.flat<T>().data(),
+          warp.flat<T>().data(), output->flat<T>().data(), batch_size,
+          data_height, data_width, data_channels, num_sampling_points);
     }
   }
 
@@ -197,12 +179,9 @@ class ResamplerOp : public ::tensorflow::OpKernel {
   TF_DISALLOW_COPY_AND_ASSIGN(ResamplerOp);
 };
 
-
-#define REGISTER(TYPE)                       \
-  REGISTER_KERNEL_BUILDER(                   \
-      Name("Resampler")                      \
-          .Device(DEVICE_CPU)  \
-          .TypeConstraint<TYPE>("T"),        \
+#define REGISTER(TYPE)                                                \
+  REGISTER_KERNEL_BUILDER(                                            \
+      Name("Resampler").Device(DEVICE_CPU).TypeConstraint<TYPE>("T"), \
       ResamplerOp<CPUDevice, TYPE>);
 
 TF_CALL_half(REGISTER);
@@ -211,40 +190,32 @@ TF_CALL_double(REGISTER);
 #undef REGISTER
 
 #if GOOGLE_CUDA
-#define REGISTER(TYPE)                                           \
-  REGISTER_KERNEL_BUILDER(Name("Resampler")                      \
-                              .Device(DEVICE_GPU)  \
-                              .TypeConstraint<TYPE>("T"),        \
-                          ResamplerOp<GPUDevice, TYPE>)
+#define REGISTER(TYPE)                                                \
+  REGISTER_KERNEL_BUILDER(                                            \
+      Name("Resampler").Device(DEVICE_GPU).TypeConstraint<TYPE>("T"), \
+      ResamplerOp<GPUDevice, TYPE>)
 TF_CALL_float(REGISTER);
 TF_CALL_double(REGISTER);
 #undef REGISTER
 #endif  // GOOGLE_CUDA
 
-
 namespace functor {
 
 template <typename T>
-struct ResamplerGrad2DFunctor<CPUDevice, T>{
-  void operator ()(::tensorflow::OpKernelContext* ctx,
-                   const CPUDevice& d,
-                   const T* __restrict__ data,
-                   const T* __restrict__ warp,
-                   const T* __restrict__ grad_output,
-                   T* __restrict__ grad_data,
-                   T* __restrict__ grad_warp,
-                   const int batch_size,
-                   const int data_height,
-                   const int data_width,
-                   const int data_channels,
-                   const int num_sampling_points){
+struct ResamplerGrad2DFunctor<CPUDevice, T> {
+  void operator()(::tensorflow::OpKernelContext* ctx, const CPUDevice& d,
+                  const T* __restrict__ data, const T* __restrict__ warp,
+                  const T* __restrict__ grad_output, T* __restrict__ grad_data,
+                  T* __restrict__ grad_warp, const int batch_size,
+                  const int data_height, const int data_width,
+                  const int data_channels, const int num_sampling_points) {
     // Set gradients to 0, because the kernel incrementally updates the
     // tensor entries by adding partial contributions.
-    const int resampler_output_size = batch_size * num_sampling_points *
-        data_channels;
+    const int resampler_output_size =
+        batch_size * num_sampling_points * data_channels;
     const int grad_warp_size = resampler_output_size / data_channels * 2;
-    const int grad_data_size = data_height * data_width * data_channels *
-        batch_size;
+    const int grad_data_size =
+        data_height * data_width * data_channels * batch_size;
     memset(grad_data, 0, sizeof(T) * grad_data_size);
     memset(grad_warp, 0, sizeof(T) * grad_warp_size);
 
@@ -260,35 +231,29 @@ struct ResamplerGrad2DFunctor<CPUDevice, T>{
         // The functions take care of performing the relevant pointer
         // arithmetics abstracting away the low level details in the
         // main loop over samples. Note that data is stored in NHWC format.
-        auto get_data_point = [&](const int x,
-                                  const int y,
-                                  const int chan) {
+        auto get_data_point = [&](const int x, const int y, const int chan) {
           const bool point_is_in_range =
-            (x >= 0 && y >= 0 && x <= data_width - 1 && y <= data_height - 1);
+              (x >= 0 && y >= 0 && x <= data_width - 1 && y <= data_height - 1);
           return point_is_in_range
-                 ? data[batch_id * data_batch_stride +
-                        data_channels * (y * data_width + x) +
-                        chan]
-                 : zero;
+                     ? data[batch_id * data_batch_stride +
+                            data_channels * (y * data_width + x) + chan]
+                     : zero;
         };
 
         auto update_grad_data = [&](const int x, const int y, const int chan,
                                     const T value) {
           const bool point_is_in_range =
               (x >= 0 && y >= 0 && x <= data_width - 1 && y <= data_height - 1);
-          if (point_is_in_range){
+          if (point_is_in_range) {
             grad_data[batch_id * data_batch_stride +
-                      data_channels * (y * data_width + x) +
-                      chan] += value;
+                      data_channels * (y * data_width + x) + chan] += value;
           }
         };
 
-        auto update_grad_warp = [&](const int sample_id,
-                                    const int channel,
+        auto update_grad_warp = [&](const int sample_id, const int channel,
                                     const T value) {
-          grad_warp[batch_id * warp_batch_stride +
-                    sample_id * 2 +
-                    channel] += value;
+          grad_warp[batch_id * warp_batch_stride + sample_id * 2 + channel] +=
+              value;
         };
 
         for (int sample_id = 0; sample_id < num_sampling_points; ++sample_id) {
@@ -301,8 +266,7 @@ struct ResamplerGrad2DFunctor<CPUDevice, T>{
           // The effect is that the sampled signal smoothly goes to 0 outside
           // the original input domain, rather than presenting a jump
           // discontinuity at the image boundaries.
-          if (x > static_cast<T>(-1.0) &&
-              y > static_cast<T>(-1.0) &&
+          if (x > static_cast<T>(-1.0) && y > static_cast<T>(-1.0) &&
               x < static_cast<T>(data_width) &&
               y < static_cast<T>(data_height)) {
             // Precompute floor (f) and ceil (c) values for x and y.
@@ -316,27 +280,25 @@ struct ResamplerGrad2DFunctor<CPUDevice, T>{
             for (int chan = 0; chan < data_channels; ++chan) {
               const T grad_output_value =
                   grad_output[batch_id * output_batch_stride +
-                              sample_id * data_channels +
-                              chan];
+                              sample_id * data_channels + chan];
               const T img_fxfy = get_data_point(fx, fy, chan);
               const T img_cxcy = get_data_point(cx, cy, chan);
               const T img_fxcy = get_data_point(fx, cy, chan);
               const T img_cxfy = get_data_point(cx, fy, chan);
 
               // Update partial gradients wrt relevant warp field entries
-              update_grad_warp(sample_id, 0,
-                               grad_output_value *
-                                   ((one - dy) * (img_cxcy - img_fxcy) +
-                                    dy * (img_cxfy - img_fxfy)));
+              update_grad_warp(
+                  sample_id, 0,
+                  grad_output_value * ((one - dy) * (img_cxcy - img_fxcy) +
+                                       dy * (img_cxfy - img_fxfy)));
 
-              update_grad_warp(sample_id, 1,
-                               grad_output_value *
-                                   ((one - dx) * (img_cxcy - img_cxfy) +
-                                    dx * (img_fxcy - img_fxfy)));
+              update_grad_warp(
+                  sample_id, 1,
+                  grad_output_value * ((one - dx) * (img_cxcy - img_cxfy) +
+                                       dx * (img_fxcy - img_fxfy)));
 
               // Update partial gradients wrt sampled data
-              update_grad_data(fx, fy, chan,
-                               grad_output_value * dx * dy);
+              update_grad_data(fx, fy, chan, grad_output_value * dx * dy);
               update_grad_data(cx, cy, chan,
                                grad_output_value * (one - dx) * (one - dy));
               update_grad_data(fx, cy, chan,
@@ -355,8 +317,8 @@ struct ResamplerGrad2DFunctor<CPUDevice, T>{
     // being 10us.
     // TODO(fviola): Check out if there is a better way of doing this.
     auto worker_threads = *(ctx->device()->tensorflow_cpu_worker_threads());
-    const int64 cost =  static_cast<int64>(num_sampling_points) *
-        data_channels * 1000;
+    const int64 cost =
+        static_cast<int64>(num_sampling_points) * data_channels * 1000;
     ::tensorflow::Shard(worker_threads.num_threads, worker_threads.workers,
                         batch_size, cost, update_grads_for_batches);
   }
@@ -364,12 +326,11 @@ struct ResamplerGrad2DFunctor<CPUDevice, T>{
 
 }  // namespace functor
 
-
 template <typename Device, typename T>
 class ResamplerGradOp : public ::tensorflow::OpKernel {
  public:
-  explicit ResamplerGradOp(::tensorflow::OpKernelConstruction* context) :
-      ::tensorflow::OpKernel(context) {}
+  explicit ResamplerGradOp(::tensorflow::OpKernelConstruction* context)
+      : ::tensorflow::OpKernel(context) {}
 
   void Compute(::tensorflow::OpKernelContext* ctx) override {
     const ::tensorflow::Tensor& data = ctx->input(0);
@@ -383,7 +344,7 @@ class ResamplerGradOp : public ::tensorflow::OpKernel {
                     "tensor must be a batch of 2d data; data shape should have "
                     "4 entries corresponding to [batch_size, data_height, "
                     "data_width, data_channels], but is: ",
-                data_shape.DebugString()));
+                    data_shape.DebugString()));
     const int batch_size = data_shape.dim_size(0);
     const int data_height = data_shape.dim_size(1);
     const int data_width = data_shape.dim_size(2);
@@ -394,7 +355,7 @@ class ResamplerGradOp : public ::tensorflow::OpKernel {
                 ::tensorflow::errors::InvalidArgument(
                     "warp should be at least a matrix, got shape ",
                     warp_shape.DebugString()));
-    OP_REQUIRES(ctx, warp_shape.dim_size(warp_shape.dims()-1) == 2,
+    OP_REQUIRES(ctx, warp_shape.dim_size(warp_shape.dims() - 1) == 2,
                 ::tensorflow::errors::Unimplemented(
                     "Only bilinear interpolation is supported, warping "
                     "coordinates must be 2D; warp shape last entry should be "
@@ -417,18 +378,11 @@ class ResamplerGradOp : public ::tensorflow::OpKernel {
     OP_REQUIRES_OK(ctx, ctx->allocate_output(1, warp.shape(), &grad_warp));
     // Execute kernel only for nonempty output; otherwise Eigen crashes on GPU.
     if (num_sampling_points > 0) {
-      functor::ResamplerGrad2DFunctor<Device, T>()(ctx,
-                                                   ctx->eigen_device<Device>(),
-                                                   data.flat<T>().data(),
-                                                   warp.flat<T>().data(),
-                                                   grad_output.flat<T>().data(),
-                                                   grad_data->flat<T>().data(),
-                                                   grad_warp->flat<T>().data(),
-                                                   batch_size,
-                                                   data_height,
-                                                   data_width,
-                                                   data_channels,
-                                                   num_sampling_points);
+      functor::ResamplerGrad2DFunctor<Device, T>()(
+          ctx, ctx->eigen_device<Device>(), data.flat<T>().data(),
+          warp.flat<T>().data(), grad_output.flat<T>().data(),
+          grad_data->flat<T>().data(), grad_warp->flat<T>().data(), batch_size,
+          data_height, data_width, data_channels, num_sampling_points);
     }
   }
 
@@ -436,11 +390,9 @@ class ResamplerGradOp : public ::tensorflow::OpKernel {
   TF_DISALLOW_COPY_AND_ASSIGN(ResamplerGradOp);
 };
 
-#define REGISTER(TYPE)                       \
-  REGISTER_KERNEL_BUILDER(                   \
-      Name("ResamplerGrad")                  \
-          .Device(DEVICE_CPU)  \
-          .TypeConstraint<TYPE>("T"),        \
+#define REGISTER(TYPE)                                                    \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("ResamplerGrad").Device(DEVICE_CPU).TypeConstraint<TYPE>("T"), \
       ResamplerGradOp<CPUDevice, TYPE>);
 
 TF_CALL_half(REGISTER);
@@ -449,11 +401,10 @@ TF_CALL_double(REGISTER);
 #undef REGISTER
 
 #if GOOGLE_CUDA
-#define REGISTER(TYPE)                                           \
-  REGISTER_KERNEL_BUILDER(Name("ResamplerGrad")                  \
-                              .Device(DEVICE_GPU)  \
-                              .TypeConstraint<TYPE>("T"),        \
-                          ResamplerGradOp<GPUDevice, TYPE>)
+#define REGISTER(TYPE)                                                    \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("ResamplerGrad").Device(DEVICE_GPU).TypeConstraint<TYPE>("T"), \
+      ResamplerGradOp<GPUDevice, TYPE>)
 // Disable half and double precision since atomicAdds are not supported
 // TF_CALL_half(REGISTER);
 // TF_CALL_double(REGISTER);
diff --git a/tensorflow/contrib/resampler/kernels/resampler_ops.h b/tensorflow/contrib/resampler/kernels/resampler_ops.h
index 85d3676efac70fe9237d31c2be1fe75e67d70abd..7fe3b9c0df71f51e07d38ea15a672d79fdc70453 100644
--- a/tensorflow/contrib/resampler/kernels/resampler_ops.h
+++ b/tensorflow/contrib/resampler/kernels/resampler_ops.h
@@ -29,38 +29,25 @@ namespace functor {
 
 // Helper functor for the Resampler Op in 2D
 template <typename Device, typename T>
-struct Resampler2DFunctor{
-  void operator ()(::tensorflow::OpKernelContext* ctx,
-                   const Device& d,
-                   const T* __restrict__ data,
-                   const T* __restrict__ warp,
-                   T* __restrict__ output,
-                   const int batch_size,
-                   const int data_height,
-                   const int data_width,
-                   const int data_channels,
-                   const int num_sampling_points);
+struct Resampler2DFunctor {
+  void operator()(::tensorflow::OpKernelContext* ctx, const Device& d,
+                  const T* __restrict__ data, const T* __restrict__ warp,
+                  T* __restrict__ output, const int batch_size,
+                  const int data_height, const int data_width,
+                  const int data_channels, const int num_sampling_points);
 };
 
-
 // Helper functor for the Resampler Gradient Op in 2D
 template <typename Device, typename T>
-struct ResamplerGrad2DFunctor{
-  void operator ()(::tensorflow::OpKernelContext* ctx,
-                   const Device& d,
-                   const T* __restrict__ data,
-                   const T* __restrict__ warp,
-                   const T* __restrict__ grad_output,
-                   T* __restrict__ grad_data,
-                   T* __restrict__ grad_warp,
-                   const int batch_size,
-                   const int data_height,
-                   const int data_width,
-                   const int data_channels,
-                   const int num_sampling_points);
+struct ResamplerGrad2DFunctor {
+  void operator()(::tensorflow::OpKernelContext* ctx, const Device& d,
+                  const T* __restrict__ data, const T* __restrict__ warp,
+                  const T* __restrict__ grad_output, T* __restrict__ grad_data,
+                  T* __restrict__ grad_warp, const int batch_size,
+                  const int data_height, const int data_width,
+                  const int data_channels, const int num_sampling_points);
 };
 
-
 }  // namespace functor
 }  // namespace tensorflow
 
diff --git a/tensorflow/contrib/resampler/kernels/resampler_ops_gpu.cu.cc b/tensorflow/contrib/resampler/kernels/resampler_ops_gpu.cu.cc
index 636847a212f27c738032128e3f3f653ec32f851b..3c07051f685c74b6e45fb782c80871f38dffbbf4 100644
--- a/tensorflow/contrib/resampler/kernels/resampler_ops_gpu.cu.cc
+++ b/tensorflow/contrib/resampler/kernels/resampler_ops_gpu.cu.cc
@@ -31,18 +31,15 @@ using GPUDevice = Eigen::GpuDevice;
 
 namespace {
 
-#define GET_DATA_POINT(x, y)                   \
-  data[batch_id * data_batch_stride +          \
-       data_channels * (y * data_width + x) +  \
+#define GET_DATA_POINT(x, y)                                                 \
+  data[batch_id * data_batch_stride + data_channels * (y * data_width + x) + \
        chan]
 
 template <typename T>
 __global__ void Resampler2DKernel(const T* __restrict__ data,
                                   const T* __restrict__ warp,
-                                  T* __restrict__ output,
-                                  const int batch_size,
-                                  const int data_height,
-                                  const int data_width,
+                                  T* __restrict__ output, const int batch_size,
+                                  const int data_height, const int data_width,
                                   const int data_channels,
                                   const int num_sampling_points) {
   const int output_data_size = batch_size * num_sampling_points * data_channels;
@@ -75,10 +72,8 @@ __global__ void Resampler2DKernel(const T* __restrict__ data,
     // The effect is that the sampled signal smoothly goes to 0 outside
     // the original input domain, rather than presenting a jump
     // discontinuity at the image boundaries.
-    if (x > static_cast<T>(-1.0) &&
-        y > static_cast<T>(-1.0) &&
-        x < static_cast<T>(data_width) &&
-        y < static_cast<T>(data_height)) {
+    if (x > static_cast<T>(-1.0) && y > static_cast<T>(-1.0) &&
+        x < static_cast<T>(data_width) && y < static_cast<T>(data_height)) {
       // Precompute floor (f) and ceil (c) values for x and y.
       const int fx = std::floor(static_cast<float>(x));
       const int fy = std::floor(static_cast<float>(y));
@@ -87,21 +82,20 @@ __global__ void Resampler2DKernel(const T* __restrict__ data,
       const T dx = static_cast<T>(cx) - x;
       const T dy = static_cast<T>(cy) - y;
 
-      const T img_fxfy = (fx >= 0 && fy >= 0)
-                         ? dx * dy * GET_DATA_POINT(fx, fy)
-                         : zero;
+      const T img_fxfy =
+          (fx >= 0 && fy >= 0) ? dx * dy * GET_DATA_POINT(fx, fy) : zero;
 
       const T img_cxcy = (cx <= data_width - 1 && cy <= data_height - 1)
-                         ? (one - dx) * (one - dy) * GET_DATA_POINT(cx, cy)
-                         : zero;
+                             ? (one - dx) * (one - dy) * GET_DATA_POINT(cx, cy)
+                             : zero;
 
       const T img_fxcy = (fx >= 0 && cy <= data_height - 1)
-                         ? dx * (one - dy) * GET_DATA_POINT(fx, cy)
-                         : zero;
+                             ? dx * (one - dy) * GET_DATA_POINT(fx, cy)
+                             : zero;
 
       const T img_cxfy = (cx <= data_width - 1 && fy >= 0)
-                         ? (one - dx) * dy * GET_DATA_POINT(cx, fy)
-                         : zero;
+                             ? (one - dx) * dy * GET_DATA_POINT(cx, fy)
+                             : zero;
 
       output[out_index] = img_fxfy + img_cxcy + img_fxcy + img_cxfy;
     } else {
@@ -115,24 +109,20 @@ __global__ void Resampler2DKernel(const T* __restrict__ data,
 namespace functor {
 
 template <typename T>
-struct Resampler2DFunctor<GPUDevice, T>{
-  void operator ()(::tensorflow::OpKernelContext* ctx,
-                   const GPUDevice& d,
-                   const T* __restrict__ data,
-                   const T* __restrict__ warp,
-                   T* __restrict__ output,
-                   const int batch_size,
-                   const int data_height,
-                   const int data_width,
-                   const int data_channels,
-                   const int num_sampling_points) {
-  const int output_data_size = batch_size * num_sampling_points * data_channels;
-  ::tensorflow::CudaLaunchConfig config =
-      ::tensorflow::GetCudaLaunchConfig(output_data_size, d);
-  Resampler2DKernel<T>
-      <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-          data, warp, output, batch_size, data_height, data_width,
-          data_channels, num_sampling_points);
+struct Resampler2DFunctor<GPUDevice, T> {
+  void operator()(::tensorflow::OpKernelContext* ctx, const GPUDevice& d,
+                  const T* __restrict__ data, const T* __restrict__ warp,
+                  T* __restrict__ output, const int batch_size,
+                  const int data_height, const int data_width,
+                  const int data_channels, const int num_sampling_points) {
+    const int output_data_size =
+        batch_size * num_sampling_points * data_channels;
+    ::tensorflow::CudaLaunchConfig config =
+        ::tensorflow::GetCudaLaunchConfig(output_data_size, d);
+    Resampler2DKernel<T>
+        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+            data, warp, output, batch_size, data_height, data_width,
+            data_channels, num_sampling_points);
   }
 };
 
@@ -145,26 +135,20 @@ template struct Resampler2DFunctor<GPUDevice, double>;
 
 namespace {
 
-#define UPDATE_GRAD_DATA_POINT(x, y, v)                  \
-  atomicAdd(grad_data + (batch_id * data_batch_stride +  \
-            data_channels * (y * data_width + x) +       \
-            chan),                                       \
+#define UPDATE_GRAD_DATA_POINT(x, y, v)                                \
+  atomicAdd(grad_data + (batch_id * data_batch_stride +                \
+                         data_channels * (y * data_width + x) + chan), \
             v)
 
-
 template <typename T>
-__global__ void ResamplerGrad2DKernel(const T* __restrict__ data,
-                                      const T* __restrict__ warp,
-                                      const T* __restrict__ grad_output,
-                                      T* __restrict__ grad_data,
-                                      T* __restrict__ grad_warp,
-                                      const int batch_size,
-                                      const int data_height,
-                                      const int data_width,
-                                      const int data_channels,
-                                      const int num_sampling_points) {
-  const int resampler_output_size = batch_size * num_sampling_points *
-      data_channels;
+__global__ void ResamplerGrad2DKernel(
+    const T* __restrict__ data, const T* __restrict__ warp,
+    const T* __restrict__ grad_output, T* __restrict__ grad_data,
+    T* __restrict__ grad_warp, const int batch_size, const int data_height,
+    const int data_width, const int data_channels,
+    const int num_sampling_points) {
+  const int resampler_output_size =
+      batch_size * num_sampling_points * data_channels;
   CUDA_1D_KERNEL_LOOP(index, resampler_output_size) {
     const int out_index = index;
 
@@ -199,10 +183,8 @@ __global__ void ResamplerGrad2DKernel(const T* __restrict__ data,
     // The effect is that the sampled signal smoothly goes to 0 outside
     // the original input domain, rather than presenting a jump
     // discontinuity at the image boundaries.
-    if (x > static_cast<T>(-1.0) &&
-        y > static_cast<T>(-1.0) &&
-        x < static_cast<T>(data_width) &&
-        y < static_cast<T>(data_height)) {
+    if (x > static_cast<T>(-1.0) && y > static_cast<T>(-1.0) &&
+        x < static_cast<T>(data_width) && y < static_cast<T>(data_height)) {
       // Precompute floor (f) and ceil (c) values for x and y.
       const int fx = std::floor(static_cast<float>(x));
       const int fy = std::floor(static_cast<float>(y));
@@ -211,21 +193,17 @@ __global__ void ResamplerGrad2DKernel(const T* __restrict__ data,
       const T dx = static_cast<T>(cx) - x;
       const T dy = static_cast<T>(cy) - y;
 
-      const T img_fxfy = (fx >= 0 && fy >= 0)
-                         ? GET_DATA_POINT(fx, fy)
-                         : zero;
+      const T img_fxfy = (fx >= 0 && fy >= 0) ? GET_DATA_POINT(fx, fy) : zero;
 
       const T img_cxcy = (cx <= data_width - 1 && cy <= data_height - 1)
-                         ? GET_DATA_POINT(cx, cy)
-                         : zero;
+                             ? GET_DATA_POINT(cx, cy)
+                             : zero;
 
-      const T img_fxcy = (fx >= 0 && cy <= data_height - 1)
-                         ? GET_DATA_POINT(fx, cy)
-                         : zero;
+      const T img_fxcy =
+          (fx >= 0 && cy <= data_height - 1) ? GET_DATA_POINT(fx, cy) : zero;
 
-      const T img_cxfy = (cx <= data_width - 1 && fy >= 0)
-                         ? GET_DATA_POINT(cx, fy)
-                         : zero;
+      const T img_cxfy =
+          (cx <= data_width - 1 && fy >= 0) ? GET_DATA_POINT(cx, fy) : zero;
 
       // Update partial gradients wrt relevant warp field entries
       atomicAdd(grad_warp + warp_id_x,
@@ -241,7 +219,7 @@ __global__ void ResamplerGrad2DKernel(const T* __restrict__ data,
       }
       if (cx <= data_width - 1 && cy <= data_height - 1) {
         UPDATE_GRAD_DATA_POINT(cx, cy,
-                               grad_output_value  * (one - dx) * (one - dy));
+                               grad_output_value * (one - dx) * (one - dy));
       }
       if (fx >= 0 && cy <= data_height - 1) {
         UPDATE_GRAD_DATA_POINT(fx, cy, grad_output_value * dx * (one - dy));
@@ -261,43 +239,37 @@ __global__ void ResamplerGrad2DKernel(const T* __restrict__ data,
 namespace functor {
 
 template <typename T>
-struct ResamplerGrad2DFunctor<GPUDevice, T>{
-  void operator ()(::tensorflow::OpKernelContext* ctx,
-                   const GPUDevice& d,
-                   const T* __restrict__ data,
-                   const T* __restrict__ warp,
-                   const T* __restrict__ grad_output,
-                   T* __restrict__ grad_data,
-                   T* __restrict__ grad_warp,
-                   const int batch_size,
-                   const int data_height,
-                   const int data_width,
-                   const int data_channels,
-                   const int num_sampling_points) {
-  // Set gradients to 0, because the kernel incrementally updates the
-  // tensor entries by adding partial contributions.
-  const int grad_warp_size = batch_size * num_sampling_points * 2;
-  const int grad_data_size = batch_size * data_height * data_width *
-      data_channels;
-
-  ::tensorflow::CudaLaunchConfig config =
-     ::tensorflow::GetCudaLaunchConfig(grad_warp_size, d);
-  ::tensorflow::SetZero
-      <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-          grad_warp_size, grad_warp);
-
-  config = ::tensorflow::GetCudaLaunchConfig(grad_data_size, d);
-  ::tensorflow::SetZero
-      <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-          grad_data_size, grad_data);
-
-  const int resampler_output_size = batch_size * num_sampling_points *
-      data_channels;
-  config = ::tensorflow::GetCudaLaunchConfig(resampler_output_size, d);
-  ResamplerGrad2DKernel<T>
-      <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-          data, warp, grad_output, grad_data, grad_warp, batch_size,
-          data_height, data_width, data_channels, num_sampling_points);
+struct ResamplerGrad2DFunctor<GPUDevice, T> {
+  void operator()(::tensorflow::OpKernelContext* ctx, const GPUDevice& d,
+                  const T* __restrict__ data, const T* __restrict__ warp,
+                  const T* __restrict__ grad_output, T* __restrict__ grad_data,
+                  T* __restrict__ grad_warp, const int batch_size,
+                  const int data_height, const int data_width,
+                  const int data_channels, const int num_sampling_points) {
+    // Set gradients to 0, because the kernel incrementally updates the
+    // tensor entries by adding partial contributions.
+    const int grad_warp_size = batch_size * num_sampling_points * 2;
+    const int grad_data_size =
+        batch_size * data_height * data_width * data_channels;
+
+    ::tensorflow::CudaLaunchConfig config =
+        ::tensorflow::GetCudaLaunchConfig(grad_warp_size, d);
+    ::tensorflow::
+        SetZero<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+            grad_warp_size, grad_warp);
+
+    config = ::tensorflow::GetCudaLaunchConfig(grad_data_size, d);
+    ::tensorflow::
+        SetZero<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+            grad_data_size, grad_data);
+
+    const int resampler_output_size =
+        batch_size * num_sampling_points * data_channels;
+    config = ::tensorflow::GetCudaLaunchConfig(resampler_output_size, d);
+    ResamplerGrad2DKernel<T>
+        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+            data, warp, grad_output, grad_data, grad_warp, batch_size,
+            data_height, data_width, data_channels, num_sampling_points);
   }
 };
 
diff --git a/tensorflow/contrib/rnn/kernels/blas_gemm.cc b/tensorflow/contrib/rnn/kernels/blas_gemm.cc
index e62501e9b100484a7be3cc6ae0fc25905c0d0724..03006dab323a7c6dc83d9a17c035ef705f7b0366 100644
--- a/tensorflow/contrib/rnn/kernels/blas_gemm.cc
+++ b/tensorflow/contrib/rnn/kernels/blas_gemm.cc
@@ -36,11 +36,10 @@ perftools::gputools::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory) {
 
 namespace functor {
 template <typename T>
-void TensorCuBlasGemm<T>::operator()(OpKernelContext* ctx,
-                                     bool transa, bool transb, uint64 m,
-                                     uint64 n, uint64 k, T alpha, const T* a,
-                                     int lda, const T* b, int ldb, T beta, T* c,
-                                     int ldc) {
+void TensorCuBlasGemm<T>::operator()(OpKernelContext* ctx, bool transa,
+                                     bool transb, uint64 m, uint64 n, uint64 k,
+                                     T alpha, const T* a, int lda, const T* b,
+                                     int ldb, T beta, T* c, int ldc) {
 #if GOOGLE_CUDA
   perftools::gputools::blas::Transpose trans[] = {
       perftools::gputools::blas::Transpose::kNoTranspose,
diff --git a/tensorflow/contrib/rnn/kernels/gru_ops.cc b/tensorflow/contrib/rnn/kernels/gru_ops.cc
index 0796f82b214620dd71d154fb8f8ec953dbcbb9ec..bd3d898fb09da0f490050c85b1e585502d8ecb2c 100644
--- a/tensorflow/contrib/rnn/kernels/gru_ops.cc
+++ b/tensorflow/contrib/rnn/kernels/gru_ops.cc
@@ -15,8 +15,8 @@ limitations under the License.
 
 #define EIGEN_USE_THREADS
 
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/contrib/rnn/kernels/gru_ops.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/op_kernel.h"
 
 namespace tensorflow {
@@ -61,9 +61,9 @@ class GRUCellBlockOp : public OpKernel {
                                         h_prev_tensor->dim_size(0), " vs. ",
                                         batch_size));
     OP_REQUIRES(ctx, h_prev_tensor->dim_size(1) == cell_size,
-                errors::InvalidArgument("h_prev.dims(1) != cell_size: ",
-                                        h_prev_tensor->dim_size(1), " vs. ",
-                                        cell_size));
+                errors::InvalidArgument(
+                    "h_prev.dims(1) != cell_size: ", h_prev_tensor->dim_size(1),
+                    " vs. ", cell_size));
 
     // Shape of 'w_ru' must be [input_size+cell_size, 2*cell_size]
     OP_REQUIRES(ctx, w_ru_tensor->dim_size(0) == input_size + cell_size,
@@ -82,10 +82,10 @@ class GRUCellBlockOp : public OpKernel {
                     "w_c.dim_size(0) != input_size + cell_size: ",
                     w_c_tensor->dim_size(0), " vs. ", input_size + cell_size));
 
-    OP_REQUIRES(
-        ctx, w_c_tensor->dim_size(1) == cell_size,
-        errors::InvalidArgument("w_c.dim_size(1) != cell_size: ",
-                                w_c_tensor->dim_size(1), " vs. ", cell_size));
+    OP_REQUIRES(ctx, w_c_tensor->dim_size(1) == cell_size,
+                errors::InvalidArgument(
+                    "w_c.dim_size(1) != cell_size: ", w_c_tensor->dim_size(1),
+                    " vs. ", cell_size));
 
     // Shape of 'b_ru' must be [2*cell_size]
     OP_REQUIRES(ctx, b_ru_tensor->dim_size(0) == cell_size * 2,
@@ -97,10 +97,10 @@ class GRUCellBlockOp : public OpKernel {
                 errors::InvalidArgument("Rank of b_ru must be 1",
                                         b_ru_tensor->dims(), " vs. 1", 1));
     // Shape of 'b_c' must be [cell_size]
-    OP_REQUIRES(
-        ctx, b_c_tensor->dim_size(0) == cell_size,
-        errors::InvalidArgument("b_c.dim_size(0) != cell_size: ",
-                                b_c_tensor->dim_size(0), " vs. ", cell_size));
+    OP_REQUIRES(ctx, b_c_tensor->dim_size(0) == cell_size,
+                errors::InvalidArgument(
+                    "b_c.dim_size(0) != cell_size: ", b_c_tensor->dim_size(0),
+                    " vs. ", cell_size));
     OP_REQUIRES(ctx, b_c_tensor->dims() == 1,
                 errors::InvalidArgument("Rank of b_c must be 1",
                                         b_c_tensor->dims(), " vs. 1"));
@@ -216,9 +216,9 @@ class GRUBlockCellGradOp : public OpKernel {
                                         h_prev_tensor->dim_size(0), " vs. ",
                                         batch_size));
     OP_REQUIRES(ctx, h_prev_tensor->dim_size(1) == cell_size,
-                errors::InvalidArgument("h_prev.dims(1) != cell_size: ",
-                                        h_prev_tensor->dim_size(1), " vs. ",
-                                        cell_size));
+                errors::InvalidArgument(
+                    "h_prev.dims(1) != cell_size: ", h_prev_tensor->dim_size(1),
+                    " vs. ", cell_size));
 
     // Shape of 'w_ru' must be [input_size+cell_size, 2*cell_size]
     OP_REQUIRES(ctx, w_ru_tensor->dim_size(0) == input_size + cell_size,
@@ -237,10 +237,10 @@ class GRUBlockCellGradOp : public OpKernel {
                     "w_c.dim_size(0) != input_size + cell_size: ",
                     w_c_tensor->dim_size(0), " vs. ", input_size + cell_size));
 
-    OP_REQUIRES(
-        ctx, w_c_tensor->dim_size(1) == cell_size,
-        errors::InvalidArgument("w_c.dim_size(1) != cell_size: ",
-                                w_c_tensor->dim_size(1), " vs. ", cell_size));
+    OP_REQUIRES(ctx, w_c_tensor->dim_size(1) == cell_size,
+                errors::InvalidArgument(
+                    "w_c.dim_size(1) != cell_size: ", w_c_tensor->dim_size(1),
+                    " vs. ", cell_size));
 
     // Shape of 'b_ru' must be [2*cell_size]
     OP_REQUIRES(ctx, b_ru_tensor->dim_size(0) == cell_size * 2,
@@ -253,54 +253,54 @@ class GRUBlockCellGradOp : public OpKernel {
                                         b_ru_tensor->dims(), " vs. 1"));
 
     // Shape of 'b_c' must be [cell_size]
-    OP_REQUIRES(
-        ctx, b_c_tensor->dim_size(0) == cell_size,
-        errors::InvalidArgument("b_c.dim_size(0) != cell_size: ",
-                                b_c_tensor->dim_size(0), " vs. ", cell_size));
+    OP_REQUIRES(ctx, b_c_tensor->dim_size(0) == cell_size,
+                errors::InvalidArgument(
+                    "b_c.dim_size(0) != cell_size: ", b_c_tensor->dim_size(0),
+                    " vs. ", cell_size));
 
     OP_REQUIRES(ctx, b_c_tensor->dims() == 1,
                 errors::InvalidArgument("Rank of b_c must be 1 ",
                                         b_c_tensor->dims(), " vs. 1"));
 
     // Shape of 'r' must be [batch_size, cell_size]
-    OP_REQUIRES(
-        ctx, r_tensor->dim_size(0) == batch_size,
-        errors::InvalidArgument("r.dims(0) != batch_size: ",
-                                r_tensor->dim_size(0), " vs. ", batch_size));
-    OP_REQUIRES(
-        ctx, r_tensor->dim_size(1) == cell_size,
-        errors::InvalidArgument("r.dims(1) != cell_size: ",
-                                r_tensor->dim_size(1), " vs. ", cell_size));
+    OP_REQUIRES(ctx, r_tensor->dim_size(0) == batch_size,
+                errors::InvalidArgument(
+                    "r.dims(0) != batch_size: ", r_tensor->dim_size(0), " vs. ",
+                    batch_size));
+    OP_REQUIRES(ctx, r_tensor->dim_size(1) == cell_size,
+                errors::InvalidArgument(
+                    "r.dims(1) != cell_size: ", r_tensor->dim_size(1), " vs. ",
+                    cell_size));
 
     // Shape of 'u' must be [batch_size, cell_size]
-    OP_REQUIRES(
-        ctx, u_tensor->dim_size(0) == batch_size,
-        errors::InvalidArgument("u.dims(0) != batch_size: ",
-                                u_tensor->dim_size(0), " vs. ", batch_size));
-    OP_REQUIRES(
-        ctx, u_tensor->dim_size(1) == cell_size,
-        errors::InvalidArgument("u.dims(1) != cell_size: ",
-                                u_tensor->dim_size(1), " vs. ", cell_size));
+    OP_REQUIRES(ctx, u_tensor->dim_size(0) == batch_size,
+                errors::InvalidArgument(
+                    "u.dims(0) != batch_size: ", u_tensor->dim_size(0), " vs. ",
+                    batch_size));
+    OP_REQUIRES(ctx, u_tensor->dim_size(1) == cell_size,
+                errors::InvalidArgument(
+                    "u.dims(1) != cell_size: ", u_tensor->dim_size(1), " vs. ",
+                    cell_size));
 
     // Shape of 'c' must be [batch_size, cell_size]
-    OP_REQUIRES(
-        ctx, c_tensor->dim_size(0) == batch_size,
-        errors::InvalidArgument("c.dims(0) != batch_size: ",
-                                c_tensor->dim_size(0), " vs. ", batch_size));
-    OP_REQUIRES(
-        ctx, c_tensor->dim_size(1) == cell_size,
-        errors::InvalidArgument("c.dims(1) != cell_size: ",
-                                c_tensor->dim_size(1), " vs. ", cell_size));
+    OP_REQUIRES(ctx, c_tensor->dim_size(0) == batch_size,
+                errors::InvalidArgument(
+                    "c.dims(0) != batch_size: ", c_tensor->dim_size(0), " vs. ",
+                    batch_size));
+    OP_REQUIRES(ctx, c_tensor->dim_size(1) == cell_size,
+                errors::InvalidArgument(
+                    "c.dims(1) != cell_size: ", c_tensor->dim_size(1), " vs. ",
+                    cell_size));
 
     // Shape of 'd_h' must be [batch_size, cell_size]
-    OP_REQUIRES(
-        ctx, d_h_tensor->dim_size(0) == batch_size,
-        errors::InvalidArgument("d_h.dims(0) != batch_size: ",
-                                d_h_tensor->dim_size(0), " vs. ", batch_size));
-    OP_REQUIRES(
-        ctx, d_h_tensor->dim_size(1) == cell_size,
-        errors::InvalidArgument("d_h.dims(1) != cell_size: ",
-                                d_h_tensor->dim_size(1), " vs. ", cell_size));
+    OP_REQUIRES(ctx, d_h_tensor->dim_size(0) == batch_size,
+                errors::InvalidArgument(
+                    "d_h.dims(0) != batch_size: ", d_h_tensor->dim_size(0),
+                    " vs. ", batch_size));
+    OP_REQUIRES(ctx, d_h_tensor->dim_size(1) == cell_size,
+                errors::InvalidArgument(
+                    "d_h.dims(1) != cell_size: ", d_h_tensor->dim_size(1),
+                    " vs. ", cell_size));
 
     // Create output tensors.
     Tensor* d_x_tensor = nullptr;
diff --git a/tensorflow/contrib/rnn/kernels/lstm_ops.cc b/tensorflow/contrib/rnn/kernels/lstm_ops.cc
index 941a457fd3ada312b981fb23c769ff9ecea9ff13..5e7cf0ce84d332bd24088cd78995f7843813328b 100644
--- a/tensorflow/contrib/rnn/kernels/lstm_ops.cc
+++ b/tensorflow/contrib/rnn/kernels/lstm_ops.cc
@@ -281,23 +281,23 @@ class LSTMBlockCellOp : public OpKernel {
                                         h_prev_tensor->dim_size(0), " vs. ",
                                         batch_size));
     OP_REQUIRES(ctx, h_prev_tensor->dim_size(1) == cell_size,
-                errors::InvalidArgument("h_prev.dims(1) != cell_size: ",
-                                        h_prev_tensor->dim_size(1), " vs. ",
-                                        cell_size));
+                errors::InvalidArgument(
+                    "h_prev.dims(1) != cell_size: ", h_prev_tensor->dim_size(1),
+                    " vs. ", cell_size));
 
     OP_REQUIRES(ctx, w_tensor->dim_size(0) == input_size + cell_size,
                 errors::InvalidArgument(
                     "w.dim_size(0) != input_size + cell_size: ",
                     w_tensor->dim_size(0), " vs. ", input_size + cell_size));
-    OP_REQUIRES(
-        ctx, w_tensor->dim_size(1) == cell_size * 4,
-        errors::InvalidArgument("w.dim_size(1) != cell_size * 4: ",
-                                w_tensor->dim_size(1), " vs. ", cell_size * 4));
+    OP_REQUIRES(ctx, w_tensor->dim_size(1) == cell_size * 4,
+                errors::InvalidArgument(
+                    "w.dim_size(1) != cell_size * 4: ", w_tensor->dim_size(1),
+                    " vs. ", cell_size * 4));
 
-    OP_REQUIRES(
-        ctx, b_tensor->dim_size(0) == cell_size * 4,
-        errors::InvalidArgument("b.dim_size(0) != cell_size * 4: ",
-                                b_tensor->dim_size(0), " vs. ", cell_size * 4));
+    OP_REQUIRES(ctx, b_tensor->dim_size(0) == cell_size * 4,
+                errors::InvalidArgument(
+                    "b.dim_size(0) != cell_size * 4: ", b_tensor->dim_size(0),
+                    " vs. ", cell_size * 4));
 
     // Allocate our output tensors.
     Tensor* i_tensor = nullptr;
@@ -484,77 +484,77 @@ class LSTMBlockCellGradOp : public OpKernel {
                                         h_prev_tensor->dim_size(0), " vs. ",
                                         batch_size));
     OP_REQUIRES(ctx, h_prev_tensor->dim_size(1) == cell_size,
-                errors::InvalidArgument("h_prev.dims(1) != cell_size: ",
-                                        h_prev_tensor->dim_size(1), " vs. ",
-                                        cell_size));
+                errors::InvalidArgument(
+                    "h_prev.dims(1) != cell_size: ", h_prev_tensor->dim_size(1),
+                    " vs. ", cell_size));
 
     OP_REQUIRES(ctx, w_tensor->dim_size(0) == input_size + cell_size,
                 errors::InvalidArgument(
                     "w.dim_size(0) != input_size + cell_size: ",
                     w_tensor->dim_size(0), " vs. ", input_size + cell_size));
-    OP_REQUIRES(
-        ctx, w_tensor->dim_size(1) == cell_size * 4,
-        errors::InvalidArgument("w.dim_size(1) != cell_size * 4: ",
-                                w_tensor->dim_size(1), " vs. ", cell_size * 4));
+    OP_REQUIRES(ctx, w_tensor->dim_size(1) == cell_size * 4,
+                errors::InvalidArgument(
+                    "w.dim_size(1) != cell_size * 4: ", w_tensor->dim_size(1),
+                    " vs. ", cell_size * 4));
 
-    OP_REQUIRES(
-        ctx, b_tensor->dim_size(0) == cell_size * 4,
-        errors::InvalidArgument("b.dim_size(0) != cell_size * 4: ",
-                                b_tensor->dim_size(0), " vs. ", cell_size * 4));
+    OP_REQUIRES(ctx, b_tensor->dim_size(0) == cell_size * 4,
+                errors::InvalidArgument(
+                    "b.dim_size(0) != cell_size * 4: ", b_tensor->dim_size(0),
+                    " vs. ", cell_size * 4));
 
-    OP_REQUIRES(
-        ctx, i_tensor->dim_size(0) == batch_size,
-        errors::InvalidArgument("i.dim_size(0) != batch_size: ",
-                                i_tensor->dim_size(0), " vs. ", batch_size));
-    OP_REQUIRES(
-        ctx, i_tensor->dim_size(1) == cell_size,
-        errors::InvalidArgument("i.dim_size(1) != cell_size: ",
-                                i_tensor->dim_size(1), " vs. ", cell_size));
+    OP_REQUIRES(ctx, i_tensor->dim_size(0) == batch_size,
+                errors::InvalidArgument(
+                    "i.dim_size(0) != batch_size: ", i_tensor->dim_size(0),
+                    " vs. ", batch_size));
+    OP_REQUIRES(ctx, i_tensor->dim_size(1) == cell_size,
+                errors::InvalidArgument(
+                    "i.dim_size(1) != cell_size: ", i_tensor->dim_size(1),
+                    " vs. ", cell_size));
 
-    OP_REQUIRES(
-        ctx, cs_tensor->dim_size(0) == batch_size,
-        errors::InvalidArgument("cs.dim_size(0) != batch_size: ",
-                                cs_tensor->dim_size(0), " vs. ", batch_size));
-    OP_REQUIRES(
-        ctx, cs_tensor->dim_size(1) == cell_size,
-        errors::InvalidArgument("cs.dim_size(1) != cell_size: ",
-                                cs_tensor->dim_size(1), " vs. ", cell_size));
+    OP_REQUIRES(ctx, cs_tensor->dim_size(0) == batch_size,
+                errors::InvalidArgument(
+                    "cs.dim_size(0) != batch_size: ", cs_tensor->dim_size(0),
+                    " vs. ", batch_size));
+    OP_REQUIRES(ctx, cs_tensor->dim_size(1) == cell_size,
+                errors::InvalidArgument(
+                    "cs.dim_size(1) != cell_size: ", cs_tensor->dim_size(1),
+                    " vs. ", cell_size));
 
-    OP_REQUIRES(
-        ctx, f_tensor->dim_size(0) == batch_size,
-        errors::InvalidArgument("f.dim_size(0) != batch_size: ",
-                                f_tensor->dim_size(0), " vs. ", batch_size));
-    OP_REQUIRES(
-        ctx, f_tensor->dim_size(1) == cell_size,
-        errors::InvalidArgument("i.dim_size(1) != cell_size: ",
-                                f_tensor->dim_size(1), " vs. ", cell_size));
+    OP_REQUIRES(ctx, f_tensor->dim_size(0) == batch_size,
+                errors::InvalidArgument(
+                    "f.dim_size(0) != batch_size: ", f_tensor->dim_size(0),
+                    " vs. ", batch_size));
+    OP_REQUIRES(ctx, f_tensor->dim_size(1) == cell_size,
+                errors::InvalidArgument(
+                    "i.dim_size(1) != cell_size: ", f_tensor->dim_size(1),
+                    " vs. ", cell_size));
 
-    OP_REQUIRES(
-        ctx, o_tensor->dim_size(0) == batch_size,
-        errors::InvalidArgument("o.dim_size(0) != batch_size: ",
-                                o_tensor->dim_size(0), " vs. ", batch_size));
-    OP_REQUIRES(
-        ctx, o_tensor->dim_size(1) == cell_size,
-        errors::InvalidArgument("o.dim_size(1) != cell_size: ",
-                                o_tensor->dim_size(1), " vs. ", cell_size));
+    OP_REQUIRES(ctx, o_tensor->dim_size(0) == batch_size,
+                errors::InvalidArgument(
+                    "o.dim_size(0) != batch_size: ", o_tensor->dim_size(0),
+                    " vs. ", batch_size));
+    OP_REQUIRES(ctx, o_tensor->dim_size(1) == cell_size,
+                errors::InvalidArgument(
+                    "o.dim_size(1) != cell_size: ", o_tensor->dim_size(1),
+                    " vs. ", cell_size));
 
-    OP_REQUIRES(
-        ctx, ci_tensor->dim_size(0) == batch_size,
-        errors::InvalidArgument("ci.dim_size(0) != batch_size: ",
-                                ci_tensor->dim_size(0), " vs. ", batch_size));
-    OP_REQUIRES(
-        ctx, ci_tensor->dim_size(1) == cell_size,
-        errors::InvalidArgument("ci.dim_size(1) != cell_size: ",
-                                ci_tensor->dim_size(1), " vs. ", cell_size));
+    OP_REQUIRES(ctx, ci_tensor->dim_size(0) == batch_size,
+                errors::InvalidArgument(
+                    "ci.dim_size(0) != batch_size: ", ci_tensor->dim_size(0),
+                    " vs. ", batch_size));
+    OP_REQUIRES(ctx, ci_tensor->dim_size(1) == cell_size,
+                errors::InvalidArgument(
+                    "ci.dim_size(1) != cell_size: ", ci_tensor->dim_size(1),
+                    " vs. ", cell_size));
 
-    OP_REQUIRES(
-        ctx, co_tensor->dim_size(0) == batch_size,
-        errors::InvalidArgument("co.dim_size(0) != batch_size: ",
-                                co_tensor->dim_size(0), " vs. ", batch_size));
-    OP_REQUIRES(
-        ctx, co_tensor->dim_size(1) == cell_size,
-        errors::InvalidArgument("co.dim_size(1) != cell_size: ",
-                                co_tensor->dim_size(1), " vs. ", cell_size));
+    OP_REQUIRES(ctx, co_tensor->dim_size(0) == batch_size,
+                errors::InvalidArgument(
+                    "co.dim_size(0) != batch_size: ", co_tensor->dim_size(0),
+                    " vs. ", batch_size));
+    OP_REQUIRES(ctx, co_tensor->dim_size(1) == cell_size,
+                errors::InvalidArgument(
+                    "co.dim_size(1) != cell_size: ", co_tensor->dim_size(1),
+                    " vs. ", cell_size));
 
     OP_REQUIRES(ctx, cs_grad_tensor->dim_size(0) == batch_size,
                 errors::InvalidArgument(
@@ -860,9 +860,9 @@ class BlockLSTMOp : public OpKernel {
                                         h_prev_tensor->dim_size(0), " vs. ",
                                         batch_size));
     OP_REQUIRES(ctx, h_prev_tensor->dim_size(1) == cell_size,
-                errors::InvalidArgument("h_prev.dims(1) != cell_size: ",
-                                        h_prev_tensor->dim_size(1), " vs. ",
-                                        cell_size));
+                errors::InvalidArgument(
+                    "h_prev.dims(1) != cell_size: ", h_prev_tensor->dim_size(1),
+                    " vs. ", cell_size));
 
     const Tensor* w_tensor = nullptr;
     OP_REQUIRES_OK(ctx, ctx->input("w", &w_tensor));
@@ -872,46 +872,46 @@ class BlockLSTMOp : public OpKernel {
                 errors::InvalidArgument(
                     "w.dim_size(0) != input_size + cell_size: ",
                     w_tensor->dim_size(0), " vs. ", input_size + cell_size));
-    OP_REQUIRES(
-        ctx, w_tensor->dim_size(1) == cell_size * 4,
-        errors::InvalidArgument("w.dim_size(1) != cell_size * 4: ",
-                                w_tensor->dim_size(1), " vs. ", cell_size * 4));
+    OP_REQUIRES(ctx, w_tensor->dim_size(1) == cell_size * 4,
+                errors::InvalidArgument(
+                    "w.dim_size(1) != cell_size * 4: ", w_tensor->dim_size(1),
+                    " vs. ", cell_size * 4));
 
     const Tensor* wci_tensor = nullptr;
     OP_REQUIRES_OK(ctx, ctx->input("wci", &wci_tensor));
     OP_REQUIRES(ctx, wci_tensor->dims() == 1,
                 errors::InvalidArgument("wci must be 1D"));
-    OP_REQUIRES(
-        ctx, wci_tensor->dim_size(0) == cell_size,
-        errors::InvalidArgument("wci.dim_size(0) != cell_size: ",
-                                wci_tensor->dim_size(0), " vs. ", cell_size));
+    OP_REQUIRES(ctx, wci_tensor->dim_size(0) == cell_size,
+                errors::InvalidArgument(
+                    "wci.dim_size(0) != cell_size: ", wci_tensor->dim_size(0),
+                    " vs. ", cell_size));
 
     const Tensor* wcf_tensor = nullptr;
     OP_REQUIRES_OK(ctx, ctx->input("wcf", &wcf_tensor));
     OP_REQUIRES(ctx, wcf_tensor->dims() == 1,
                 errors::InvalidArgument("wcf must be 1D"));
-    OP_REQUIRES(
-        ctx, wcf_tensor->dim_size(0) == cell_size,
-        errors::InvalidArgument("wcf.dim_size(0) != cell_size: ",
-                                wcf_tensor->dim_size(0), " vs. ", cell_size));
+    OP_REQUIRES(ctx, wcf_tensor->dim_size(0) == cell_size,
+                errors::InvalidArgument(
+                    "wcf.dim_size(0) != cell_size: ", wcf_tensor->dim_size(0),
+                    " vs. ", cell_size));
 
     const Tensor* wco_tensor = nullptr;
     OP_REQUIRES_OK(ctx, ctx->input("wco", &wco_tensor));
     OP_REQUIRES(ctx, wco_tensor->dims() == 1,
                 errors::InvalidArgument("wco must be 1D"));
-    OP_REQUIRES(
-        ctx, wco_tensor->dim_size(0) == cell_size,
-        errors::InvalidArgument("wco.dim_size(0) != cell_size: ",
-                                wco_tensor->dim_size(0), " vs. ", cell_size));
+    OP_REQUIRES(ctx, wco_tensor->dim_size(0) == cell_size,
+                errors::InvalidArgument(
+                    "wco.dim_size(0) != cell_size: ", wco_tensor->dim_size(0),
+                    " vs. ", cell_size));
 
     const Tensor* b_tensor = nullptr;
     OP_REQUIRES_OK(ctx, ctx->input("b", &b_tensor));
     OP_REQUIRES(ctx, b_tensor->dims() == 1,
                 errors::InvalidArgument("b must be 1D"));
-    OP_REQUIRES(
-        ctx, b_tensor->dim_size(0) == cell_size * 4,
-        errors::InvalidArgument("b.dim_size(0) != cell_size * 4: ",
-                                b_tensor->dim_size(0), " vs. ", cell_size * 4));
+    OP_REQUIRES(ctx, b_tensor->dim_size(0) == cell_size * 4,
+                errors::InvalidArgument(
+                    "b.dim_size(0) != cell_size * 4: ", b_tensor->dim_size(0),
+                    " vs. ", cell_size * 4));
 
     TensorShape batch_cell_shape({timelen, batch_size, cell_size});
     Tensor* i_out;
@@ -1065,9 +1065,9 @@ class BlockLSTMGradOp : public OpKernel {
     OP_REQUIRES_OK(ctx, ctx->input("w", &w_tensor));
     const int64 cell_size = w_tensor->dim_size(1) / 4;
     OP_REQUIRES(ctx, input_size + cell_size == w_tensor->dim_size(0),
-                errors::InvalidArgument("w matrix rows don't match: ",
-                                        input_size + cell_size, " vs. ",
-                                        w_tensor->dim_size(0)));
+                errors::InvalidArgument(
+                    "w matrix rows don't match: ", input_size + cell_size,
+                    " vs. ", w_tensor->dim_size(0)));
 
     const Tensor* wci_tensor = nullptr;
     OP_REQUIRES_OK(ctx, ctx->input("wci", &wci_tensor));
@@ -1193,7 +1193,6 @@ class BlockLSTMGradOp : public OpKernel {
     OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum<T>::v(),
                                            batch_cell_shape, &h_grad_tensor));
 
-
     const Device& device = ctx->eigen_device<Device>();
 
     functor::TensorZero<Device, T>()(device, cs_grad_tensor.flat<float>());
diff --git a/tensorflow/contrib/rnn/kernels/lstm_ops.h b/tensorflow/contrib/rnn/kernels/lstm_ops.h
index bc6b85f3f1ab80b5ef5b4a8ba2e5242cf451adbe..d23cedc234b8c0e1a784346f28164ae79b8cbf89 100644
--- a/tensorflow/contrib/rnn/kernels/lstm_ops.h
+++ b/tensorflow/contrib/rnn/kernels/lstm_ops.h
@@ -92,7 +92,6 @@ struct TensorZeroPadding {
   }
 };
 
-
 struct LSTMBlockCell {
   LSTMBlockCell(const int batch_size, const int input_size, const int cell_size)
       : batch_size_(batch_size),
diff --git a/tensorflow/contrib/rnn/ops/lstm_ops_test.cc b/tensorflow/contrib/rnn/ops/lstm_ops_test.cc
index 544cd163c50062093acf7f5e942f67606936c0e3..68184b643e5e7a04ffecb804703051638514b7b2 100644
--- a/tensorflow/contrib/rnn/ops/lstm_ops_test.cc
+++ b/tensorflow/contrib/rnn/ops/lstm_ops_test.cc
@@ -149,8 +149,9 @@ TEST_F(LSTMOpsTest, BlockLSTMGrad_ShapeFn) {
   INFER_ERROR("must be rank 1", op, "?;?;?;?;?;?;?;?;[1,?]" + suffix);
 
   // Output with all input knowns makes known rank outputs.
-  INFER_OK(op, JoinedCopies("?", 18), "[?,?,?];" + JoinedCopies("[?,?]", 3) +
-                                          ";" + JoinedCopies("[?]", 4));
+  INFER_OK(
+      op, JoinedCopies("?", 18),
+      "[?,?,?];" + JoinedCopies("[?,?]", 3) + ";" + JoinedCopies("[?]", 4));
 
   // Output with copies input shapes to output.
   string input = strings::StrCat("?;[?,?,?];", JoinedCopies("[?,?]", 3), ";",
diff --git a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py
index cafeb56ad88ba83fb42faf16db8ee1035da1deac..9b84635e85e8d300be4a77a4cc261b70d14ae2ac 100644
--- a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py
+++ b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py
@@ -42,7 +42,6 @@ from tensorflow.python.platform import test
 from tensorflow.python.framework import test_util
 from tensorflow.contrib.rnn.python.ops import rnn_cell as contrib_rnn_cell
 
-
 # pylint: enable=protected-access
 Linear = core_rnn_cell._Linear  # pylint: disable=invalid-name
 
@@ -84,19 +83,22 @@ class RNNCellTest(test.TestCase):
         ], [v.name for v in cell.trainable_variables])
         self.assertFalse(cell.non_trainable_variables)
         sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run(
-            [g], {x.name: np.array([[1., 1.]]),
-                  m.name: np.array([[0.1, 0.1]])})
+        res = sess.run([g], {
+            x.name: np.array([[1., 1.]]),
+            m.name: np.array([[0.1, 0.1]])
+        })
         self.assertEqual(res[0].shape, (1, 2))
 
   def testBasicRNNCellNotTrainable(self):
     with self.test_session() as sess:
+
       def not_trainable_getter(getter, *args, **kwargs):
         kwargs["trainable"] = False
         return getter(*args, **kwargs)
 
       with variable_scope.variable_scope(
-          "root", initializer=init_ops.constant_initializer(0.5),
+          "root",
+          initializer=init_ops.constant_initializer(0.5),
           custom_getter=not_trainable_getter):
         x = array_ops.zeros([1, 2])
         m = array_ops.zeros([1, 2])
@@ -108,9 +110,10 @@ class RNNCellTest(test.TestCase):
             "root/basic_rnn_cell/%s:0" % rnn_cell_impl._BIAS_VARIABLE_NAME
         ], [v.name for v in cell.non_trainable_variables])
         sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run(
-            [g], {x.name: np.array([[1., 1.]]),
-                  m.name: np.array([[0.1, 0.1]])})
+        res = sess.run([g], {
+            x.name: np.array([[1., 1.]]),
+            m.name: np.array([[0.1, 0.1]])
+        })
         self.assertEqual(res[0].shape, (1, 2))
 
   def testGRUCell(self):
@@ -121,9 +124,10 @@ class RNNCellTest(test.TestCase):
         m = array_ops.zeros([1, 2])
         g, _ = rnn_cell_impl.GRUCell(2)(x, m)
         sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run(
-            [g], {x.name: np.array([[1., 1.]]),
-                  m.name: np.array([[0.1, 0.1]])})
+        res = sess.run([g], {
+            x.name: np.array([[1., 1.]]),
+            m.name: np.array([[0.1, 0.1]])
+        })
         # Smoke test
         self.assertAllClose(res[0], [[0.175991, 0.175991]])
       with variable_scope.variable_scope(
@@ -133,10 +137,10 @@ class RNNCellTest(test.TestCase):
         m = array_ops.zeros([1, 2])
         g, _ = rnn_cell_impl.GRUCell(2)(x, m)
         sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run(
-            [g],
-            {x.name: np.array([[1., 1., 1.]]),
-             m.name: np.array([[0.1, 0.1]])})
+        res = sess.run([g], {
+            x.name: np.array([[1., 1., 1.]]),
+            m.name: np.array([[0.1, 0.1]])
+        })
         # Smoke test
         self.assertAllClose(res[0], [[0.156736, 0.156736]])
 
@@ -148,11 +152,26 @@ class RNNCellTest(test.TestCase):
         m = array_ops.zeros([1, 2])
         g, _ = contrib_rnn_cell.SRUCell(2)(x, m)
         sess.run([variables_lib.global_variables_initializer()])
+        res = sess.run([g], {
+            x.name: np.array([[1., 1.]]),
+            m.name: np.array([[0.1, 0.1]])
+        })
+        # Smoke test
+        self.assertAllClose(res[0], [[0.509682, 0.509682]])
+
+  def testSRUCellWithDiffSize(self):
+    with self.test_session() as sess:
+      with variable_scope.variable_scope(
+          "root", initializer=init_ops.constant_initializer(0.5)):
+        x = array_ops.zeros([1, 3])
+        m = array_ops.zeros([1, 2])
+        g, _ = contrib_rnn_cell.SRUCell(2)(x, m)
+        sess.run([variables_lib.global_variables_initializer()])
         res = sess.run(
-            [g], {x.name: np.array([[1., 1.]]),
+            [g], {x.name: np.array([[1., 1., 1.]]),
                   m.name: np.array([[0.1, 0.1]])})
         # Smoke test
-        self.assertAllClose(res[0], [[0.509682,  0.509682]])
+        self.assertAllClose(res[0], [[0.55255556, 0.55255556]])
 
   def testBasicLSTMCell(self):
     for dtype in [dtypes.float16, dtypes.float32]:
@@ -164,8 +183,7 @@ class RNNCellTest(test.TestCase):
           m = array_ops.zeros([1, 8], dtype=dtype)
           cell = rnn_cell_impl.MultiRNNCell(
               [
-                  rnn_cell_impl.BasicLSTMCell(
-                      2, state_is_tuple=False)
+                  rnn_cell_impl.BasicLSTMCell(2, state_is_tuple=False)
                   for _ in range(2)
               ],
               state_is_tuple=False)
@@ -183,22 +201,21 @@ class RNNCellTest(test.TestCase):
               "root/multi_rnn_cell/cell_1/basic_lstm_cell/%s:0" %
               rnn_cell_impl._BIAS_VARIABLE_NAME
           ]
-          self.assertEqual(
-              expected_variable_names,
-              [v.name for v in cell.trainable_variables])
+          self.assertEqual(expected_variable_names,
+                           [v.name for v in cell.trainable_variables])
           self.assertFalse(cell.non_trainable_variables)
           sess.run([variables_lib.global_variables_initializer()])
-          res = sess.run(
-              [g, out_m],
-              {x.name: np.array([[1., 1.]]),
-               m.name: 0.1 * np.ones([1, 8])})
+          res = sess.run([g, out_m], {
+              x.name: np.array([[1., 1.]]),
+              m.name: 0.1 * np.ones([1, 8])
+          })
           self.assertEqual(len(res), 2)
           variables = variables_lib.global_variables()
           self.assertEqual(expected_variable_names, [v.name for v in variables])
           # The numbers in results were not calculated, this is just a
           # smoke test.
-          self.assertAllClose(
-              res[0], np.array([[0.240, 0.240]], dtype=np_dtype), 1e-2)
+          self.assertAllClose(res[0], np.array(
+              [[0.240, 0.240]], dtype=np_dtype), 1e-2)
           expected_mem = np.array(
               [[0.689, 0.689, 0.448, 0.448, 0.398, 0.398, 0.240, 0.240]],
               dtype=np_dtype)
@@ -208,13 +225,13 @@ class RNNCellTest(test.TestCase):
           # Test BasicLSTMCell with input_size != num_units.
           x = array_ops.zeros([1, 3], dtype=dtype)
           m = array_ops.zeros([1, 4], dtype=dtype)
-          g, out_m = rnn_cell_impl.BasicLSTMCell(
-              2, state_is_tuple=False)(x, m)
+          g, out_m = rnn_cell_impl.BasicLSTMCell(2, state_is_tuple=False)(x, m)
           sess.run([variables_lib.global_variables_initializer()])
           res = sess.run(
-              [g, out_m],
-              {x.name: np.array([[1., 1., 1.]], dtype=np_dtype),
-               m.name: 0.1 * np.ones([1, 4], dtype=np_dtype)})
+              [g, out_m], {
+                  x.name: np.array([[1., 1., 1.]], dtype=np_dtype),
+                  m.name: 0.1 * np.ones([1, 4], dtype=np_dtype)
+              })
           self.assertEqual(len(res), 2)
 
   def testBasicLSTMCellDimension0Error(self):
@@ -232,9 +249,11 @@ class RNNCellTest(test.TestCase):
           g, out_m = rnn_cell_impl.BasicLSTMCell(
               num_units, state_is_tuple=False)(x, m)
           sess.run([variables_lib.global_variables_initializer()])
-          sess.run([g, out_m],
-                   {x.name: 1 * np.ones([batch_size, input_size]),
-                    m.name: 0.1 * np.ones([batch_size - 1, state_size])})
+          sess.run(
+              [g, out_m], {
+                  x.name: 1 * np.ones([batch_size, input_size]),
+                  m.name: 0.1 * np.ones([batch_size - 1, state_size])
+              })
 
   def testBasicLSTMCellStateSizeError(self):
     """Tests that state_size must be num_units * 2."""
@@ -251,9 +270,11 @@ class RNNCellTest(test.TestCase):
           g, out_m = rnn_cell_impl.BasicLSTMCell(
               num_units, state_is_tuple=False)(x, m)
           sess.run([variables_lib.global_variables_initializer()])
-          sess.run([g, out_m],
-                   {x.name: 1 * np.ones([batch_size, input_size]),
-                    m.name: 0.1 * np.ones([batch_size, state_size])})
+          sess.run(
+              [g, out_m], {
+                  x.name: 1 * np.ones([batch_size, input_size]),
+                  m.name: 0.1 * np.ones([batch_size, state_size])
+              })
 
   def testBasicLSTMCellStateTupleType(self):
     with self.test_session():
@@ -301,11 +322,12 @@ class RNNCellTest(test.TestCase):
             state_is_tuple=True)
         g, (out_m0, out_m1) = cell(x, (m0, m1))
         sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run([g, out_m0, out_m1], {
-            x.name: np.array([[1., 1.]]),
-            m0.name: 0.1 * np.ones([1, 4]),
-            m1.name: 0.1 * np.ones([1, 4])
-        })
+        res = sess.run(
+            [g, out_m0, out_m1], {
+                x.name: np.array([[1., 1.]]),
+                m0.name: 0.1 * np.ones([1, 4]),
+                m1.name: 0.1 * np.ones([1, 4])
+            })
         self.assertEqual(len(res), 3)
         # The numbers in results were not calculated, this is just a smoke test.
         # Note, however, these values should match the original
@@ -336,10 +358,11 @@ class RNNCellTest(test.TestCase):
             state_is_tuple=False)
         output, state = cell(x, m)
         sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run([output, state], {
-            x.name: np.array([[1., 1.], [2., 2.], [3., 3.]]),
-            m.name: 0.1 * np.ones((batch_size, state_size))
-        })
+        res = sess.run(
+            [output, state], {
+                x.name: np.array([[1., 1.], [2., 2.], [3., 3.]]),
+                m.name: 0.1 * np.ones((batch_size, state_size))
+            })
         self.assertEqual(len(res), 2)
         # The numbers in results were not calculated, this is mostly just a
         # smoke test.
@@ -442,10 +465,10 @@ class RNNCellTest(test.TestCase):
             rnn_cell_impl.GRUCell(3), num_proj=3)
         g, new_m = cell(x, m)
         sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run(
-            [g, new_m],
-            {x.name: np.array([[1., 1.]]),
-             m.name: np.array([[0.1, 0.1, 0.1]])})
+        res = sess.run([g, new_m], {
+            x.name: np.array([[1., 1.]]),
+            m.name: np.array([[0.1, 0.1, 0.1]])
+        })
         self.assertEqual(res[1].shape, (1, 3))
         # The numbers in results were not calculated, this is just a smoke test.
         self.assertAllClose(res[0], [[0.154605, 0.154605, 0.154605]])
@@ -479,9 +502,11 @@ class RNNCellTest(test.TestCase):
         base_cell = rnn_cell_impl.GRUCell(3)
         g, m_new = base_cell(x, m)
         variable_scope.get_variable_scope().reuse_variables()
+
         def residual_with_slice_fn(inp, out):
           inp_sliced = array_ops.slice(inp, [0, 0], [-1, 3])
           return inp_sliced + out
+
         g_res, m_new_res = rnn_cell_impl.ResidualWrapper(
             base_cell, residual_with_slice_fn)(x, m)
         sess.run([variables_lib.global_variables_initializer()])
@@ -551,10 +576,10 @@ class RNNCellTest(test.TestCase):
         self.assertEqual(embedding_cell.output_size, 2)
         g, new_m = embedding_cell(x, m)
         sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run(
-            [g, new_m],
-            {x.name: np.array([[1]]),
-             m.name: np.array([[0.1, 0.1]])})
+        res = sess.run([g, new_m], {
+            x.name: np.array([[1]]),
+            m.name: np.array([[0.1, 0.1]])
+        })
         self.assertEqual(res[1].shape, (1, 2))
         # The numbers in results were not calculated, this is just a smoke test.
         self.assertAllClose(res[0], [[0.17139, 0.17139]])
@@ -584,8 +609,8 @@ class RNNCellTest(test.TestCase):
         x = array_ops.zeros([1, 2])
         m = array_ops.zeros([1, 4])
         _, ml = rnn_cell_impl.MultiRNNCell(
-            [rnn_cell_impl.GRUCell(2)
-             for _ in range(2)], state_is_tuple=False)(x, m)
+            [rnn_cell_impl.GRUCell(2) for _ in range(2)],
+            state_is_tuple=False)(x, m)
         sess.run([variables_lib.global_variables_initializer()])
         res = sess.run(ml, {
             x.name: np.array([[1., 1.]]),
@@ -605,19 +630,20 @@ class RNNCellTest(test.TestCase):
         # Test incorrectness of state
         with self.assertRaisesRegexp(ValueError, "Expected state .* a tuple"):
           rnn_cell_impl.MultiRNNCell(
-              [rnn_cell_impl.GRUCell(2)
-               for _ in range(2)], state_is_tuple=True)(x, m_bad)
+              [rnn_cell_impl.GRUCell(2) for _ in range(2)],
+              state_is_tuple=True)(x, m_bad)
 
         _, ml = rnn_cell_impl.MultiRNNCell(
-            [rnn_cell_impl.GRUCell(2)
-             for _ in range(2)], state_is_tuple=True)(x, m_good)
+            [rnn_cell_impl.GRUCell(2) for _ in range(2)],
+            state_is_tuple=True)(x, m_good)
 
         sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run(ml, {
-            x.name: np.array([[1., 1.]]),
-            m_good[0].name: np.array([[0.1, 0.1]]),
-            m_good[1].name: np.array([[0.1, 0.1]])
-        })
+        res = sess.run(
+            ml, {
+                x.name: np.array([[1., 1.]]),
+                m_good[0].name: np.array([[0.1, 0.1]]),
+                m_good[1].name: np.array([[0.1, 0.1]])
+            })
 
         # The numbers in results were not calculated, this is just a
         # smoke test.  However, these numbers should match those of
@@ -628,8 +654,11 @@ class RNNCellTest(test.TestCase):
 
 class DropoutWrapperTest(test.TestCase):
 
-  def _testDropoutWrapper(self, batch_size=None, time_steps=None,
-                          parallel_iterations=None, **kwargs):
+  def _testDropoutWrapper(self,
+                          batch_size=None,
+                          time_steps=None,
+                          parallel_iterations=None,
+                          **kwargs):
     with self.test_session() as sess:
       with variable_scope.variable_scope(
           "root", initializer=init_ops.constant_initializer(0.5)):
@@ -640,14 +669,14 @@ class DropoutWrapperTest(test.TestCase):
           x = constant_op.constant(
               [[[2., 2., 2.]], [[1., 1., 1.]]], dtype=dtypes.float32)
           m = rnn_cell_impl.LSTMStateTuple(
-              *[constant_op.constant([[0.1, 0.1, 0.1]], dtype=dtypes.float32)
-               ] * 2)
+              *[constant_op.constant([[0.1, 0.1, 0.1]], dtype=dtypes.float32
+                                    )] * 2)
         else:
           x = constant_op.constant(
               np.random.randn(time_steps, batch_size, 3).astype(np.float32))
           m = rnn_cell_impl.LSTMStateTuple(*[
-              constant_op.constant(
-                  [[0.1, 0.1, 0.1]] * batch_size, dtype=dtypes.float32)
+              constant_op.
+              constant([[0.1, 0.1, 0.1]] * batch_size, dtype=dtypes.float32)
           ] * 2)
         outputs, final_state = rnn.dynamic_rnn(
             cell=rnn_cell_impl.DropoutWrapper(
@@ -674,8 +703,8 @@ class DropoutWrapperTest(test.TestCase):
     res = self._testDropoutWrapper(
         input_keep_prob=keep, output_keep_prob=keep, state_keep_prob=keep)
     true_full_output = np.array(
-        [[[0.751109, 0.751109, 0.751109]],
-         [[0.895509, 0.895509, 0.895509]]], dtype=np.float32)
+        [[[0.751109, 0.751109, 0.751109]], [[0.895509, 0.895509, 0.895509]]],
+        dtype=np.float32)
     true_full_final_c = np.array(
         [[1.949385, 1.949385, 1.949385]], dtype=np.float32)
     self.assertAllClose(true_full_output, res[0])
@@ -687,8 +716,8 @@ class DropoutWrapperTest(test.TestCase):
     res = self._testDropoutWrapper(
         input_keep_prob=keep, output_keep_prob=keep, state_keep_prob=keep)
     true_full_output = np.array(
-        [[[0.751109, 0.751109, 0.751109]],
-         [[0.895509, 0.895509, 0.895509]]], dtype=np.float32)
+        [[[0.751109, 0.751109, 0.751109]], [[0.895509, 0.895509, 0.895509]]],
+        dtype=np.float32)
     true_full_final_c = np.array(
         [[1.949385, 1.949385, 1.949385]], dtype=np.float32)
     self.assertAllClose(true_full_output, res[0])
@@ -703,16 +732,20 @@ class DropoutWrapperTest(test.TestCase):
     ## consistent across both calls.  Otherwise the seed may not end
     ## up being munged consistently across both graphs.
     res_standard_1 = self._testDropoutWrapper(
-        input_keep_prob=keep_some, output_keep_prob=keep_some,
-        state_keep_prob=keep_some, seed=10,
+        input_keep_prob=keep_some,
+        output_keep_prob=keep_some,
+        state_keep_prob=keep_some,
+        seed=10,
         parallel_iterations=1)
     # Clear away the graph and the test session (which keeps variables around)
     ops.reset_default_graph()
     self._ClearCachedSession()
     random_seed.set_random_seed(2)
     res_standard_2 = self._testDropoutWrapper(
-        input_keep_prob=keep_some, output_keep_prob=keep_some,
-        state_keep_prob=keep_some, seed=10,
+        input_keep_prob=keep_some,
+        output_keep_prob=keep_some,
+        state_keep_prob=keep_some,
+        seed=10,
         parallel_iterations=1)
     self.assertAllClose(res_standard_1[0], res_standard_2[0])
     self.assertAllClose(res_standard_1[1].c, res_standard_2[1].c)
@@ -722,11 +755,12 @@ class DropoutWrapperTest(test.TestCase):
     keep_all = variable_scope.get_variable("all", initializer=1.0)
     keep_none = variable_scope.get_variable("none", initializer=1e-10)
     res = self._testDropoutWrapper(
-        input_keep_prob=keep_all, output_keep_prob=keep_none,
+        input_keep_prob=keep_all,
+        output_keep_prob=keep_none,
         state_keep_prob=keep_all)
     true_full_output = np.array(
-        [[[0.751109, 0.751109, 0.751109]],
-         [[0.895509, 0.895509, 0.895509]]], dtype=np.float32)
+        [[[0.751109, 0.751109, 0.751109]], [[0.895509, 0.895509, 0.895509]]],
+        dtype=np.float32)
     true_full_final_c = np.array(
         [[1.949385, 1.949385, 1.949385]], dtype=np.float32)
     self.assertAllClose(np.zeros(res[0].shape), res[0])
@@ -739,13 +773,13 @@ class DropoutWrapperTest(test.TestCase):
     # Even though we dropout state, by default DropoutWrapper never
     # drops out the memory ("c") term of an LSTMStateTuple.
     res = self._testDropoutWrapper(
-        input_keep_prob=keep_all, output_keep_prob=keep_all,
+        input_keep_prob=keep_all,
+        output_keep_prob=keep_all,
         state_keep_prob=keep_none)
-    true_c_state = np.array(
-        [[1.713925, 1.713925, 1.713925]], dtype=np.float32)
+    true_c_state = np.array([[1.713925, 1.713925, 1.713925]], dtype=np.float32)
     true_full_output = np.array(
-        [[[0.751109, 0.751109, 0.751109]],
-         [[0.895509, 0.895509, 0.895509]]], dtype=np.float32)
+        [[[0.751109, 0.751109, 0.751109]], [[0.895509, 0.895509, 0.895509]]],
+        dtype=np.float32)
     self.assertAllClose(true_full_output[0], res[0][0])
     # Second output is modified by zero input state
     self.assertGreater(np.linalg.norm(true_full_output[1] - res[0][1]), 1e-4)
@@ -758,13 +792,14 @@ class DropoutWrapperTest(test.TestCase):
     keep_all = variable_scope.get_variable("all", initializer=1.0)
     keep_none = variable_scope.get_variable("none", initializer=1e-10)
     true_full_output = np.array(
-        [[[0.751109, 0.751109, 0.751109]],
-         [[0.895509, 0.895509, 0.895509]]], dtype=np.float32)
+        [[[0.751109, 0.751109, 0.751109]], [[0.895509, 0.895509, 0.895509]]],
+        dtype=np.float32)
     true_full_final_c = np.array(
         [[1.949385, 1.949385, 1.949385]], dtype=np.float32)
     # All outputs are different because inputs are zeroed out
     res = self._testDropoutWrapper(
-        input_keep_prob=keep_none, output_keep_prob=keep_all,
+        input_keep_prob=keep_none,
+        output_keep_prob=keep_all,
         state_keep_prob=keep_all)
     self.assertGreater(np.linalg.norm(res[0] - true_full_output), 1e-4)
     self.assertGreater(np.linalg.norm(res[1].h - true_full_output[1]), 1e-4)
@@ -774,9 +809,13 @@ class DropoutWrapperTest(test.TestCase):
     keep_some = 0.8
     keep_all = variable_scope.get_variable("all", initializer=1.0)
     res = self._testDropoutWrapper(
-        input_keep_prob=keep_all, output_keep_prob=keep_some,
-        state_keep_prob=keep_all, variational_recurrent=True,
-        input_size=3, batch_size=5, time_steps=7)
+        input_keep_prob=keep_all,
+        output_keep_prob=keep_some,
+        state_keep_prob=keep_all,
+        variational_recurrent=True,
+        input_size=3,
+        batch_size=5,
+        time_steps=7)
     # Ensure the same dropout pattern for all time steps
     output_mask = np.abs(res[0]) > 1e-6
     for m in output_mask[1:]:
@@ -785,9 +824,13 @@ class DropoutWrapperTest(test.TestCase):
   def testDropoutWrapperRecurrentStateInputAndOutput(self):
     keep_some = 0.9
     res = self._testDropoutWrapper(
-        input_keep_prob=keep_some, output_keep_prob=keep_some,
-        state_keep_prob=keep_some, variational_recurrent=True,
-        input_size=3, batch_size=5, time_steps=7)
+        input_keep_prob=keep_some,
+        output_keep_prob=keep_some,
+        state_keep_prob=keep_some,
+        variational_recurrent=True,
+        input_size=3,
+        batch_size=5,
+        time_steps=7)
 
     # Smoke test for the state/input masks.
     output_mask = np.abs(res[0]) > 1e-6
@@ -811,17 +854,27 @@ class DropoutWrapperTest(test.TestCase):
     random_seed.set_random_seed(2347)
     np.random.seed(23487)
     res0 = self._testDropoutWrapper(
-        input_keep_prob=keep_some, output_keep_prob=keep_some,
-        state_keep_prob=keep_some, variational_recurrent=True,
-        input_size=3, batch_size=5, time_steps=7, seed=-234987)
+        input_keep_prob=keep_some,
+        output_keep_prob=keep_some,
+        state_keep_prob=keep_some,
+        variational_recurrent=True,
+        input_size=3,
+        batch_size=5,
+        time_steps=7,
+        seed=-234987)
     ops.reset_default_graph()
     self._ClearCachedSession()
     random_seed.set_random_seed(2347)
     np.random.seed(23487)
     res1 = self._testDropoutWrapper(
-        input_keep_prob=keep_some, output_keep_prob=keep_some,
-        state_keep_prob=keep_some, variational_recurrent=True,
-        input_size=3, batch_size=5, time_steps=7, seed=-234987)
+        input_keep_prob=keep_some,
+        output_keep_prob=keep_some,
+        state_keep_prob=keep_some,
+        variational_recurrent=True,
+        input_size=3,
+        batch_size=5,
+        time_steps=7,
+        seed=-234987)
 
     output_mask = np.abs(res0[0]) > 1e-6
     for time_step in output_mask:
@@ -858,9 +911,10 @@ class SlimRNNCellTest(test.TestCase):
         g, _ = rnn_cell_impl._SlimRNNCell(my_cell)(x, m)
         # pylint: enable=protected-access
         sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run(
-            [g], {x.name: np.array([[1., 1.]]),
-                  m.name: np.array([[0.1, 0.1]])})
+        res = sess.run([g], {
+            x.name: np.array([[1., 1.]]),
+            m.name: np.array([[0.1, 0.1]])
+        })
         self.assertEqual(res[0].shape, (1, 2))
 
   def testBasicRNNCellMatch(self):
diff --git a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py
index 0258d7202df20a536ae4240a532249b6b5e7e641..57521c6a9ba0b2d66639017b09c541e270276323 100644
--- a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py
+++ b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py
@@ -45,6 +45,7 @@ from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging
 from tensorflow.python.util import nest
 
+
 class Plus1RNNCell(rnn_lib.RNNCell):
   """RNN Cell generating (output, new_state) = (input + 1, state + 1)."""
 
@@ -160,8 +161,7 @@ class RNNTest(test.TestCase):
     input_size = 5
     max_length = 8  # unrolled up to this length
     inputs = max_length * [
-        array_ops.placeholder(
-            dtypes.float32, shape=(batch_size, input_size))
+        array_ops.placeholder(dtypes.float32, shape=(batch_size, input_size))
     ]
     outputs, state = rnn.static_rnn(cell, inputs, dtype=dtypes.float32)
     self.assertEqual(len(outputs), len(inputs))
@@ -178,10 +178,9 @@ class RNNTest(test.TestCase):
         self.assertAllClose(v, input_value + 1.0)
 
       # Final state
-      self.assertAllClose(
-          values[-1],
-          max_length * np.ones(
-              (batch_size, input_size), dtype=np.float32))
+      self.assertAllClose(values[-1],
+                          max_length * np.ones(
+                              (batch_size, input_size), dtype=np.float32))
 
   def testDropout(self):
     cell = Plus1RNNCell()
@@ -191,8 +190,7 @@ class RNNTest(test.TestCase):
     input_size = 5
     max_length = 8
     inputs = max_length * [
-        array_ops.placeholder(
-            dtypes.float32, shape=(batch_size, input_size))
+        array_ops.placeholder(dtypes.float32, shape=(batch_size, input_size))
     ]
     with variable_scope.variable_scope("share_scope"):
       outputs, state = rnn.static_rnn(cell, inputs, dtype=dtypes.float32)
@@ -207,8 +205,10 @@ class RNNTest(test.TestCase):
     with self.test_session(use_gpu=True) as sess:
       input_value = np.random.randn(batch_size, input_size)
       values = sess.run(outputs + [state], feed_dict={inputs[0]: input_value})
-      full_dropout_values = sess.run(dropped_outputs,
-                                     feed_dict={inputs[0]: input_value})
+      full_dropout_values = sess.run(
+          dropped_outputs, feed_dict={
+              inputs[0]: input_value
+          })
 
       for v in values[:-1]:
         self.assertAllClose(v, input_value + 1.0)
@@ -222,8 +222,7 @@ class RNNTest(test.TestCase):
     input_size = 5
     max_length = 8
     inputs = max_length * [
-        array_ops.placeholder(
-            dtypes.float32, shape=(batch_size, input_size))
+        array_ops.placeholder(dtypes.float32, shape=(batch_size, input_size))
     ]
     with variable_scope.variable_scope("drop_scope"):
       dynamic_outputs, dynamic_state = rnn.static_rnn(
@@ -234,12 +233,16 @@ class RNNTest(test.TestCase):
       input_value = np.random.randn(batch_size, input_size)
       dynamic_values = sess.run(
           dynamic_outputs,
-          feed_dict={inputs[0]: input_value,
-                     sequence_length: [2, 3]})
+          feed_dict={
+              inputs[0]: input_value,
+              sequence_length: [2, 3]
+          })
       dynamic_state_value = sess.run(
           [dynamic_state],
-          feed_dict={inputs[0]: input_value,
-                     sequence_length: [2, 3]})
+          feed_dict={
+              inputs[0]: input_value,
+              sequence_length: [2, 3]
+          })
 
       # outputs are fully calculated for t = 0, 1
       for v in dynamic_values[:2]:
@@ -289,8 +292,7 @@ class RNNTest(test.TestCase):
       input_size = 5
       max_length = 8  # unrolled up to this length
       inputs = max_length * [
-          array_ops.placeholder(
-              dtypes.float32, shape=(batch_size, input_size))
+          array_ops.placeholder(dtypes.float32, shape=(batch_size, input_size))
       ]
       return rnn.static_rnn(cell, inputs, dtype=dtypes.float32, scope=scope)
 
@@ -316,8 +318,7 @@ class LSTMTest(test.TestCase):
       cell = rnn_cell.LSTMCell(
           num_units, initializer=initializer, state_is_tuple=False)
       inputs = max_length * [
-          array_ops.placeholder(
-              dtypes.float32, shape=(batch_size, input_size))
+          array_ops.placeholder(dtypes.float32, shape=(batch_size, input_size))
       ]
       outputs, _ = rnn.static_rnn(cell, inputs, dtype=dtypes.float32)
       self.assertEqual(len(outputs), len(inputs))
@@ -343,8 +344,7 @@ class LSTMTest(test.TestCase):
           initializer=initializer,
           state_is_tuple=False)
       inputs = max_length * [
-          array_ops.placeholder(
-              dtypes.float32, shape=(batch_size, input_size))
+          array_ops.placeholder(dtypes.float32, shape=(batch_size, input_size))
       ]
       outputs, _ = rnn.static_rnn(cell, inputs, dtype=dtypes.float32)
       self.assertEqual(len(outputs), len(inputs))
@@ -374,8 +374,7 @@ class LSTMTest(test.TestCase):
           initializer=initializer,
           state_is_tuple=False)
       inputs = max_length * [
-          array_ops.placeholder(
-              dtypes.float32, shape=(batch_size, input_size))
+          array_ops.placeholder(dtypes.float32, shape=(batch_size, input_size))
       ]
       with variable_scope.variable_scope("share_scope"):
         outputs, state = rnn.static_state_saving_rnn(
@@ -388,7 +387,9 @@ class LSTMTest(test.TestCase):
       input_value = np.random.randn(batch_size, input_size)
       (last_state_value, saved_state_value) = sess.run(
           [state, state_saver.saved_state["save_lstm"]],
-          feed_dict={inputs[0]: input_value})
+          feed_dict={
+              inputs[0]: input_value
+          })
       self.assertAllEqual(last_state_value, saved_state_value)
 
   def testNoProjNoShardingTupleStateSaver(self):
@@ -406,8 +407,7 @@ class LSTMTest(test.TestCase):
           initializer=initializer,
           state_is_tuple=True)
       inputs = max_length * [
-          array_ops.placeholder(
-              dtypes.float32, shape=(batch_size, input_size))
+          array_ops.placeholder(dtypes.float32, shape=(batch_size, input_size))
       ]
       with variable_scope.variable_scope("share_scope"):
         outputs, state = rnn.static_state_saving_rnn(
@@ -420,7 +420,9 @@ class LSTMTest(test.TestCase):
       input_value = np.random.randn(batch_size, input_size)
       last_and_saved_states = sess.run(
           state + (state_saver.saved_state["c"], state_saver.saved_state["m"]),
-          feed_dict={inputs[0]: input_value})
+          feed_dict={
+              inputs[0]: input_value
+          })
       self.assertEqual(4, len(last_and_saved_states))
       self.assertAllEqual(last_and_saved_states[:2], last_and_saved_states[2:])
 
@@ -432,16 +434,17 @@ class LSTMTest(test.TestCase):
     with self.test_session(graph=ops_lib.Graph()) as sess:
       initializer = init_ops.random_uniform_initializer(
           -0.01, 0.01, seed=self._seed)
-      state_saver = TestStateSaver(batch_size, {
-          "c0": num_units,
-          "m0": num_units,
-          "c1": num_units + 1,
-          "m1": num_units + 1,
-          "c2": num_units + 2,
-          "m2": num_units + 2,
-          "c3": num_units + 3,
-          "m3": num_units + 3
-      })
+      state_saver = TestStateSaver(
+          batch_size, {
+              "c0": num_units,
+              "m0": num_units,
+              "c1": num_units + 1,
+              "m1": num_units + 1,
+              "c2": num_units + 2,
+              "m2": num_units + 2,
+              "c3": num_units + 3,
+              "m3": num_units + 3
+          })
 
       def _cell(i):
         return rnn_cell.LSTMCell(
@@ -459,8 +462,7 @@ class LSTMTest(test.TestCase):
         self.assertEqual(len(cell.state_size[i]), 2)
 
       inputs = max_length * [
-          array_ops.placeholder(
-              dtypes.float32, shape=(batch_size, input_size))
+          array_ops.placeholder(dtypes.float32, shape=(batch_size, input_size))
       ]
 
       state_names = (("c0", "m0"), ("c1", "m1"), ("c2", "m2"), ("c3", "m3"))
@@ -475,10 +477,15 @@ class LSTMTest(test.TestCase):
 
       variables_lib.global_variables_initializer().run()
       input_value = np.random.randn(batch_size, input_size)
-      last_states = sess.run(list(nest.flatten(state)),
-                             feed_dict={inputs[0]: input_value})
-      saved_states = sess.run(list(state_saver.saved_state.values()),
-                              feed_dict={inputs[0]: input_value})
+      last_states = sess.run(
+          list(nest.flatten(state)), feed_dict={
+              inputs[0]: input_value
+          })
+      saved_states = sess.run(
+          list(state_saver.saved_state.values()),
+          feed_dict={
+              inputs[0]: input_value
+          })
       self.assertEqual(8, len(last_states))
       self.assertEqual(8, len(saved_states))
       flat_state_names = nest.flatten(state_names)
@@ -499,8 +506,7 @@ class LSTMTest(test.TestCase):
       initializer = init_ops.random_uniform_initializer(
           -0.01, 0.01, seed=self._seed)
       inputs = max_length * [
-          array_ops.placeholder(
-              dtypes.float32, shape=(None, input_size))
+          array_ops.placeholder(dtypes.float32, shape=(None, input_size))
       ]
       cell = rnn_cell.LSTMCell(
           num_units,
@@ -526,8 +532,7 @@ class LSTMTest(test.TestCase):
       initializer = init_ops.random_uniform_initializer(
           -0.01, 0.01, seed=self._seed)
       inputs = max_length * [
-          array_ops.placeholder(
-              dtypes.float32, shape=(None, input_size))
+          array_ops.placeholder(dtypes.float32, shape=(None, input_size))
       ]
       cell_notuple = rnn_cell.LSTMCell(
           num_units,
@@ -569,14 +574,20 @@ class LSTMTest(test.TestCase):
 
       variables_lib.global_variables_initializer().run()
       input_value = np.random.randn(batch_size, input_size)
-      outputs_notuple_v = sess.run(outputs_notuple,
-                                   feed_dict={inputs[0]: input_value})
-      outputs_tuple_v = sess.run(outputs_tuple,
-                                 feed_dict={inputs[0]: input_value})
+      outputs_notuple_v = sess.run(
+          outputs_notuple, feed_dict={
+              inputs[0]: input_value
+          })
+      outputs_tuple_v = sess.run(
+          outputs_tuple, feed_dict={
+              inputs[0]: input_value
+          })
       self.assertAllEqual(outputs_notuple_v, outputs_tuple_v)
 
-      (state_notuple_v,) = sess.run((state_notuple,),
-                                    feed_dict={inputs[0]: input_value})
+      (state_notuple_v,) = sess.run(
+          (state_notuple,), feed_dict={
+              inputs[0]: input_value
+          })
       state_tuple_v = sess.run(state_tuple, feed_dict={inputs[0]: input_value})
       self.assertAllEqual(state_notuple_v, np.hstack(state_tuple_v))
 
@@ -593,8 +604,7 @@ class LSTMTest(test.TestCase):
           -0.01, 0.01, seed=self._seed)
 
       inputs = max_length * [
-          array_ops.placeholder(
-              dtypes.float32, shape=(None, input_size))
+          array_ops.placeholder(dtypes.float32, shape=(None, input_size))
       ]
 
       cell = rnn_cell.LSTMCell(
@@ -625,8 +635,7 @@ class LSTMTest(test.TestCase):
     with self.test_session(use_gpu=True, graph=ops_lib.Graph()) as sess:
       initializer = init_ops.random_uniform_initializer(-1, 1, seed=self._seed)
       inputs = max_length * [
-          array_ops.placeholder(
-              dtypes.float64, shape=(None, input_size))
+          array_ops.placeholder(dtypes.float64, shape=(None, input_size))
       ]
 
       cell = rnn_cell.LSTMCell(
@@ -661,8 +670,7 @@ class LSTMTest(test.TestCase):
     max_length = 8
     with self.test_session(use_gpu=True, graph=ops_lib.Graph()) as sess:
       inputs = max_length * [
-          array_ops.placeholder(
-              dtypes.float32, shape=(None, input_size))
+          array_ops.placeholder(dtypes.float32, shape=(None, input_size))
       ]
       initializer = init_ops.constant_initializer(0.001)
 
@@ -721,8 +729,7 @@ class LSTMTest(test.TestCase):
       initializer = init_ops.random_uniform_initializer(
           -0.01, 0.01, seed=self._seed)
       inputs = max_length * [
-          array_ops.placeholder(
-              dtypes.float64, shape=(None, input_size))
+          array_ops.placeholder(dtypes.float64, shape=(None, input_size))
       ]
 
       cell = rnn_cell.LSTMCell(
@@ -743,16 +750,21 @@ class LSTMTest(test.TestCase):
 
       self.assertEqual(len(outputs), len(inputs))
 
-      variables_lib.global_variables_initializer().run(
-          feed_dict={sequence_length: [2, 3]})
+      variables_lib.global_variables_initializer().run(feed_dict={
+          sequence_length: [2, 3]
+      })
       input_value = np.asarray(
           np.random.randn(batch_size, input_size), dtype=np.float64)
       values = sess.run(
-          outputs, feed_dict={inputs[0]: input_value,
-                              sequence_length: [2, 3]})
+          outputs, feed_dict={
+              inputs[0]: input_value,
+              sequence_length: [2, 3]
+          })
       state_value = sess.run(
-          [state], feed_dict={inputs[0]: input_value,
-                              sequence_length: [2, 3]})
+          [state], feed_dict={
+              inputs[0]: input_value,
+              sequence_length: [2, 3]
+          })
       self.assertEqual(values[0].dtype, input_value.dtype)
       self.assertEqual(state_value[0].dtype, input_value.dtype)
 
@@ -767,8 +779,7 @@ class LSTMTest(test.TestCase):
       initializer_d = init_ops.random_uniform_initializer(
           -1, 1, seed=self._seed + 1)
       inputs = max_length * [
-          array_ops.placeholder(
-              dtypes.float32, shape=(None, input_size))
+          array_ops.placeholder(dtypes.float32, shape=(None, input_size))
       ]
       cell = rnn_cell.LSTMCell(
           num_units,
@@ -792,8 +803,10 @@ class LSTMTest(test.TestCase):
 
       variables_lib.global_variables_initializer().run()
       input_value = np.random.randn(batch_size, input_size)
-      output_values = sess.run(outputs0 + outputs1 + outputs2,
-                               feed_dict={inputs[0]: input_value})
+      output_values = sess.run(
+          outputs0 + outputs1 + outputs2, feed_dict={
+              inputs[0]: input_value
+          })
       outputs0_values = output_values[:max_length]
       outputs1_values = output_values[max_length:2 * max_length]
       outputs2_values = output_values[2 * max_length:]
@@ -814,8 +827,7 @@ class LSTMTest(test.TestCase):
     with self.test_session(graph=ops_lib.Graph()) as sess:
       initializer = init_ops.random_uniform_initializer(-1, 1, seed=self._seed)
       inputs = max_length * [
-          array_ops.placeholder(
-              dtypes.float32, shape=(None, input_size))
+          array_ops.placeholder(dtypes.float32, shape=(None, input_size))
       ]
       cell = rnn_cell.LSTMCell(
           num_units,
@@ -833,8 +845,10 @@ class LSTMTest(test.TestCase):
 
       variables_lib.global_variables_initializer().run()
       input_value = np.random.randn(batch_size, input_size)
-      output_values = sess.run(outputs0 + outputs1,
-                               feed_dict={inputs[0]: input_value})
+      output_values = sess.run(
+          outputs0 + outputs1, feed_dict={
+              inputs[0]: input_value
+          })
       outputs0_values = output_values[:max_length]
       outputs1_values = output_values[max_length:]
       self.assertEqual(len(outputs0_values), len(outputs1_values))
@@ -861,8 +875,7 @@ class LSTMTest(test.TestCase):
           -0.01, 0.01, seed=self._seed)
       if in_graph_mode:
         inputs = max_length * [
-            array_ops.placeholder(
-                dtypes.float32, shape=(None, input_size))
+            array_ops.placeholder(dtypes.float32, shape=(None, input_size))
         ]
       else:
         inputs = max_length * [
@@ -939,8 +952,7 @@ class LSTMTest(test.TestCase):
           -0.01, 0.01, seed=self._seed)
       if in_graph_mode:
         inputs = max_length * [
-            array_ops.placeholder(
-                dtypes.float32, shape=(None, input_size))
+            array_ops.placeholder(dtypes.float32, shape=(None, input_size))
         ]
       else:
         inputs = max_length * [
@@ -1100,8 +1112,8 @@ class LSTMTest(test.TestCase):
         # Test gradients to inputs and variables w.r.t. outputs & final state
         static_grad_values = sess.run(static_gradients, feed_dict=feeds)
 
-        static_individual_grad_values = sess.run(static_individual_gradients,
-                                                 feed_dict=feeds)
+        static_individual_grad_values = sess.run(
+            static_individual_gradients, feed_dict=feeds)
 
         static_individual_var_grad_values = sess.run(
             static_individual_variable_gradients, feed_dict=feeds)
@@ -1148,8 +1160,10 @@ class LSTMTest(test.TestCase):
         # Generate gradients of several individual outputs w.r.t. inputs
         dynamic_individual_gradients = nest.flatten([
             gradients_impl.gradients(y, [concat_inputs])
-            for y in
-            [split_outputs_dynamic[0], split_outputs_dynamic[-1], state_dynamic]
+            for y in [
+                split_outputs_dynamic[0], split_outputs_dynamic[-1],
+                state_dynamic
+            ]
         ])
 
         # Generate gradients of individual variables w.r.t. inputs
@@ -1159,8 +1173,10 @@ class LSTMTest(test.TestCase):
             "Count of trainable variables: %d" % len(trainable_variables))
         dynamic_individual_variable_gradients = nest.flatten([
             gradients_impl.gradients(y, trainable_variables)
-            for y in
-            [split_outputs_dynamic[0], split_outputs_dynamic[-1], state_dynamic]
+            for y in [
+                split_outputs_dynamic[0], split_outputs_dynamic[-1],
+                state_dynamic
+            ]
         ])
 
         # Test forward pass
@@ -1170,8 +1186,8 @@ class LSTMTest(test.TestCase):
         # Test gradients to inputs and variables w.r.t. outputs & final state
         dynamic_grad_values = sess.run(dynamic_gradients, feed_dict=feeds)
 
-        dynamic_individual_grad_values = sess.run(dynamic_individual_gradients,
-                                                  feed_dict=feeds)
+        dynamic_individual_grad_values = sess.run(
+            dynamic_individual_gradients, feed_dict=feeds)
 
         dynamic_individual_var_grad_values = sess.run(
             dynamic_individual_variable_gradients, feed_dict=feeds)
@@ -1207,8 +1223,8 @@ class LSTMTest(test.TestCase):
       for i, (a, b) in enumerate(
           zip(static_individual_var_grad_values,
               dynamic_individual_var_grad_values)):
-        tf_logging.info("Comparing individual variable gradients iteration %d" %
-                        i)
+        tf_logging.info(
+            "Comparing individual variable gradients iteration %d" % i)
         self.assertAllEqual(a, b)
 
   @test_util.run_in_graph_and_eager_modes()
@@ -1223,10 +1239,7 @@ class BidirectionalRNNTest(test.TestCase):
     self._seed = 23489
     np.random.seed(self._seed)
 
-  def _createBidirectionalRNN(self,
-                              use_shape,
-                              use_sequence_length,
-                              scope=None):
+  def _createBidirectionalRNN(self, use_shape, use_sequence_length, scope=None):
     num_units = 3
     input_size = 5
     batch_size = 2
@@ -1270,8 +1283,10 @@ class BidirectionalRNNTest(test.TestCase):
       # Run with pre-specified sequence length of 2, 3
       out, s_fw, s_bw = sess.run(
           [outputs, state_fw, state_bw],
-          feed_dict={inputs[0]: input_value,
-                     sequence_length: [2, 3]})
+          feed_dict={
+              inputs[0]: input_value,
+              sequence_length: [2, 3]
+          })
 
       # Since the forward and backward LSTM cells were initialized with the
       # same parameters, the forward and backward output has to be the same,
@@ -1312,8 +1327,10 @@ class BidirectionalRNNTest(test.TestCase):
       input_value, inputs, outputs, state_fw, state_bw, _ = (
           self._createBidirectionalRNN(use_shape, False))
       variables_lib.global_variables_initializer().run()
-      out, s_fw, s_bw = sess.run([outputs, state_fw, state_bw],
-                                 feed_dict={inputs[0]: input_value})
+      out, s_fw, s_bw = sess.run(
+          [outputs, state_fw, state_bw], feed_dict={
+              inputs[0]: input_value
+          })
 
       # Since the forward and backward LSTM cells were initialized with the
       # same parameters, the forward and backward output has to be the same,
@@ -1396,13 +1413,11 @@ class BidirectionalRNNTest(test.TestCase):
                                    use_time_major, use_sequence_length):
     with self.test_session(use_gpu=True, graph=ops_lib.Graph()) as sess:
       input_value, inputs, outputs, state_fw, state_bw, sequence_length = (
-          self._createBidirectionalDynamicRNN(use_shape,
-                                              use_state_tuple, use_time_major,
-                                              use_sequence_length))
+          self._createBidirectionalDynamicRNN(
+              use_shape, use_state_tuple, use_time_major, use_sequence_length))
       variables_lib.global_variables_initializer().run()
       # Run with pre-specified sequence length of 2, 3
-      feed_dict = (
-          {sequence_length: [2, 3]} if use_sequence_length else {})
+      feed_dict = ({sequence_length: [2, 3]} if use_sequence_length else {})
       feed_dict.update({inputs[0]: input_value})
       if use_state_tuple:
         out, c_fw, m_fw, c_bw, m_bw = sess.run(
@@ -1538,8 +1553,7 @@ class MultiDimensionalLSTMTest(test.TestCase):
     sequence_length = [4, 6]
     with self.test_session(graph=ops_lib.Graph()) as sess:
       inputs = max_length * [
-          array_ops.placeholder(
-              dtypes.float32, shape=(None,) + input_size)
+          array_ops.placeholder(dtypes.float32, shape=(None,) + input_size)
       ]
       inputs_using_dim = max_length * [
           array_ops.placeholder(
@@ -1585,14 +1599,22 @@ class MultiDimensionalLSTMTest(test.TestCase):
 
       input_total_size = (batch_size,) + input_size
       input_value = np.random.randn(*input_total_size)
-      outputs_static_v = sess.run(outputs_static,
-                                  feed_dict={inputs[0]: input_value})
-      outputs_dynamic_v = sess.run(outputs_dynamic,
-                                   feed_dict={inputs[0]: input_value})
-      outputs_bid_v = sess.run(outputs_bid,
-                               feed_dict={inputs_using_dim[0]: input_value})
-      outputs_sav_v = sess.run(outputs_sav,
-                               feed_dict={inputs_using_dim[0]: input_value})
+      outputs_static_v = sess.run(
+          outputs_static, feed_dict={
+              inputs[0]: input_value
+          })
+      outputs_dynamic_v = sess.run(
+          outputs_dynamic, feed_dict={
+              inputs[0]: input_value
+          })
+      outputs_bid_v = sess.run(
+          outputs_bid, feed_dict={
+              inputs_using_dim[0]: input_value
+          })
+      outputs_sav_v = sess.run(
+          outputs_sav, feed_dict={
+              inputs_using_dim[0]: input_value
+          })
 
       self.assertAllEqual(outputs_static_v, outputs_dynamic_v)
       self.assertAllEqual(outputs_static_v, outputs_sav_v)
@@ -1602,16 +1624,26 @@ class MultiDimensionalLSTMTest(test.TestCase):
       outputs_bid_array = np.array(outputs_bid_v)
       self.assertAllEqual(outputs_static_array_double, outputs_bid_array)
 
-      state_static_v = sess.run(state_static,
-                                feed_dict={inputs[0]: input_value})
-      state_dynamic_v = sess.run(state_dynamic,
-                                 feed_dict={inputs[0]: input_value})
-      state_bid_fw_v = sess.run(state_fw,
-                                feed_dict={inputs_using_dim[0]: input_value})
-      state_bid_bw_v = sess.run(state_bw,
-                                feed_dict={inputs_using_dim[0]: input_value})
-      state_sav_v = sess.run(state_sav,
-                             feed_dict={inputs_using_dim[0]: input_value})
+      state_static_v = sess.run(
+          state_static, feed_dict={
+              inputs[0]: input_value
+          })
+      state_dynamic_v = sess.run(
+          state_dynamic, feed_dict={
+              inputs[0]: input_value
+          })
+      state_bid_fw_v = sess.run(
+          state_fw, feed_dict={
+              inputs_using_dim[0]: input_value
+          })
+      state_bid_bw_v = sess.run(
+          state_bw, feed_dict={
+              inputs_using_dim[0]: input_value
+          })
+      state_sav_v = sess.run(
+          state_sav, feed_dict={
+              inputs_using_dim[0]: input_value
+          })
       self.assertAllEqual(np.hstack(state_static_v), np.hstack(state_dynamic_v))
       self.assertAllEqual(np.hstack(state_static_v), np.hstack(state_sav_v))
       self.assertAllEqual(np.hstack(state_static_v), np.hstack(state_bid_fw_v))
@@ -1633,16 +1665,17 @@ class NestedLSTMTest(test.TestCase):
     with self.test_session(graph=ops_lib.Graph()) as sess:
       state_saver = TestStateSaver(batch_size, state_size)
       single_input = (array_ops.placeholder(
-          dtypes.float32, shape=(None, input_size)), array_ops.placeholder(
-              dtypes.float32, shape=(None, input_size)))
+          dtypes.float32, shape=(None, input_size)),
+                      array_ops.placeholder(
+                          dtypes.float32, shape=(None, input_size)))
       inputs = max_length * [single_input]
       inputs_c = (array_ops.stack([input_[0] for input_ in inputs]),
                   array_ops.stack([input_[1] for input_ in inputs]))
-      single_input_using_dim = (
-          array_ops.placeholder(
-              dtypes.float32, shape=(batch_size, input_size)),
-          array_ops.placeholder(
-              dtypes.float32, shape=(batch_size, input_size)))
+      single_input_using_dim = (array_ops.placeholder(
+          dtypes.float32, shape=(batch_size, input_size)),
+                                array_ops.placeholder(
+                                    dtypes.float32,
+                                    shape=(batch_size, input_size)))
       inputs_using_dim = max_length * [single_input_using_dim]
 
       # Create a cell for the whole test. This is fine because the cell has no
@@ -1688,14 +1721,22 @@ class NestedLSTMTest(test.TestCase):
       input_total_size = (batch_size, input_size)
       input_value = (np.random.randn(*input_total_size),
                      np.random.randn(*input_total_size))
-      outputs_dynamic_v = sess.run(outputs_dynamic,
-                                   feed_dict={single_input: input_value})
-      outputs_static_v = sess.run(outputs_static,
-                                  feed_dict={single_input: input_value})
-      outputs_sav_v = sess.run(outputs_sav,
-                               feed_dict={single_input_using_dim: input_value})
-      outputs_bid_v = sess.run(outputs_bid,
-                               feed_dict={single_input_using_dim: input_value})
+      outputs_dynamic_v = sess.run(
+          outputs_dynamic, feed_dict={
+              single_input: input_value
+          })
+      outputs_static_v = sess.run(
+          outputs_static, feed_dict={
+              single_input: input_value
+          })
+      outputs_sav_v = sess.run(
+          outputs_sav, feed_dict={
+              single_input_using_dim: input_value
+          })
+      outputs_bid_v = sess.run(
+          outputs_bid, feed_dict={
+              single_input_using_dim: input_value
+          })
 
       self.assertAllEqual(outputs_static_v,
                           np.transpose(outputs_dynamic_v, (1, 0, 2, 3)))
@@ -1706,16 +1747,26 @@ class NestedLSTMTest(test.TestCase):
       outputs_bid_array = np.array(outputs_bid_v)
       self.assertAllEqual(outputs_static_array_double, outputs_bid_array)
 
-      state_dynamic_v = sess.run(state_dynamic,
-                                 feed_dict={single_input: input_value})
-      state_static_v = sess.run(state_static,
-                                feed_dict={single_input: input_value})
-      state_bid_fw_v = sess.run(state_fw,
-                                feed_dict={single_input_using_dim: input_value})
-      state_bid_bw_v = sess.run(state_bw,
-                                feed_dict={single_input_using_dim: input_value})
-      state_sav_v = sess.run(state_sav,
-                             feed_dict={single_input_using_dim: input_value})
+      state_dynamic_v = sess.run(
+          state_dynamic, feed_dict={
+              single_input: input_value
+          })
+      state_static_v = sess.run(
+          state_static, feed_dict={
+              single_input: input_value
+          })
+      state_bid_fw_v = sess.run(
+          state_fw, feed_dict={
+              single_input_using_dim: input_value
+          })
+      state_bid_bw_v = sess.run(
+          state_bw, feed_dict={
+              single_input_using_dim: input_value
+          })
+      state_sav_v = sess.run(
+          state_sav, feed_dict={
+              single_input_using_dim: input_value
+          })
       self.assertAllEqual(np.hstack(state_static_v), np.hstack(state_dynamic_v))
       self.assertAllEqual(np.hstack(state_static_v), np.hstack(state_sav_v))
       self.assertAllEqual(np.hstack(state_static_v), np.hstack(state_bid_fw_v))
@@ -1764,8 +1815,7 @@ class StateSaverRNNTest(test.TestCase):
           initializer=initializer,
           state_is_tuple=False)
       inputs = max_length * [
-          array_ops.placeholder(
-              dtypes.float32, shape=(batch_size, input_size))
+          array_ops.placeholder(dtypes.float32, shape=(batch_size, input_size))
       ]
       return rnn.static_state_saving_rnn(
           cell,
@@ -1931,8 +1981,10 @@ class RawRNNTest(test.TestCase):
       (outputs_val, outputs_dynamic_rnn_val, final_state_val,
        final_state_dynamic_rnn_val) = sess.run(
            [outputs, outputs_dynamic_rnn, final_state, final_state_dynamic_rnn],
-           feed_dict={inputs: rand_input,
-                      sequence_length: rand_seq_len})
+           feed_dict={
+               inputs: rand_input,
+               sequence_length: rand_seq_len
+           })
 
       self.assertAllClose(outputs_dynamic_rnn_val, outputs_val)
       self.assertAllClose(final_state_dynamic_rnn_val, final_state_val)
@@ -1945,12 +1997,16 @@ class RawRNNTest(test.TestCase):
         self.assertEqual(len(gradients), len(gradients_dynamic_rnn))
         gradients_val = sess.run(
             gradients,
-            feed_dict={inputs: rand_input,
-                       sequence_length: rand_seq_len})
+            feed_dict={
+                inputs: rand_input,
+                sequence_length: rand_seq_len
+            })
         gradients_dynamic_rnn_val = sess.run(
             gradients_dynamic_rnn,
-            feed_dict={inputs: rand_input,
-                       sequence_length: rand_seq_len})
+            feed_dict={
+                inputs: rand_input,
+                sequence_length: rand_seq_len
+            })
         self.assertEqual(len(gradients_val), len(gradients_dynamic_rnn_val))
         input_gradients_val = gradients_val[0]
         input_gradients_dynamic_rnn_val = gradients_dynamic_rnn_val[0]
@@ -2067,14 +2123,13 @@ class RawRNNTest(test.TestCase):
 
       def loop_fn(time_, cell_output, cell_state, _):
         if cell_output is None:
-          emit_output = (array_ops.zeros(
-              [2, 3], dtype=dtypes.int32), array_ops.zeros(
-                  [unknown_dim], dtype=dtypes.int64))
+          emit_output = (array_ops.zeros([2, 3], dtype=dtypes.int32),
+                         array_ops.zeros([unknown_dim], dtype=dtypes.int64))
           next_state = cell.zero_state(batch_size, dtypes.float32)
         else:
-          emit_output = (array_ops.ones(
-              [batch_size, 2, 3], dtype=dtypes.int32), array_ops.ones(
-                  [batch_size, unknown_dim], dtype=dtypes.int64))
+          emit_output = (array_ops.ones([batch_size, 2, 3], dtype=dtypes.int32),
+                         array_ops.ones(
+                             [batch_size, unknown_dim], dtype=dtypes.int64))
           next_state = cell_state
         elements_finished = array_ops.tile([time_ >= max_time], [batch_size])
         finished = math_ops.reduce_all(elements_finished)
@@ -2193,8 +2248,8 @@ class TensorArrayOnCorrectDeviceTest(test.TestCase):
 
     cell = rnn_cell.LSTMCell(num_units, use_peepholes=True)
     gpu_cell = DeviceWrapperCell(cell, cell_device)
-    inputs = np.random.randn(batch_size, time_steps,
-                             input_size).astype(np.float32)
+    inputs = np.random.randn(batch_size, time_steps, input_size).astype(
+        np.float32)
     sequence_length = np.random.randint(0, time_steps, size=batch_size)
 
     if input_device is not None:
@@ -2262,8 +2317,7 @@ class TensorArrayOnCorrectDeviceTest(test.TestCase):
 
     gpu_dev = test.gpu_device_name()
     run_metadata = self._execute_rnn_on(
-        rnn_device="/cpu:0", cell_device="/cpu:0",
-        input_device=gpu_dev)
+        rnn_device="/cpu:0", cell_device="/cpu:0", input_device=gpu_dev)
     cpu_stats, gpu_stats = self._retrieve_cpu_gpu_stats(run_metadata)
 
     def _assert_in(op_str, in_stats, out_stats):
@@ -2278,8 +2332,7 @@ class TensorArrayOnCorrectDeviceTest(test.TestCase):
       return  # Test requires access to a GPU
 
     gpu_dev = test.gpu_device_name()
-    run_metadata = self._execute_rnn_on(
-        input_device=gpu_dev)
+    run_metadata = self._execute_rnn_on(input_device=gpu_dev)
     cpu_stats, gpu_stats = self._retrieve_cpu_gpu_stats(run_metadata)
 
     def _assert_in(op_str, in_stats, out_stats):
diff --git a/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py b/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py
index 8a3894ef9d7042e66b52edefdf08b278dcc6c4f4..7b883ebc5d7756f1bdf445f900500a4b89e6cffd 100644
--- a/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py
+++ b/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py
@@ -1545,97 +1545,6 @@ class BenchmarkLSTMCellXLA(test.Benchmark):
         ]))
 
 
-class WeightNormLSTMCellTest(test.TestCase):
-  """Compared cell output with pre-calculated values."""
-
-  def _cell_output(self, cell):
-    """Calculate cell output"""
-
-    with self.test_session() as sess:
-      init = init_ops.constant_initializer(0.5)
-      with variable_scope.variable_scope("root", initializer=init):
-        x = array_ops.zeros([1, 2])
-        c0 = array_ops.zeros([1, 2])
-        h0 = array_ops.zeros([1, 2])
-
-        state0 = rnn_cell.LSTMStateTuple(c0, h0)
-
-        xout, sout = cell()(x, state0)
-
-      sess.run([variables.global_variables_initializer()])
-      res = sess.run(
-          [xout, sout], {
-              x.name: np.array([[1., 1.]]),
-              c0.name: 0.1 * np.asarray([[0, 1]]),
-              h0.name: 0.1 * np.asarray([[2, 3]]),
-          })
-
-    actual_state_c = res[1].c
-    actual_state_h = res[1].h
-
-    return actual_state_c, actual_state_h
-
-  def testBasicCell(self):
-    """Tests cell w/o peepholes and w/o normalisation"""
-
-    def cell():
-      return contrib_rnn_cell.WeightNormLSTMCell(
-          2, norm=False, use_peepholes=False)
-
-    actual_c, actual_h = self._cell_output(cell)
-
-    expected_c = np.array([[0.65937078, 0.74983585]])
-    expected_h = np.array([[0.44923624, 0.49362513]])
-
-    self.assertAllClose(expected_c, actual_c, 1e-5)
-    self.assertAllClose(expected_h, actual_h, 1e-5)
-
-  def testNonbasicCell(self):
-    """Tests cell with peepholes and w/o normalisation"""
-
-    def cell():
-      return contrib_rnn_cell.WeightNormLSTMCell(
-          2, norm=False, use_peepholes=True)
-
-    actual_c, actual_h = self._cell_output(cell)
-
-    expected_c = np.array([[0.65937084, 0.7574988]])
-    expected_h = np.array([[0.4792085, 0.53470564]])
-
-    self.assertAllClose(expected_c, actual_c, 1e-5)
-    self.assertAllClose(expected_h, actual_h, 1e-5)
-
-  def testBasicCellWithNorm(self):
-    """Tests cell w/o peepholes and with normalisation"""
-
-    def cell():
-      return contrib_rnn_cell.WeightNormLSTMCell(
-          2, norm=True, use_peepholes=False)
-
-    actual_c, actual_h = self._cell_output(cell)
-
-    expected_c = np.array([[0.50125383, 0.58805949]])
-    expected_h = np.array([[0.32770363, 0.37397948]])
-
-    self.assertAllClose(expected_c, actual_c, 1e-5)
-    self.assertAllClose(expected_h, actual_h, 1e-5)
-
-  def testNonBasicCellWithNorm(self):
-    """Tests cell with peepholes and with normalisation"""
-
-    def cell():
-      return contrib_rnn_cell.WeightNormLSTMCell(
-          2, norm=True, use_peepholes=True)
-
-    actual_c, actual_h = self._cell_output(cell)
-
-    expected_c = np.array([[0.50125383, 0.59587258]])
-    expected_h = np.array([[0.35041603, 0.40873795]])
-
-    self.assertAllClose(expected_c, actual_c, 1e-5)
-    self.assertAllClose(expected_h, actual_h, 1e-5)
-
-
 class WeightNormLSTMCellTest(test.TestCase):
   """Compared cell output with pre-calculated values."""
 
diff --git a/tensorflow/contrib/rnn/python/ops/rnn_cell.py b/tensorflow/contrib/rnn/python/ops/rnn_cell.py
index 8adf5dce6ec76d8ac4f182929e0dfc81be946277..5fee2e93e4e575f647b3e6f132a57c57056726ad 100644
--- a/tensorflow/contrib/rnn/python/ops/rnn_cell.py
+++ b/tensorflow/contrib/rnn/python/ops/rnn_cell.py
@@ -2729,25 +2729,9 @@ class SRUCell(rnn_cell_impl._LayerRNNCell):
 
     input_depth = inputs_shape[1].value
 
-    # Here the contributor believes that the following constraints
-    # are implied. The reasoning is explained here with reference to
-    # the paper https://arxiv.org/pdf/1709.02755.pdf upon which this
-    # implementation is based.
-    # In section 2.1 Equation 5, specifically:
-    # h_t = r_t \odot g(c_t) + (1 - r_t) \odot x_t
-    # the pointwise operation between r_t and x_t means they have
-    # the same shape (since we are implementing an RNN cell, braodcasting
-    # does not happen to input of a single timestep); by the same
-    # reasons, x_t has the same shape as h_t, essentially mandating that
-    # input_depth = unit_num.
-    if input_depth != self._num_units:
-      raise ValueError("SRU requires input_depth == num_units, got "
-                       "input_depth = %s, num_units = %s" % (input_depth,
-                                                             self._num_units))
-
     self._kernel = self.add_variable(
         rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
-        shape=[input_depth, 3 * self._num_units])
+        shape=[input_depth, 4 * self._num_units])
 
     self._bias = self.add_variable(
         rnn_cell_impl._BIAS_VARIABLE_NAME,
@@ -2760,8 +2744,8 @@ class SRUCell(rnn_cell_impl._LayerRNNCell):
     """Simple recurrent unit (SRU) with num_units cells."""
 
     U = math_ops.matmul(inputs, self._kernel)
-    x_bar, f_intermediate, r_intermediate = array_ops.split(
-        value=U, num_or_size_splits=3, axis=1)
+    x_bar, f_intermediate, r_intermediate, x_tx = array_ops.split(
+        value=U, num_or_size_splits=4, axis=1)
 
     f_r = math_ops.sigmoid(
         nn_ops.bias_add(
@@ -2769,7 +2753,7 @@ class SRUCell(rnn_cell_impl._LayerRNNCell):
     f, r = array_ops.split(value=f_r, num_or_size_splits=2, axis=1)
 
     c = f * state + (1.0 - f) * x_bar
-    h = r * self._activation(c) + (1.0 - r) * inputs
+    h = r * self._activation(c) + (1.0 - r) * x_tx
 
     return h, c
 
diff --git a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
index 95dea312f3a4e77176a4bc4af290ad48c078deda..d6b5eceb479f98d2076f60f0b75e31fd43eadefa 100644
--- a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
+++ b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
@@ -924,8 +924,7 @@ class LuongMonotonicAttention(_BaseMonotonicAttentionMechanism):
         _monotonic_probability_fn, sigmoid_noise=sigmoid_noise, mode=mode,
         seed=sigmoid_noise_seed)
     super(LuongMonotonicAttention, self).__init__(
-        query_layer=layers_core.Dense(
-            num_units, name="query_layer", use_bias=False, dtype=dtype),
+        query_layer=None,
         memory_layer=layers_core.Dense(
             num_units, name="memory_layer", use_bias=False, dtype=dtype),
         memory=memory,
diff --git a/tensorflow/contrib/seq2seq/python/ops/helper.py b/tensorflow/contrib/seq2seq/python/ops/helper.py
index ef3722ee41bb0b49e5f81d4d6514e2f40d2ad9f1..3245cc5e72154289ea3ba000b9a30586a7ad03a9 100644
--- a/tensorflow/contrib/seq2seq/python/ops/helper.py
+++ b/tensorflow/contrib/seq2seq/python/ops/helper.py
@@ -184,6 +184,7 @@ class TrainingHelper(Helper):
     """
     with ops.name_scope(name, "TrainingHelper", [inputs, sequence_length]):
       inputs = ops.convert_to_tensor(inputs, name="inputs")
+      self._inputs = inputs
       if not time_major:
         inputs = nest.map_structure(_transpose_batch_time, inputs)
 
@@ -200,6 +201,14 @@ class TrainingHelper(Helper):
 
       self._batch_size = array_ops.size(sequence_length)
 
+  @property
+  def inputs(self):
+    return self._inputs
+
+  @property
+  def sequence_length(self):
+    return self._sequence_length
+
   @property
   def batch_size(self):
     return self._batch_size
diff --git a/tensorflow/contrib/session_bundle/bundle_shim.py b/tensorflow/contrib/session_bundle/bundle_shim.py
index 3149875e41f6f77b3bcbc0ab1a150cfdc59ad2ba..69db594f8ae52e608b34cff74650889aaf41a21e 100644
--- a/tensorflow/contrib/session_bundle/bundle_shim.py
+++ b/tensorflow/contrib/session_bundle/bundle_shim.py
@@ -82,7 +82,8 @@ def _convert_default_signature_to_signature_def(signatures):
   """
   default_signature = signatures.default_signature
   signature_def = meta_graph_pb2.SignatureDef()
-  if default_signature.WhichOneof("type") == legacy_constants.REGRESSION_SIGNATURE:
+  if (default_signature.WhichOneof("type") ==
+      legacy_constants.REGRESSION_SIGNATURE):
     regression_signature = default_signature.regression_signature
     signature_def.method_name = signature_constants.REGRESS_METHOD_NAME
     _add_input_to_signature_def(regression_signature.input.tensor_name,
@@ -91,7 +92,8 @@ def _convert_default_signature_to_signature_def(signatures):
     _add_output_to_signature_def(regression_signature.output.tensor_name,
                                  signature_constants.REGRESS_OUTPUTS,
                                  signature_def)
-  elif default_signature.WhichOneof("type") == legacy_constants.CLASSIFICATION_SIGNATURE:
+  elif (default_signature.WhichOneof("type") ==
+        legacy_constants.CLASSIFICATION_SIGNATURE):
     classification_signature = default_signature.classification_signature
     signature_def.method_name = signature_constants.CLASSIFY_METHOD_NAME
     _add_input_to_signature_def(classification_signature.input.tensor_name,
@@ -132,8 +134,10 @@ def _convert_named_signatures_to_signature_def(signatures):
       signature_constants.PREDICT_OUTPUTS]
   # TODO(pdudnik): what if there are other signatures? Mimic cr/140900781 once
   # it is submitted.
-  if (input_signature.WhichOneof("type") != legacy_constants.GENERIC_SIGNATURE or
-      output_signature.WhichOneof("type") != legacy_constants.GENERIC_SIGNATURE):
+  if (input_signature.WhichOneof("type") !=
+      legacy_constants.GENERIC_SIGNATURE or
+      output_signature.WhichOneof("type") !=
+      legacy_constants.GENERIC_SIGNATURE):
     raise RuntimeError("Named input and output signatures can only be "
                        "up-converted if they are generic signature. "
                        "Input signature type is %s, output signature type is "
diff --git a/tensorflow/contrib/session_bundle/bundle_shim_test.cc b/tensorflow/contrib/session_bundle/bundle_shim_test.cc
index 72f32a0f5554e4dd3e7cbf498a57ee6bfba57211..9a1dd9303f43591888dc49984d81c4a0c6af9846 100644
--- a/tensorflow/contrib/session_bundle/bundle_shim_test.cc
+++ b/tensorflow/contrib/session_bundle/bundle_shim_test.cc
@@ -493,17 +493,15 @@ TEST(BundleShimTest, DefaultAndNamedSignatureWithPredict) {
   ASSERT_FALSE(
       actual_signature_def_predict->second.inputs().find("foo-input") ==
       actual_signature_def_predict->second.inputs().end());
-  EXPECT_EQ("foo-input",
-            actual_signature_def_predict->second.inputs()
-                .find("foo-input")
-                ->second.name());
+  EXPECT_EQ("foo-input", actual_signature_def_predict->second.inputs()
+                             .find("foo-input")
+                             ->second.name());
   ASSERT_FALSE(
       actual_signature_def_predict->second.outputs().find("foo-output") ==
       actual_signature_def_predict->second.outputs().end());
-  EXPECT_EQ("foo-output",
-            actual_signature_def_predict->second.outputs()
-                .find("foo-output")
-                ->second.name());
+  EXPECT_EQ("foo-output", actual_signature_def_predict->second.outputs()
+                              .find("foo-output")
+                              ->second.name());
   EXPECT_EQ(kPredictMethodName,
             actual_signature_def_predict->second.method_name());
 }
diff --git a/tensorflow/contrib/session_bundle/exporter.py b/tensorflow/contrib/session_bundle/exporter.py
index f6f663aae766b783b85139f57a93e10f553e6bd1..08983337fccc138d40eb959cecc5bf9e47cf6cac 100644
--- a/tensorflow/contrib/session_bundle/exporter.py
+++ b/tensorflow/contrib/session_bundle/exporter.py
@@ -281,11 +281,12 @@ class Exporter(object):
     tmp_export_dir = compat.as_text(export_dir) + "-tmp"
     gfile.MakeDirs(tmp_export_dir)
 
-    self._saver.save(sess,
-                     os.path.join(
-                         compat.as_text(tmp_export_dir),
-                         compat.as_text(constants.EXPORT_BASE_NAME)),
-                     meta_graph_suffix=constants.EXPORT_SUFFIX_NAME)
+    self._saver.save(
+        sess,
+        os.path.join(
+            compat.as_text(tmp_export_dir),
+            compat.as_text(constants.EXPORT_BASE_NAME)),
+        meta_graph_suffix=constants.EXPORT_SUFFIX_NAME)
 
     # Run the asset callback.
     if self._assets_callback and self._assets_to_copy:
@@ -301,12 +302,12 @@ class Exporter(object):
     if exports_to_keep:
       # create a simple parser that pulls the export_version from the directory.
       def parser(path):
-        if os.name == 'nt':
-          match = re.match("^" + export_dir_base.replace('\\','/') + "/(\\d{8})$",
-                           path.path.replace('\\','/'))
+        if os.name == "nt":
+          match = re.match(
+              "^" + export_dir_base.replace("\\", "/") + "/(\\d{8})$",
+              path.path.replace("\\", "/"))
         else:
-          match = re.match("^" + export_dir_base + "/(\\d{8})$",
-                           path.path)
+          match = re.match("^" + export_dir_base + "/(\\d{8})$", path.path)
         if not match:
           return None
         return path._replace(export_version=int(match.group(1)))
diff --git a/tensorflow/contrib/session_bundle/signature.cc b/tensorflow/contrib/session_bundle/signature.cc
index 7133875ad53e77625bbe799f4f886c074a08f1bd..ed70a5b91b231067e8e69951ef7010406e6b22cf 100644
--- a/tensorflow/contrib/session_bundle/signature.cc
+++ b/tensorflow/contrib/session_bundle/signature.cc
@@ -38,9 +38,9 @@ namespace {
 Status BatchSizesMatch(const Tensor& input, const Tensor& output) {
   // Ensure the number of outputs match the number of inputs.
   if (input.dim_size(0) != output.dim_size(0)) {
-    return errors::Internal(
-        strings::StrCat("Input batch size did not match output batch size: ",
-                        input.dim_size(0), " vs. ", output.dim_size(0)));
+    return errors::Internal(strings::StrCat(
+        "Input batch size did not match output batch size: ", input.dim_size(0),
+        " vs. ", output.dim_size(0)));
   }
   return Status::OK();
 }
@@ -100,8 +100,8 @@ Status GetNamedClassificationSignature(
   const auto& it = signatures.named_signatures().find(name);
   if (it == signatures.named_signatures().end()) {
     return errors::NotFound(
-        strings::StrCat("Missing signature named \"", name, "\" in: ",
-                        DebugStringIfAvailable(signatures)));
+        strings::StrCat("Missing signature named \"", name,
+                        "\" in: ", DebugStringIfAvailable(signatures)));
   }
   if (!it->second.has_classification_signature()) {
     return errors::FailedPrecondition(
@@ -232,8 +232,8 @@ Status GetNamedSignature(const string& name,
   const auto& it = signatures.named_signatures().find(name);
   if (it == signatures.named_signatures().end()) {
     return errors::NotFound(
-        strings::StrCat("Missing signature named \"", name, "\" in: ",
-                        DebugStringIfAvailable(signatures)));
+        strings::StrCat("Missing signature named \"", name,
+                        "\" in: ", DebugStringIfAvailable(signatures)));
   }
   *signature = it->second;
   return Status::OK();
diff --git a/tensorflow/contrib/slim/python/slim/evaluation_test.py b/tensorflow/contrib/slim/python/slim/evaluation_test.py
index 870f504d10362ed5226951adefc3ba9a934900c1..f5a9299d263450ba89617f38bf7a4c5cbc359cb1 100644
--- a/tensorflow/contrib/slim/python/slim/evaluation_test.py
+++ b/tensorflow/contrib/slim/python/slim/evaluation_test.py
@@ -236,7 +236,7 @@ class SingleEvaluationTest(test.TestCase):
   def _prepareCheckpoint(self, checkpoint_path):
     init_op = control_flow_ops.group(variables.global_variables_initializer(),
                                      variables.local_variables_initializer())
-    saver = saver_lib.Saver(write_version=saver_pb2.SaverDef.V1)
+    saver = saver_lib.Saver()
     with self.test_session() as sess:
       sess.run(init_op)
       saver.save(sess, checkpoint_path)
diff --git a/tensorflow/contrib/slim/python/slim/learning_test.py b/tensorflow/contrib/slim/python/slim/learning_test.py
index 4e816f9b11be2986d042f336bdc320ff47d8cc49..831c6e427ae78932bec09cea935f05a87723f1a3 100644
--- a/tensorflow/contrib/slim/python/slim/learning_test.py
+++ b/tensorflow/contrib/slim/python/slim/learning_test.py
@@ -197,9 +197,7 @@ class MultiplyGradientsTest(test.TestCase):
     gradient = constant_op.constant(self._grad_vec, dtype=dtypes.float32)
     variable = variables_lib.Variable(array_ops.zeros_like(gradient))
     multiplier_flag = variables_lib.Variable(True)
-    tensor_multiplier = array_ops.where(multiplier_flag,
-                                        self._multiplier,
-                                        1.0)
+    tensor_multiplier = array_ops.where(multiplier_flag, self._multiplier, 1.0)
     grad_to_var = (gradient, variable)
     gradient_multipliers = {variable: tensor_multiplier}
 
@@ -212,11 +210,8 @@ class MultiplyGradientsTest(test.TestCase):
       sess.run(multiplier_flag.assign(False))
       gradient_false_flag = sess.run(grad_to_var[0])
     np_testing.assert_almost_equal(gradient_true_flag,
-                                   self._multiplied_grad_vec,
-                                   5)
-    np_testing.assert_almost_equal(gradient_false_flag,
-                                   self._grad_vec,
-                                   5)
+                                   self._multiplied_grad_vec, 5)
+    np_testing.assert_almost_equal(gradient_false_flag, self._grad_vec, 5)
 
 
 def LogisticClassifier(inputs):
@@ -502,6 +497,7 @@ class TrainTest(test.TestCase):
     purpose.
     """
     dump_root = tempfile.mkdtemp()
+
     def dumping_wrapper(sess):  # pylint: disable=invalid-name
       return dumping_wrapper_lib.DumpingDebugWrapperSession(sess, dump_root)
 
@@ -519,16 +515,13 @@ class TrainTest(test.TestCase):
       train_op = learning.create_train_op(total_loss, optimizer)
 
       loss = learning.train(
-          train_op,
-          None,
-          number_of_steps=1,
-          session_wrapper=dumping_wrapper)
+          train_op, None, number_of_steps=1, session_wrapper=dumping_wrapper)
     self.assertIsNotNone(loss)
 
     run_root = glob.glob(os.path.join(dump_root, 'run_*'))[-1]
     dump = debug_data.DebugDumpDir(run_root)
-    self.assertAllEqual(
-        0, dump.get_tensors('global_step', 0, 'DebugIdentity')[0])
+    self.assertAllEqual(0,
+                        dump.get_tensors('global_step', 0, 'DebugIdentity')[0])
 
   def testTrainWithTrace(self):
     logdir = os.path.join(
@@ -961,8 +954,8 @@ class TrainTest(test.TestCase):
     self.assertGreater(losses[0], losses[1])
 
   def testTrainWithEpochLimit(self):
-    logdir = os.path.join(tempfile.mkdtemp(prefix=self.get_temp_dir()),
-                          'tmp_logs')
+    logdir = os.path.join(
+        tempfile.mkdtemp(prefix=self.get_temp_dir()), 'tmp_logs')
     with ops.Graph().as_default():
       random_seed.set_random_seed(0)
       tf_inputs = constant_op.constant(self._inputs, dtype=dtypes.float32)
@@ -982,7 +975,8 @@ class TrainTest(test.TestCase):
     self.assertIsNotNone(loss)
     self.assertLess(loss, .015)
     self.assertTrue(os.path.isfile('{}/model.ckpt-300.index'.format(logdir)))
-    self.assertTrue(os.path.isfile('{}/model.ckpt-300.data-00000-of-00001'.format(logdir)))
+    self.assertTrue(
+        os.path.isfile('{}/model.ckpt-300.data-00000-of-00001'.format(logdir)))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/contrib/sparsemax/python/kernel_tests/sparsemax_loss_test.py b/tensorflow/contrib/sparsemax/python/kernel_tests/sparsemax_loss_test.py
index c8b4e472c99e0bf081a7222a7976b1fbbb680825..360e7dbe75f595ff61fb83379089294371203813 100644
--- a/tensorflow/contrib/sparsemax/python/kernel_tests/sparsemax_loss_test.py
+++ b/tensorflow/contrib/sparsemax/python/kernel_tests/sparsemax_loss_test.py
@@ -105,8 +105,8 @@ class SparsemaxLossTest(test.TestCase):
     tf_loss_op, tf_loss_out = self._tf_sparsemax_loss(z, q, dtype, use_gpu)
     np_loss = self._np_sparsemax_loss(z, q).astype(dtype)
 
-    self.assertAllCloseAccordingToType(np_loss, tf_loss_out,
-                                       half_atol=1e-2, half_rtol=5e-3)
+    self.assertAllCloseAccordingToType(
+        np_loss, tf_loss_out, half_atol=1e-2, half_rtol=5e-3)
     self.assertShapeEqual(np_loss, tf_loss_op)
 
   def _test_constant_add(self, dtype, random, use_gpu):
@@ -116,17 +116,17 @@ class SparsemaxLossTest(test.TestCase):
     q = np.zeros((test_obs, 10))
     q[np.arange(0, test_obs), np.random.randint(0, 10, size=test_obs)] = 1
 
-    _, tf_loss_zpc = self._tf_sparsemax_loss(
-        z + c, q, dtype, use_gpu
-    )
+    _, tf_loss_zpc = self._tf_sparsemax_loss(z + c, q, dtype, use_gpu)
 
-    _, tf_loss_z = self._tf_sparsemax_loss(
-        z, q, dtype, use_gpu
-    )
+    _, tf_loss_z = self._tf_sparsemax_loss(z, q, dtype, use_gpu)
 
-    self.assertAllCloseAccordingToType(tf_loss_zpc, tf_loss_z,
-                                       float_atol=5e-6, float_rtol=5e-6,
-                                       half_atol=1e-2, half_rtol=1e-2)
+    self.assertAllCloseAccordingToType(
+        tf_loss_zpc,
+        tf_loss_z,
+        float_atol=5e-6,
+        float_rtol=5e-6,
+        half_atol=1e-2,
+        half_rtol=1e-2)
 
   def _test_sparsemax_loss_positive(self, dtype, random, use_gpu):
     """check sparsemax-loss proposition 4"""
@@ -170,10 +170,7 @@ class SparsemaxLossTest(test.TestCase):
 
     with self.test_session(use_gpu=use_gpu):
       err = gradient_checker.compute_gradient_error(
-        logits, z.shape,
-        loss_op, (test_obs, ),
-        x_init_value=z, delta=1e-9
-      )
+          logits, z.shape, loss_op, (test_obs,), x_init_value=z, delta=1e-9)
 
     self.assertLess(err, 1e-4)
 
@@ -192,8 +189,8 @@ class SparsemaxLossTest(test.TestCase):
       tf_grad = loss_grad_op.eval()
       np_grad = self._np_sparsemax_loss_grad(z, q).astype(dtype)
 
-      self.assertAllCloseAccordingToType(np_grad, tf_grad,
-                                         half_atol=1e-2, half_rtol=5e-3)
+      self.assertAllCloseAccordingToType(
+          np_grad, tf_grad, half_atol=1e-2, half_rtol=5e-3)
       self.assertShapeEqual(np_grad, loss_grad_op)
 
   def _test_dtype(self, dtype):
@@ -220,5 +217,6 @@ class SparsemaxLossTest(test.TestCase):
   def testDouble(self):
     self._test_dtype('float64')
 
-if __name__ == "__main__":
+
+if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/sparsemax/python/kernel_tests/sparsemax_test.py b/tensorflow/contrib/sparsemax/python/kernel_tests/sparsemax_test.py
index 82d36ee9cb21fb822e6df0c3632c49a4fd616825..259e62bd864fba3cc7d9aa387e02c8319438d658 100644
--- a/tensorflow/contrib/sparsemax/python/kernel_tests/sparsemax_test.py
+++ b/tensorflow/contrib/sparsemax/python/kernel_tests/sparsemax_test.py
@@ -83,8 +83,8 @@ class SparsemaxTest(test.TestCase):
     tf_sparsemax_op, tf_sparsemax_out = self._tf_sparsemax(z, dtype, use_gpu)
     p_sparemax = self._np_sparsemax(z).astype(dtype)
 
-    self.assertAllCloseAccordingToType(p_sparemax, tf_sparsemax_out,
-                                       half_atol=5e-3)
+    self.assertAllCloseAccordingToType(
+        p_sparemax, tf_sparsemax_out, half_atol=5e-3)
     self.assertShapeEqual(p_sparemax, tf_sparsemax_op)
 
   def _test_sparsemax_of_zero(self, dtype, random, use_gpu):
@@ -111,9 +111,8 @@ class SparsemaxTest(test.TestCase):
     p_expected = np.zeros((test_obs, 10), dtype=dtype)
     p_expected[np.arange(0, test_obs), z_sort_arg[:, 0]] = 1
 
-    tf_sparsemax_op, tf_sparsemax_out = self._tf_sparsemax(
-        (1 / epsilon) * z, dtype, use_gpu
-    )
+    tf_sparsemax_op, tf_sparsemax_out = self._tf_sparsemax((1 / epsilon) * z,
+                                                           dtype, use_gpu)
 
     self.assertAllCloseAccordingToType(p_expected, tf_sparsemax_out)
     self.assertShapeEqual(p_expected, tf_sparsemax_op)
@@ -123,16 +122,12 @@ class SparsemaxTest(test.TestCase):
     z = random.uniform(low=-3, high=3, size=(test_obs, 10)).astype(dtype)
     c = random.uniform(low=-3, high=3, size=(test_obs, 1)).astype(dtype)
 
-    _, tf_sparsemax_zpc = self._tf_sparsemax(
-        z + c, dtype, use_gpu
-    )
+    _, tf_sparsemax_zpc = self._tf_sparsemax(z + c, dtype, use_gpu)
 
-    _, tf_sparsemax_z = self._tf_sparsemax(
-        z, dtype, use_gpu
-    )
+    _, tf_sparsemax_z = self._tf_sparsemax(z, dtype, use_gpu)
 
-    self.assertAllCloseAccordingToType(tf_sparsemax_zpc, tf_sparsemax_z,
-                                       half_atol=5e-3)
+    self.assertAllCloseAccordingToType(
+        tf_sparsemax_zpc, tf_sparsemax_z, half_atol=5e-3)
 
   def _test_permutation(self, dtype, random, use_gpu):
     """check sparsemax proposition 3"""
@@ -143,12 +138,11 @@ class SparsemaxTest(test.TestCase):
       per = random.permutation(10)
 
       tf_sparsemax_op, tf_sparsemax_out = self._tf_sparsemax(
-        z[i, per].reshape(1, -1), dtype, use_gpu
-      )
+          z[i, per].reshape(1, -1), dtype, use_gpu)
       p_expected = p[i, per].reshape(1, -1)
 
-      self.assertAllCloseAccordingToType(p_expected, tf_sparsemax_out,
-                                         half_atol=5e-3)
+      self.assertAllCloseAccordingToType(
+          p_expected, tf_sparsemax_out, half_atol=5e-3)
       self.assertShapeEqual(p_expected, tf_sparsemax_op)
 
   def _test_diffrence(self, dtype, random, use_gpu):
@@ -166,18 +160,14 @@ class SparsemaxTest(test.TestCase):
             continue
 
           self.assertTrue(
-            0 <= p[val, j] - p[val, i] <= z[val, j] - z[val, i] + etol,
-            "0 <= %.10f <= %.10f" % (
-              p[val, j] - p[val, i], z[val, j] - z[val, i] + etol
-            )
-          )
+              0 <= p[val, j] - p[val, i] <= z[val, j] - z[val, i] + etol,
+              '0 <= %.10f <= %.10f' % (p[val, j] - p[val, i],
+                                       z[val, j] - z[val, i] + etol))
 
   def _test_two_dimentional(self, dtype, random, use_gpu):
     """check two dimentation sparsemax case"""
     t = np.linspace(-2, 2, test_obs, dtype=dtype)
-    z = np.vstack([
-      t, np.zeros(test_obs, dtype=dtype)
-    ]).T
+    z = np.vstack([t, np.zeros(test_obs, dtype=dtype)]).T
 
     tf_sparsemax_op, tf_sparsemax_out = self._tf_sparsemax(z, dtype, use_gpu)
 
@@ -196,10 +186,7 @@ class SparsemaxTest(test.TestCase):
 
     with self.test_session(use_gpu=use_gpu):
       err = gradient_checker.compute_gradient_error(
-        logits, z.shape,
-        sparsemax_op, z.shape,
-        x_init_value=z, delta=1e-9
-      )
+          logits, z.shape, sparsemax_op, z.shape, x_init_value=z, delta=1e-9)
 
     self.assertLess(err, 1e-4)
 
@@ -248,5 +235,6 @@ class SparsemaxTest(test.TestCase):
   def testDouble(self):
     self._test_dtype('float64')
 
-if __name__ == "__main__":
+
+if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/summary/summary_ops.py b/tensorflow/contrib/summary/summary_ops.py
index ee661dfdc11451bb72bc2741b0b54ebf5c1e6543..a6968d8b2a67809e3e63d099ad9448efd619b4d9 100644
--- a/tensorflow/contrib/summary/summary_ops.py
+++ b/tensorflow/contrib/summary/summary_ops.py
@@ -202,7 +202,7 @@ def create_file_writer(logdir,
     if flush_millis is None:
       flush_millis = constant_op.constant(2 * 60 * 1000)
     if filename_suffix is None:
-      filename_suffix = constant_op.constant("")
+      filename_suffix = constant_op.constant(".v2")
     return _make_summary_writer(
         name,
         gen_summary_ops.create_summary_file_writer,
diff --git a/tensorflow/contrib/tensor_forest/client/random_forest.py b/tensorflow/contrib/tensor_forest/client/random_forest.py
index a998ac1e111090a3702c0499a54ef1a5c1b3ac90..4abcc20ed334e706c8ae59e2127dfd6f4e152361 100644
--- a/tensorflow/contrib/tensor_forest/client/random_forest.py
+++ b/tensorflow/contrib/tensor_forest/client/random_forest.py
@@ -18,7 +18,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib import layers
-
+from tensorflow.contrib.learn.python.learn.estimators import constants
 from tensorflow.contrib.learn.python.learn.estimators import estimator
 from tensorflow.contrib.learn.python.learn.estimators import head as head_lib
 from tensorflow.contrib.learn.python.learn.estimators import model_fn as model_fn_lib
@@ -43,8 +43,8 @@ from tensorflow.python.training import training_util
 KEYS_NAME = 'keys'
 LOSS_NAME = 'rf_training_loss'
 TREE_PATHS_PREDICTION_KEY = 'tree_paths'
-VARIANCE_PREDICTION_KEY = 'regression_variance'
-
+VARIANCE_PREDICTION_KEY = 'prediction_variance'
+ALL_SERVING_KEY = 'tensorforest_all'
 EPSILON = 0.000001
 
 
@@ -134,7 +134,8 @@ def get_model_fn(params,
                  trainer_id=0,
                  report_feature_importances=False,
                  local_eval=False,
-                 head_scope=None):
+                 head_scope=None,
+                 include_all_in_serving=False):
   """Return a model function given a way to construct a graph builder."""
   if model_head is None:
     model_head = get_default_head(params, weights_name)
@@ -238,7 +239,13 @@ def get_model_fn(params,
       model_ops.predictions[TREE_PATHS_PREDICTION_KEY] = tree_paths
 
     model_ops.predictions[VARIANCE_PREDICTION_KEY] = regression_variance
-
+    if include_all_in_serving:
+      # In order to serve the variance we need to add the prediction dict
+      # to output_alternatives dict.
+      if not model_ops.output_alternatives:
+        model_ops.output_alternatives = {}
+      model_ops.output_alternatives[ALL_SERVING_KEY] = (
+          constants.ProblemType.UNSPECIFIED, model_ops.predictions)
     return model_ops
 
   return _model_fn
@@ -293,7 +300,8 @@ class TensorForestEstimator(estimator.Estimator):
                report_feature_importances=False,
                local_eval=False,
                version=None,
-               head=None):
+               head=None,
+               include_all_in_serving=False):
     """Initializes a TensorForestEstimator instance.
 
     Args:
@@ -339,6 +347,23 @@ class TensorForestEstimator(estimator.Estimator):
       version: Unused.
       head: A heads_lib.Head object that calculates losses and such. If None,
         one will be automatically created based on params.
+      include_all_in_serving: if True, allow preparation of the complete
+        prediction dict including the variance to be exported for serving with
+        the Servo lib; and it also requires calling export_savedmodel with
+        default_output_alternative_key=ALL_SERVING_KEY, i.e.
+        estimator.export_savedmodel(export_dir_base=your_export_dir,
+          serving_input_fn=your_export_input_fn,
+          default_output_alternative_key=ALL_SERVING_KEY)
+        if False, resort to default behavior, i.e. export scores and
+          probabilities but no variances. In this case
+          default_output_alternative_key should be None while calling
+          export_savedmodel().
+        Note, that due to backward compatibility we cannot always set
+        include_all_in_serving to True because in this case calling
+        export_saved_model() without
+        default_output_alternative_key=ALL_SERVING_KEY (legacy behavior) the
+        saved_model_export_utils.get_output_alternatives() would raise
+        ValueError.
 
     Returns:
       A `TensorForestEstimator` instance.
@@ -357,7 +382,9 @@ class TensorForestEstimator(estimator.Estimator):
             num_trainers=num_trainers,
             trainer_id=trainer_id,
             report_feature_importances=report_feature_importances,
-            local_eval=local_eval),
+            local_eval=local_eval,
+            include_all_in_serving=include_all_in_serving,
+        ),
         model_dir=model_dir,
         config=config,
         feature_engineering_fn=feature_engineering_fn)
diff --git a/tensorflow/contrib/tensor_forest/hybrid/core/ops/hard_routing_function_op.cc b/tensorflow/contrib/tensor_forest/hybrid/core/ops/hard_routing_function_op.cc
index 76cfb4c9ca02269f9fee61c767acc6cb4a0b4ca7..cf0db788a419f64ed891df8aa097fa8826f6de91 100644
--- a/tensorflow/contrib/tensor_forest/hybrid/core/ops/hard_routing_function_op.cc
+++ b/tensorflow/contrib/tensor_forest/hybrid/core/ops/hard_routing_function_op.cc
@@ -99,18 +99,17 @@ class HardRoutingFunction : public OpKernel {
     const Tensor& tree_biases_tensor = context->input(2);
 
     if (input_data.shape().dim_size(0) > 0) {
-      OP_REQUIRES(context, input_data.shape().dims() == 2,
-                  errors::InvalidArgument(
-                      "input_data should be two-dimensional"));
+      OP_REQUIRES(
+          context, input_data.shape().dims() == 2,
+          errors::InvalidArgument("input_data should be two-dimensional"));
     }
 
     // Check tensor bounds.
     if (!CheckTensorBounds(context, input_data)) return;
 
-    const int32 num_data = static_cast<int32>(
-        input_data.shape().dim_size(0));
-    const int32 num_features = static_cast<int32>(
-        input_data.shape().dim_size(1));
+    const int32 num_data = static_cast<int32>(input_data.shape().dim_size(0));
+    const int32 num_features =
+        static_cast<int32>(input_data.shape().dim_size(1));
 
     Tensor* output_probability = nullptr;
     TensorShape output_probability_shape;
@@ -125,9 +124,8 @@ class HardRoutingFunction : public OpKernel {
     OP_REQUIRES_OK(context,
                    context->allocate_output(0, output_probability_shape,
                                             &output_probability));
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(1, output_path_shape,
-                                            &output_path));
+    OP_REQUIRES_OK(
+        context, context->allocate_output(1, output_path_shape, &output_path));
 
     auto out_probability = output_probability->tensor<float, 2>();
     auto out_path = output_path->tensor<int32, 2>();
@@ -144,12 +142,11 @@ class HardRoutingFunction : public OpKernel {
       out_probability(i, 0) = 1.0;
       out_path(i, 0) = 0;
       for (int j = 0; j < tree_depth_ - 1; j++) {
-        float left_prob = LeftProbability(point,
-                                          tree_parameters_tensor.Slice(j, j+1),
-                                          tree_biases(j),
-                                          num_features);
+        float left_prob =
+            LeftProbability(point, tree_parameters_tensor.Slice(j, j + 1),
+                            tree_biases(j), num_features);
 
-        int32 left_child = 2*node + 1;
+        int32 left_child = 2 * node + 1;
         int32 right_child = left_child + 1;
 
         float dot_product = 0.0;
diff --git a/tensorflow/contrib/tensor_forest/hybrid/core/ops/k_feature_gradient_op.cc b/tensorflow/contrib/tensor_forest/hybrid/core/ops/k_feature_gradient_op.cc
index 28f50f1a32eb1827a242d527cd42c58487877959..f64155fa55af22d57c6619d8a39da0455dc0de65 100644
--- a/tensorflow/contrib/tensor_forest/hybrid/core/ops/k_feature_gradient_op.cc
+++ b/tensorflow/contrib/tensor_forest/hybrid/core/ops/k_feature_gradient_op.cc
@@ -85,12 +85,9 @@ REGISTER_OP("KFeatureGradient")
 
 class KFeatureGradient : public OpKernel {
  public:
-  explicit KFeatureGradient(OpKernelConstruction* context)
-      : OpKernel(context) {
-    OP_REQUIRES_OK(context, context->GetAttr("layer_num",
-                                             &layer_num_));
-    OP_REQUIRES_OK(context, context->GetAttr("random_seed",
-                                             &random_seed_));
+  explicit KFeatureGradient(OpKernelConstruction* context) : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("layer_num", &layer_num_));
+    OP_REQUIRES_OK(context, context->GetAttr("random_seed", &random_seed_));
   }
 
   void Compute(OpKernelContext* context) override {
@@ -101,14 +98,14 @@ class KFeatureGradient : public OpKernel {
     const Tensor& routing_tensor = context->input(3);
 
     // Extract dimensions from input tensors.
-    const int32 num_data = static_cast<int32>(
-        input_data_tensor.shape().dim_size(0));
-    const int32 num_features = static_cast<int32>(
-        input_data_tensor.shape().dim_size(1));
-    const int32 num_nodes = static_cast<int32>(
-        tree_parameters_tensor.shape().dim_size(0));
-    const int32 num_features_per_node = static_cast<int32>(
-        tree_parameters_tensor.shape().dim_size(1));
+    const int32 num_data =
+        static_cast<int32>(input_data_tensor.shape().dim_size(0));
+    const int32 num_features =
+        static_cast<int32>(input_data_tensor.shape().dim_size(1));
+    const int32 num_nodes =
+        static_cast<int32>(tree_parameters_tensor.shape().dim_size(0));
+    const int32 num_features_per_node =
+        static_cast<int32>(tree_parameters_tensor.shape().dim_size(1));
 
     // Construct output tensors.
     Tensor* out_routes = nullptr;
@@ -127,12 +124,12 @@ class KFeatureGradient : public OpKernel {
     out_weights_shape.AddDim(num_nodes);
     out_weights_shape.AddDim(num_features_per_node);
 
-    OP_REQUIRES_OK(context, context->allocate_output(
-        0, out_routes_shape, &out_routes));
-    OP_REQUIRES_OK(context, context->allocate_output(
-        1, out_data_shape, &out_data));
-    OP_REQUIRES_OK(context, context->allocate_output(
-        2, out_weights_shape, &out_weights));
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(0, out_routes_shape, &out_routes));
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(1, out_data_shape, &out_data));
+    OP_REQUIRES_OK(
+        context, context->allocate_output(2, out_weights_shape, &out_weights));
 
     tensorforest::Initialize(*out_data, 0.0f);
 
@@ -148,18 +145,13 @@ class KFeatureGradient : public OpKernel {
 
     std::vector<int32> feature_set;
     for (int i = 0; i < num_data; i++) {
-      const Tensor point = input_data_tensor.Slice(i, i+1);
+      const Tensor point = input_data_tensor.Slice(i, i + 1);
       feature_set.clear();
 
       // Traverse the tree from the bottom up.
       for (int j = num_nodes - 1; j >= 0; j--) {
-        tensorforest::GetFeatureSet(
-            layer_num_,
-            j,
-            random_seed_,
-            num_features,
-            num_features_per_node,
-            &feature_set);
+        tensorforest::GetFeatureSet(layer_num_, j, random_seed_, num_features,
+                                    num_features_per_node, &feature_set);
 
         // Compute routing gradient.
         // j is a leaf node.
@@ -170,12 +162,8 @@ class KFeatureGradient : public OpKernel {
           int32 right_child = left_child + 1;
 
           float left_prob = LeftProbabilityK(
-              point,
-              feature_set,
-              tree_parameters_tensor.Slice(j, j+1),
-              tree_biases(j),
-              num_features,
-              num_features_per_node);
+              point, feature_set, tree_parameters_tensor.Slice(j, j + 1),
+              tree_biases(j), num_features, num_features_per_node);
 
           float right_prob = 1.0f - left_prob;
 
diff --git a/tensorflow/contrib/tensor_forest/hybrid/core/ops/k_feature_routing_function_op.cc b/tensorflow/contrib/tensor_forest/hybrid/core/ops/k_feature_routing_function_op.cc
index 9bc42eb61fae013de3e4ea73aaf371cdaa4ccf9a..e7cafb144da84865ad2b4ea0c33866ddb89119a5 100644
--- a/tensorflow/contrib/tensor_forest/hybrid/core/ops/k_feature_routing_function_op.cc
+++ b/tensorflow/contrib/tensor_forest/hybrid/core/ops/k_feature_routing_function_op.cc
@@ -43,7 +43,6 @@ using shape_inference::ShapeHandle;
 using tensorforest::CheckTensorBounds;
 using tensorforest::LeftProbabilityK;
 
-
 // The term 'routing function' is synonymous with 'the probability
 // that an instance is routed to each leaf node.'  It is defined in
 // 'Deep Neural Decision Forests' by Kontschieder et al.
@@ -96,10 +95,8 @@ class KFeatureRoutingFunction : public OpKernel {
     OP_REQUIRES_OK(context, context->GetAttr("max_nodes", &max_nodes_));
     OP_REQUIRES_OK(context, context->GetAttr("num_features_per_node",
                                              &num_features_per_node_));
-    OP_REQUIRES_OK(context, context->GetAttr("layer_num",
-                                             &layer_num_));
-    OP_REQUIRES_OK(context, context->GetAttr("random_seed",
-                                             &random_seed_));
+    OP_REQUIRES_OK(context, context->GetAttr("layer_num", &layer_num_));
+    OP_REQUIRES_OK(context, context->GetAttr("random_seed", &random_seed_));
   }
 
   void Compute(OpKernelContext* context) override {
@@ -108,27 +105,25 @@ class KFeatureRoutingFunction : public OpKernel {
     const Tensor& tree_biases_tensor = context->input(2);
 
     if (input_data.shape().dim_size(0) > 0) {
-      OP_REQUIRES(context, input_data.shape().dims() == 2,
-                  errors::InvalidArgument(
-                      "input_data should be two-dimensional"));
+      OP_REQUIRES(
+          context, input_data.shape().dims() == 2,
+          errors::InvalidArgument("input_data should be two-dimensional"));
     }
 
     // Check tensor bounds.
     if (!CheckTensorBounds(context, input_data)) return;
 
-    const int32 num_data = static_cast<int32>(
-        input_data.shape().dim_size(0));
-    const int32 num_features = static_cast<int32>(
-        input_data.shape().dim_size(1));
+    const int32 num_data = static_cast<int32>(input_data.shape().dim_size(0));
+    const int32 num_features =
+        static_cast<int32>(input_data.shape().dim_size(1));
 
     Tensor* output_probabilities = nullptr;
     TensorShape output_shape;
     output_shape.AddDim(num_data);
     output_shape.AddDim(max_nodes_);
 
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(0, output_shape,
-                                            &output_probabilities));
+    OP_REQUIRES_OK(context, context->allocate_output(0, output_shape,
+                                                     &output_probabilities));
 
     auto out_probs = output_probabilities->tensor<float, 2>();
     const auto tree_biases = tree_biases_tensor.tensor<float, 1>();
@@ -136,30 +131,22 @@ class KFeatureRoutingFunction : public OpKernel {
     // Iteratively compute the probability of reaching each leaf.
     std::vector<int32> feature_set;
     for (int i = 0; i < num_data; i++) {
-      const Tensor point = input_data.Slice(i, i+1);
+      const Tensor point = input_data.Slice(i, i + 1);
 
       out_probs(i, 0) = 1.0f;
 
       for (int j = 0; j < max_nodes_ / 2; j++) {
         feature_set.clear();
-        tensorforest::GetFeatureSet(
-            layer_num_,
-            i,
-            random_seed_,
-            num_features,
-            num_features_per_node_,
-            &feature_set);
-
-        int32 left_child = 2*j + 1;
+        tensorforest::GetFeatureSet(layer_num_, i, random_seed_, num_features,
+                                    num_features_per_node_, &feature_set);
+
+        int32 left_child = 2 * j + 1;
         int32 right_child = left_child + 1;
 
         float prob = out_probs(i, j);
-        float left_prob = LeftProbabilityK(point,
-                                           feature_set,
-                                           tree_parameters_tensor.Slice(j, j+1),
-                                           tree_biases(j),
-                                           num_features,
-                                           num_features_per_node_);
+        float left_prob = LeftProbabilityK(
+            point, feature_set, tree_parameters_tensor.Slice(j, j + 1),
+            tree_biases(j), num_features, num_features_per_node_);
 
         out_probs(i, left_child) = prob * left_prob;
         out_probs(i, right_child) = prob * (1.0f - left_prob);
diff --git a/tensorflow/contrib/tensor_forest/hybrid/core/ops/routing_function_op.cc b/tensorflow/contrib/tensor_forest/hybrid/core/ops/routing_function_op.cc
index 4027e732b3f52585c2149c3cdc71535664f04ed4..0c2eaabe8f3e1e1377a8d5c5308aaec00030a20f 100644
--- a/tensorflow/contrib/tensor_forest/hybrid/core/ops/routing_function_op.cc
+++ b/tensorflow/contrib/tensor_forest/hybrid/core/ops/routing_function_op.cc
@@ -90,46 +90,43 @@ class RoutingFunction : public OpKernel {
     const Tensor& tree_biases_tensor = context->input(2);
 
     if (input_data.shape().dim_size(0) > 0) {
-      OP_REQUIRES(context, input_data.shape().dims() == 2,
-                  errors::InvalidArgument(
-                      "input_data should be two-dimensional"));
+      OP_REQUIRES(
+          context, input_data.shape().dims() == 2,
+          errors::InvalidArgument("input_data should be two-dimensional"));
     }
 
     // Check tensor bounds.
     if (!CheckTensorBounds(context, input_data)) return;
 
-    const int32 num_data = static_cast<int32>(
-        input_data.shape().dim_size(0));
-    const int32 num_features = static_cast<int32>(
-        input_data.shape().dim_size(1));
+    const int32 num_data = static_cast<int32>(input_data.shape().dim_size(0));
+    const int32 num_features =
+        static_cast<int32>(input_data.shape().dim_size(1));
 
     Tensor* output_probabilities = nullptr;
     TensorShape output_shape;
     output_shape.AddDim(num_data);
     output_shape.AddDim(max_nodes_);
 
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(0, output_shape,
-                                            &output_probabilities));
+    OP_REQUIRES_OK(context, context->allocate_output(0, output_shape,
+                                                     &output_probabilities));
 
     auto out_probs = output_probabilities->tensor<float, 2>();
     const auto tree_biases = tree_biases_tensor.tensor<float, 1>();
 
     // Iteratively compute the probability of reaching each leaf.
     for (int i = 0; i < num_data; i++) {
-      const Tensor point = input_data.Slice(i, i+1);
+      const Tensor point = input_data.Slice(i, i + 1);
 
       out_probs(i, 0) = 1.0;
 
       for (int j = 0; j < max_nodes_ / 2; j++) {
-        int32 left_child = 2*j + 1;
+        int32 left_child = 2 * j + 1;
         int32 right_child = left_child + 1;
 
         float prob = out_probs(i, j);
-        float left_prob = LeftProbability(point,
-                                          tree_parameters_tensor.Slice(j, j+1),
-                                          tree_biases(j),
-                                          num_features);
+        float left_prob =
+            LeftProbability(point, tree_parameters_tensor.Slice(j, j + 1),
+                            tree_biases(j), num_features);
 
         out_probs(i, left_child) = prob * left_prob;
         out_probs(i, right_child) = prob * (1.0 - left_prob);
diff --git a/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_function_op.cc b/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_function_op.cc
index 66aa293dc1cb93b82f06d838ad7b0f9c09761585..c9df09bfda44e665ed013da383e1e9a2c665c454 100644
--- a/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_function_op.cc
+++ b/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_function_op.cc
@@ -96,10 +96,9 @@ class StochasticHardRoutingFunction : public OpKernel {
   explicit StochasticHardRoutingFunction(OpKernelConstruction* context)
       : OpKernel(context) {
     OP_REQUIRES_OK(context, context->GetAttr("tree_depth", &tree_depth_));
-    OP_REQUIRES_OK(context, context->GetAttr("random_seed",
-                                             &random_seed_));
+    OP_REQUIRES_OK(context, context->GetAttr("random_seed", &random_seed_));
     single_rand_ = std::unique_ptr<random::PhiloxRandom>(
-          new random::PhiloxRandom(random_seed_));
+        new random::PhiloxRandom(random_seed_));
     rng_ = std::unique_ptr<random::SimplePhilox>(
         new random::SimplePhilox(single_rand_.get()));
   }
@@ -111,20 +110,19 @@ class StochasticHardRoutingFunction : public OpKernel {
     const Tensor& tree_biases_tensor = context->input(2);
 
     if (input_data.shape().dim_size(0) > 0) {
-      OP_REQUIRES(context, input_data.shape().dims() == 2,
-                  errors::InvalidArgument(
-                      "input_data should be two-dimensional"));
+      OP_REQUIRES(
+          context, input_data.shape().dims() == 2,
+          errors::InvalidArgument("input_data should be two-dimensional"));
     }
 
     // Check tensor bounds.
     if (!CheckTensorBounds(context, input_data)) return;
 
-    const int32 num_data = static_cast<int32>(
-        input_data.shape().dim_size(0));
-    const int32 num_features = static_cast<int32>(
-        input_data.shape().dim_size(1));
-    const int32 num_nodes = static_cast<int32>(
-        tree_parameters_tensor.shape().dim_size(0));
+    const int32 num_data = static_cast<int32>(input_data.shape().dim_size(0));
+    const int32 num_features =
+        static_cast<int32>(input_data.shape().dim_size(1));
+    const int32 num_nodes =
+        static_cast<int32>(tree_parameters_tensor.shape().dim_size(0));
 
     Tensor* output_probability = nullptr;
     TensorShape output_probability_shape;
@@ -139,9 +137,8 @@ class StochasticHardRoutingFunction : public OpKernel {
     OP_REQUIRES_OK(context,
                    context->allocate_output(0, output_probability_shape,
                                             &output_probability));
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(1, output_path_shape,
-                                            &output_path));
+    OP_REQUIRES_OK(
+        context, context->allocate_output(1, output_path_shape, &output_path));
 
     auto out_probability = output_probability->tensor<float, 2>();
     auto out_path = output_path->tensor<int32, 2>();
@@ -150,19 +147,18 @@ class StochasticHardRoutingFunction : public OpKernel {
     // Stochastically traverse the tree to a leaf.
 
     for (int i = 0; i < num_data; i++) {
-      const Tensor point = input_data.Slice(i, i+1);
+      const Tensor point = input_data.Slice(i, i + 1);
 
       int32 node = 0;
       out_probability(i, 0) = 1.0;
       out_path(i, 0) = 0;
       for (int j = 0; j < tree_depth_ - 1; j++) {
-        int32 left_child = 2*node + 1;
+        int32 left_child = 2 * node + 1;
         int32 right_child = left_child + 1;
 
-        float left_prob = LeftProbability(point,
-                                          tree_parameters_tensor.Slice(j, j+1),
-                                          tree_biases(j),
-                                          num_features);
+        float left_prob =
+            LeftProbability(point, tree_parameters_tensor.Slice(j, j + 1),
+                            tree_biases(j), num_features);
 
         if (left_prob < rng_->RandFloat()) {
           CHECK_LT(i, num_data);
diff --git a/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_gradient_op.cc b/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_gradient_op.cc
index 0b5afe464f4b9608af0feca584aaa799f5980f46..b0d8b832b5437db7a4b3026e80ae99d0391d7f7a 100644
--- a/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_gradient_op.cc
+++ b/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_gradient_op.cc
@@ -149,14 +149,14 @@ class StochasticHardRoutingGradient : public OpKernel {
     TensorShape output_bias_shape;
     output_bias_shape.AddDim(num_data);
 
-    OP_REQUIRES_OK(context, context->allocate_output(
-        0, output_routing_shape, &output_routing));
-    OP_REQUIRES_OK(context, context->allocate_output(
-        1, output_data_shape, &output_data));
-    OP_REQUIRES_OK(context, context->allocate_output(
-        2, output_parameters_shape, &output_parameters));
-    OP_REQUIRES_OK(context, context->allocate_output(
-        3, output_bias_shape, &output_bias));
+    OP_REQUIRES_OK(context, context->allocate_output(0, output_routing_shape,
+                                                     &output_routing));
+    OP_REQUIRES_OK(
+        context, context->allocate_output(1, output_data_shape, &output_data));
+    OP_REQUIRES_OK(context, context->allocate_output(2, output_parameters_shape,
+                                                     &output_parameters));
+    OP_REQUIRES_OK(
+        context, context->allocate_output(3, output_bias_shape, &output_bias));
 
     tensorforest::Initialize(*output_routing, 0.0);
     tensorforest::Initialize(*output_data, 0.0);
@@ -178,7 +178,7 @@ class StochasticHardRoutingGradient : public OpKernel {
       const Tensor point = input_data.Slice(i, i + 1);
 
       // Traverses the tree from the bottom up.
-      for (int j = tree_depth_-1; j > -1; j--) {
+      for (int j = tree_depth_ - 1; j > -1; j--) {
         int32 node = path(i, j);
 
         CHECK_LT(node, num_nodes);
diff --git a/tensorflow/contrib/tensor_forest/hybrid/core/ops/unpack_path_op.cc b/tensorflow/contrib/tensor_forest/hybrid/core/ops/unpack_path_op.cc
index cacad03e274c3279eb3706e71e1bcdf8433ca1ef..25825a78a1498490009fe4ff6bbfc67493727037 100644
--- a/tensorflow/contrib/tensor_forest/hybrid/core/ops/unpack_path_op.cc
+++ b/tensorflow/contrib/tensor_forest/hybrid/core/ops/unpack_path_op.cc
@@ -64,8 +64,7 @@ REGISTER_OP("UnpackPath")
 
 class UnpackPath : public OpKernel {
  public:
-  explicit UnpackPath(OpKernelConstruction* context)
-      : OpKernel(context) {}
+  explicit UnpackPath(OpKernelConstruction* context) : OpKernel(context) {}
 
   void Compute(OpKernelContext* context) override {
     VLOG(1) << "unpack start";
@@ -73,8 +72,8 @@ class UnpackPath : public OpKernel {
     const Tensor& path_values_tensor = context->input(1);
 
     const int32 num_data = static_cast<int32>(path_tensor.shape().dim_size(0));
-    const int32 tree_depth = static_cast<int32>(
-        path_tensor.shape().dim_size(1));
+    const int32 tree_depth =
+        static_cast<int32>(path_tensor.shape().dim_size(1));
 
     const int32 num_nodes = MathUtil::IPow(2, tree_depth) - 1;
 
@@ -107,7 +106,6 @@ class UnpackPath : public OpKernel {
   }
 };
 
-REGISTER_KERNEL_BUILDER(Name("UnpackPath").Device(DEVICE_CPU),
-                        UnpackPath);
+REGISTER_KERNEL_BUILDER(Name("UnpackPath").Device(DEVICE_CPU), UnpackPath);
 
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/tensor_forest/hybrid/core/ops/utils.cc b/tensorflow/contrib/tensor_forest/hybrid/core/ops/utils.cc
index c091a73c4e48a47bdccea3ec99371faab9c586c2..34388fe1aab72895a805141ec66a71ecf0f42ba4 100644
--- a/tensorflow/contrib/tensor_forest/hybrid/core/ops/utils.cc
+++ b/tensorflow/contrib/tensor_forest/hybrid/core/ops/utils.cc
@@ -25,9 +25,7 @@ namespace tensorforest {
 
 using tensorflow::Tensor;
 
-float LeftProbability(const Tensor& point,
-                      const Tensor& weight,
-                      float bias,
+float LeftProbability(const Tensor& point, const Tensor& weight, float bias,
                       int num_features) {
   const auto p = point.unaligned_flat<float>();
   const auto w = weight.unaligned_flat<float>();
@@ -41,11 +39,8 @@ float LeftProbability(const Tensor& point,
   return 1.0 / (1.0 + exp(-dot_product + bias));
 }
 
-float LeftProbabilityK(const Tensor& point,
-                       std::vector<int32> feature_set,
-                       const Tensor& weight,
-                       float bias,
-                       int num_features,
+float LeftProbabilityK(const Tensor& point, std::vector<int32> feature_set,
+                       const Tensor& weight, float bias, int num_features,
                        int k) {
   const auto p = point.unaligned_flat<float>();
   const auto w = weight.unaligned_flat<float>();
diff --git a/tensorflow/contrib/tensor_forest/hybrid/core/ops/utils.h b/tensorflow/contrib/tensor_forest/hybrid/core/ops/utils.h
index c5902184f95ea8f97be4a10d1101a38333359d44..69a0143a4e319157a4526ca80fbb3f6472902b31 100644
--- a/tensorflow/contrib/tensor_forest/hybrid/core/ops/utils.h
+++ b/tensorflow/contrib/tensor_forest/hybrid/core/ops/utils.h
@@ -24,16 +24,11 @@ namespace tensorflow {
 namespace tensorforest {
 
 // Returns the probability that the point falls to the left.
-float LeftProbability(const Tensor& point,
-                      const Tensor& weight,
-                      float bias,
+float LeftProbability(const Tensor& point, const Tensor& weight, float bias,
                       int num_features);
 
-float LeftProbabilityK(const Tensor& point,
-                       std::vector<int32> feature_set,
-                       const Tensor& weight,
-                       float bias,
-                       int num_features,
+float LeftProbabilityK(const Tensor& point, std::vector<int32> feature_set,
+                       const Tensor& weight, float bias, int num_features,
                        int k);
 
 // Returns a random set of num_features_to_pick features in the
@@ -49,5 +44,3 @@ void GetFeatureSet(int32 tree_num, int32 node_num, int32 random_seed,
 }  // namespace tensorflow
 
 #endif  // LEARNING_LIB_TENSOR_FOREST_HYBRID_CORE_OPS_UTILS_H_
-
-
diff --git a/tensorflow/contrib/tensor_forest/kernels/reinterpret_string_to_float_op.cc b/tensorflow/contrib/tensor_forest/kernels/reinterpret_string_to_float_op.cc
index 47b49a379c4b7a17d35b52c1403f67c2f07aeeaf..b21a9179777c21f65435e136aa6082e27fb3b78c 100644
--- a/tensorflow/contrib/tensor_forest/kernels/reinterpret_string_to_float_op.cc
+++ b/tensorflow/contrib/tensor_forest/kernels/reinterpret_string_to_float_op.cc
@@ -30,15 +30,13 @@ namespace tensorflow {
 
 using tensorforest::CheckTensorBounds;
 
-
 float Convert(const string& in) {
   const std::size_t intval = std::hash<string>()(in);
   return static_cast<float>(intval);
 }
 
-
-void Evaluate(const Tensor& input_data, Tensor output_data,
-              int32 start, int32 end) {
+void Evaluate(const Tensor& input_data, Tensor output_data, int32 start,
+              int32 end) {
   auto out_data = output_data.unaligned_flat<float>();
   const auto in_data = input_data.unaligned_flat<string>();
 
@@ -59,9 +57,8 @@ class ReinterpretStringToFloat : public OpKernel {
     if (!CheckTensorBounds(context, input_data)) return;
 
     Tensor* output_data = nullptr;
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(0, input_data.shape(),
-                                            &output_data));
+    OP_REQUIRES_OK(
+        context, context->allocate_output(0, input_data.shape(), &output_data));
 
     // Evaluate input data in parallel.
     const int32 num_data = static_cast<int32>(input_data.NumElements());
@@ -73,8 +70,8 @@ class ReinterpretStringToFloat : public OpKernel {
       auto work = [&input_data, output_data, num_data](int64 start, int64 end) {
         CHECK(start <= end);
         CHECK(end <= num_data);
-        Evaluate(input_data, *output_data,
-                 static_cast<int32>(start), static_cast<int32>(end));
+        Evaluate(input_data, *output_data, static_cast<int32>(start),
+                 static_cast<int32>(end));
       };
       Shard(num_threads, worker_threads->workers, num_data, 100, work);
     }
diff --git a/tensorflow/contrib/tensor_forest/kernels/scatter_add_ndim_op.cc b/tensorflow/contrib/tensor_forest/kernels/scatter_add_ndim_op.cc
index dd2a98b08cdb486c98c161390a3a1f81d31e1f4b..60740c2be3703141805c7eae0ac384edf934ab3d 100644
--- a/tensorflow/contrib/tensor_forest/kernels/scatter_add_ndim_op.cc
+++ b/tensorflow/contrib/tensor_forest/kernels/scatter_add_ndim_op.cc
@@ -22,7 +22,6 @@
 #include "tensorflow/core/framework/shape_inference.h"
 #include "tensorflow/core/platform/logging.h"
 
-
 namespace tensorflow {
 
 using tensorforest::CheckTensorBounds;
@@ -38,20 +37,19 @@ class ScatterAddNdim : public OpKernel {
 
     if (indices_tensor.shape().dim_size(0) > 0) {
       OP_REQUIRES(context, indices_tensor.shape().dims() == 2,
-                  errors::InvalidArgument(
-                      "indices should be two-dimensional"));
+                  errors::InvalidArgument("indices should be two-dimensional"));
       const int32 delta_dims = deltas_tensor.shape().dims();
       OP_REQUIRES(
           context,
           indices_tensor.shape().dim_size(1) + delta_dims ==
-          input_tensor.shape().dims() + 1,
+              input_tensor.shape().dims() + 1,
           errors::InvalidArgument(
               "Number of indices dimensions should be the same as input "
               "rank."));
       OP_REQUIRES(
           context,
           indices_tensor.shape().dim_size(0) ==
-          deltas_tensor.shape().dim_size(0),
+              deltas_tensor.shape().dim_size(0),
           errors::InvalidArgument(
               "Number of updates should be same as number of indices."));
     } else {
@@ -68,8 +66,8 @@ class ScatterAddNdim : public OpKernel {
     const auto indices = indices_tensor.tensor<int32, 2>();
     const auto deltas = deltas_tensor.unaligned_flat<float>();
 
-    const int32 num_dims = static_cast<int32>(
-        indices_tensor.shape().dim_size(1));
+    const int32 num_dims =
+        static_cast<int32>(indices_tensor.shape().dim_size(1));
 
     // Figure out if indices don't specify a complete position in the
     // input tensor.
@@ -80,10 +78,9 @@ class ScatterAddNdim : public OpKernel {
 
     // Calculate index multipliers.
     std::vector<int32> multipliers;
-    OP_REQUIRES(
-        context, input.size() < std::numeric_limits<int32>::max(),
-        errors::InvalidArgument(
-            "Input must contain less than 2^31 total elements"));
+    OP_REQUIRES(context, input.size() < std::numeric_limits<int32>::max(),
+                errors::InvalidArgument(
+                    "Input must contain less than 2^31 total elements"));
     int32 last_size = static_cast<int32>(input.size());
 
     for (int32 j = 0; j < num_dims; j++) {
diff --git a/tensorflow/contrib/tensor_forest/kernels/tree_utils.cc b/tensorflow/contrib/tensor_forest/kernels/tree_utils.cc
index 94e12cea5a072f0746e642196d55f3a3b13a16c3..44997ec5d6d5fdb9aab52ab7a50f46a731bfda66 100644
--- a/tensorflow/contrib/tensor_forest/kernels/tree_utils.cc
+++ b/tensorflow/contrib/tensor_forest/kernels/tree_utils.cc
@@ -65,8 +65,8 @@ void GetTwoBest(int max, const std::function<float(int)>& score_fn,
 
 float ClassificationSplitScore(
     const Eigen::Tensor<float, 1, Eigen::RowMajor>& splits,
-    const Eigen::Tensor<float, 1, Eigen::RowMajor>& rights,
-    int32 num_classes, int i) {
+    const Eigen::Tensor<float, 1, Eigen::RowMajor>& rights, int32 num_classes,
+    int i) {
   Eigen::array<int, 1> offsets;
   // Class counts are stored with the total in [0], so the length of each
   // count vector is num_classes + 1.
@@ -74,7 +74,7 @@ float ClassificationSplitScore(
   Eigen::array<int, 1> extents;
   extents[0] = num_classes;
   return WeightedGiniImpurity(splits.slice(offsets, extents)) +
-      WeightedGiniImpurity(rights.slice(offsets, extents));
+         WeightedGiniImpurity(rights.slice(offsets, extents));
 }
 
 void GetTwoBestClassification(const Tensor& total_counts,
@@ -90,29 +90,28 @@ void GetTwoBestClassification(const Tensor& total_counts,
   // in seg faults, so we have to go with flat views of these tensors.  However,
   // it is still pretty efficient because we put off evaluation until the
   // score is actually returned.
-  const auto tc = total_counts.Slice(
-      accumulator, accumulator + 1).unaligned_flat<float>();
+  const auto tc =
+      total_counts.Slice(accumulator, accumulator + 1).unaligned_flat<float>();
 
   // TODO(gilberth): See if we can delay evaluation here by templating the
   // arguments to ClassificationSplitScore.
-  const Eigen::Tensor<float, 1, Eigen::RowMajor> splits = split_counts.Slice(
-      accumulator, accumulator + 1).unaligned_flat<float>();
+  const Eigen::Tensor<float, 1, Eigen::RowMajor> splits =
+      split_counts.Slice(accumulator, accumulator + 1).unaligned_flat<float>();
   Eigen::array<int, 1> bcast;
   bcast[0] = num_splits;
   const Eigen::Tensor<float, 1, Eigen::RowMajor> rights =
       tc.broadcast(bcast) - splits;
 
-  std::function<float(int)> score_fn = std::bind(
-      ClassificationSplitScore, splits, rights, num_classes,
-      std::placeholders::_1);
+  std::function<float(int)> score_fn =
+      std::bind(ClassificationSplitScore, splits, rights, num_classes,
+                std::placeholders::_1);
 
   GetTwoBest(num_splits, score_fn, best_score, best_index, second_best_score,
              second_best_index);
 }
 
-int32 BestFeatureClassification(
-    const Tensor& total_counts, const Tensor& split_counts,
-    int32 accumulator) {
+int32 BestFeatureClassification(const Tensor& total_counts,
+                                const Tensor& split_counts, int32 accumulator) {
   float best_score;
   float second_best_score;
   int best_feature_index;
@@ -130,8 +129,7 @@ float RegressionSplitScore(
     const Eigen::Tensor<float, 1, Eigen::RowMajor>& splits_square,
     const Eigen::Tensor<float, 1, Eigen::RowMajor>& right_sums,
     const Eigen::Tensor<float, 1, Eigen::RowMajor>& right_squares,
-    int32 accumulator,
-    int32 num_regression_dims, int i) {
+    int32 accumulator, int32 num_regression_dims, int i) {
   Eigen::array<int, 1> offsets = {i * num_regression_dims + 1};
   Eigen::array<int, 1> extents = {num_regression_dims - 1};
   float left_count = splits_count_accessor(accumulator, i, 0);
@@ -141,15 +139,15 @@ float RegressionSplitScore(
 
   // Guard against divide-by-zero.
   if (left_count > 0) {
-    score += WeightedVariance(
-        splits_sum.slice(offsets, extents),
-        splits_square.slice(offsets, extents), left_count);
+    score +=
+        WeightedVariance(splits_sum.slice(offsets, extents),
+                         splits_square.slice(offsets, extents), left_count);
   }
 
   if (right_count > 0) {
-    score += WeightedVariance(right_sums.slice(offsets, extents),
-                              right_squares.slice(offsets, extents),
-                              right_count);
+    score +=
+        WeightedVariance(right_sums.slice(offsets, extents),
+                         right_squares.slice(offsets, extents), right_count);
   }
   return score;
 }
@@ -159,20 +157,20 @@ void GetTwoBestRegression(const Tensor& total_sums, const Tensor& total_squares,
                           int32 accumulator, float* best_score, int* best_index,
                           float* second_best_score, int* second_best_index) {
   const int32 num_splits = static_cast<int32>(split_sums.shape().dim_size(1));
-  const int32 num_regression_dims = static_cast<int32>(
-      split_sums.shape().dim_size(2));
+  const int32 num_regression_dims =
+      static_cast<int32>(split_sums.shape().dim_size(2));
   // Ideally, Eigen::Tensor::chip would be best to use here but it results
   // in seg faults, so we have to go with flat views of these tensors.  However,
   // it is still pretty efficient because we put off evaluation until the
   // score is actually returned.
-  const auto tc_sum = total_sums.Slice(
-      accumulator, accumulator + 1).unaligned_flat<float>();
-  const auto tc_square = total_squares.Slice(
-      accumulator, accumulator + 1).unaligned_flat<float>();
-  const auto splits_sum = split_sums.Slice(
-      accumulator, accumulator + 1).unaligned_flat<float>();
-  const auto splits_square = split_squares.Slice(
-      accumulator, accumulator + 1).unaligned_flat<float>();
+  const auto tc_sum =
+      total_sums.Slice(accumulator, accumulator + 1).unaligned_flat<float>();
+  const auto tc_square =
+      total_squares.Slice(accumulator, accumulator + 1).unaligned_flat<float>();
+  const auto splits_sum =
+      split_sums.Slice(accumulator, accumulator + 1).unaligned_flat<float>();
+  const auto splits_square =
+      split_squares.Slice(accumulator, accumulator + 1).unaligned_flat<float>();
   // Eigen is infuriating to work with, usually resulting in all kinds of
   // unhelpful compiler errors when trying something that seems sane.  This
   // helps us do a simple thing like access the first element (the counts)
@@ -193,10 +191,10 @@ void GetTwoBestRegression(const Tensor& total_sums, const Tensor& total_squares,
              best_score, best_index, second_best_score, second_best_index);
 }
 
-int32 BestFeatureRegression(
-    const Tensor& total_sums, const Tensor& total_squares,
-    const Tensor& split_sums, const Tensor& split_squares,
-    int32 accumulator) {
+int32 BestFeatureRegression(const Tensor& total_sums,
+                            const Tensor& total_squares,
+                            const Tensor& split_sums,
+                            const Tensor& split_squares, int32 accumulator) {
   float best_score;
   float second_best_score;
   int best_feature_index;
@@ -207,10 +205,11 @@ int32 BestFeatureRegression(
   return best_feature_index;
 }
 
-bool BestSplitDominatesRegression(
-    const Tensor& total_sums, const Tensor& total_squares,
-    const Tensor& split_sums, const Tensor& split_squares,
-    int32 accumulator) {
+bool BestSplitDominatesRegression(const Tensor& total_sums,
+                                  const Tensor& total_squares,
+                                  const Tensor& split_sums,
+                                  const Tensor& split_squares,
+                                  int32 accumulator) {
   // TODO(thomaswc): Implement this, probably as part of v3.
   return false;
 }
@@ -599,7 +598,6 @@ bool Decide(float value, float bias, DataColumnTypes type) {
   }
 }
 
-
 void GetParentWeightedMean(float leaf_sum, const float* leaf_data,
                            float parent_sum, const float* parent_data,
                            float valid_leaf_threshold, int num_outputs,
diff --git a/tensorflow/contrib/tensor_forest/kernels/tree_utils.h b/tensorflow/contrib/tensor_forest/kernels/tree_utils.h
index dad9df4898844eaa17bdfe5b4b298a95377fd12e..edbac6700677633cbd4d41f7040b4859ca599c4a 100644
--- a/tensorflow/contrib/tensor_forest/kernels/tree_utils.h
+++ b/tensorflow/contrib/tensor_forest/kernels/tree_utils.h
@@ -45,13 +45,10 @@ const int32 LEAF_NODE = -1;
 const int32 FREE_NODE = -2;
 
 // Used to indicate column types, e.g. categorical vs. float
-enum DataColumnTypes {
-  kDataFloat = 0,
-  kDataCategorical = 1
-};
+enum DataColumnTypes { kDataFloat = 0, kDataCategorical = 1 };
 
 // Calculates the sum of a tensor.
-template<typename T>
+template <typename T>
 T Sum(Tensor counts) {
   Eigen::Tensor<T, 0, Eigen::RowMajor> count_sum =
       counts.unaligned_flat<T>().sum();
@@ -97,7 +94,7 @@ float WeightedGiniImpurity(const T& counts) {
   return RawWeightedGiniImpurity(smoothed);
 }
 
-template<typename T1, typename T2>
+template <typename T1, typename T2>
 float WeightedVariance(const T1& sums, const T2& squares, float count) {
   const auto e_x = sums / count;
   const auto e_x2 = squares / count;
@@ -120,10 +117,11 @@ int32 BestFeatureRegression(const Tensor& total_sums,
 
 // Returns true if the best split's variance is sufficiently smaller than
 // that of the next best split.
-bool BestSplitDominatesRegression(
-    const Tensor& total_sums, const Tensor& total_squares,
-    const Tensor& split_sums, const Tensor& split_squares,
-    int32 accumulator);
+bool BestSplitDominatesRegression(const Tensor& total_sums,
+                                  const Tensor& total_squares,
+                                  const Tensor& split_sums,
+                                  const Tensor& split_squares,
+                                  int32 accumulator);
 
 // Performs booststrap_samples bootstrap samples of the best split's class
 // counts and the second best splits's class counts, and returns true if at
@@ -178,10 +176,8 @@ bool DecideNode(const GetFeatureFnType& get_dense,
 // isn't present in sparse_input_indices.  sparse_input_indices is assumed
 // to be sorted.
 template <typename T1, typename T2>
-float FindSparseValue(
-    const T1& sparse_input_indices,
-    const T2& sparse_input_values,
-    int32 i, int32 j) {
+float FindSparseValue(const T1& sparse_input_indices,
+                      const T2& sparse_input_values, int32 i, int32 j) {
   int32 low = 0;
   int32 high = sparse_input_values.dimension(0);
   while (low < high) {
@@ -273,7 +269,6 @@ int32 GetNumSparseFeatures(const T1& indices, int32 input_index,
 // categorical data, it is value != bias.
 bool Decide(float value, float bias, DataColumnTypes type = kDataFloat);
 
-
 // Returns true if all the splits are initialized. Since they get initialized
 // in order, we can simply infer this from the last split.
 // This should only be called for a single allocator's candidate features
diff --git a/tensorflow/contrib/tensor_forest/kernels/tree_utils_test.cc b/tensorflow/contrib/tensor_forest/kernels/tree_utils_test.cc
index 7485a695dfba93fd3f57c19096b205b10e2fa8b5..08553545502c21eb8f2d68bfd342f8ba7c081adb 100644
--- a/tensorflow/contrib/tensor_forest/kernels/tree_utils_test.cc
+++ b/tensorflow/contrib/tensor_forest/kernels/tree_utils_test.cc
@@ -44,11 +44,13 @@ TEST(TestWeightedVariance, Basic) {
   Tensor squares = test::AsTensor<float>({29, 12}, {2});
 
   EXPECT_FLOAT_EQ(WeightedVariance(sums.unaligned_flat<float>(),
-                                   squares.unaligned_flat<float>(), 3), 2.0);
+                                   squares.unaligned_flat<float>(), 3),
+                  2.0);
 
   Tensor zero = test::AsTensor<float>({0}, {1});
   EXPECT_FLOAT_EQ(WeightedVariance(zero.unaligned_flat<float>(),
-                                   zero.unaligned_flat<float>(), 1), 0);
+                                   zero.unaligned_flat<float>(), 1),
+                  0);
 }
 
 TEST(TestInitialize, Basic) {
@@ -94,17 +96,16 @@ TEST(BestFeatureClassification, Basic) {
   const int32 num_accumulators = 4;
   const int32 num_splits = 3;
   const int32 num_classes = 4;
-  Tensor totals = test::AsTensor<float>({1, 5, 6, 7,
-                                         0, 0, 0, 0,
-                                         30, 10, 10, 10,      // this one
-                                         -1, -1, -1, -1},
-                                        {num_accumulators, num_classes});
-  Tensor splits = test::AsTensor<float>(
-      {1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4,
-       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-       30, 10, 10, 10, 10, 0, 0, 10, 19, 5, 6, 8,  // this one
-       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
-      {num_accumulators, num_splits, num_classes});
+  Tensor totals = test::AsTensor<float>(
+      {1, 5, 6, 7, 0, 0, 0, 0, 30, 10, 10, 10,  // this one
+       -1, -1, -1, -1},
+      {num_accumulators, num_classes});
+  Tensor splits =
+      test::AsTensor<float>({1,  2,  3,  4,  1,  2,  3,  4,  1,  2,  3,  4,  0,
+                             0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  30, 10,
+                             10, 10, 10, 0,  0,  10, 19, 5,  6,  8,  // this one
+                             -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+                            {num_accumulators, num_splits, num_classes});
 
   EXPECT_EQ(BestFeatureClassification(totals, splits, 2), 1);
 }
@@ -114,17 +115,16 @@ TEST(BestFeatureClassification, NoWinner) {
   const int32 num_splits = 3;
   const int32 num_classes = 4;
   // When counts are all the same, the most reasonable thing to do is pick 0.
-  Tensor totals = test::AsTensor<float>({1, 5, 6, 7,
-                                         0, 0, 0, 0,
-                                         18, 6, 6, 6,      // this one
-                                         -1, -1, -1, -1},
-                                        {num_accumulators, num_classes});
-  Tensor splits = test::AsTensor<float>(
-      {1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4,
-       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-       9, 3, 3, 3, 9, 3, 3, 3, 9, 3, 3, 3,     // this one
-       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
-      {num_accumulators, num_splits, num_classes});
+  Tensor totals =
+      test::AsTensor<float>({1, 5, 6, 7, 0, 0, 0, 0, 18, 6, 6, 6,  // this one
+                             -1, -1, -1, -1},
+                            {num_accumulators, num_classes});
+  Tensor splits =
+      test::AsTensor<float>({1,  2,  3,  4,  1,  2,  3,  4,  1,  2,  3,  4, 0,
+                             0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  9, 3,
+                             3,  3,  9,  3,  3,  3,  9,  3,  3,  3,  // this one
+                             -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+                            {num_accumulators, num_splits, num_classes});
 
   EXPECT_EQ(BestFeatureClassification(totals, splits, 2), 0);
 }
@@ -133,36 +133,34 @@ TEST(BestFeatureRegression, Basic) {
   const int32 num_accumulators = 4;
   const int32 num_splits = 3;
   const int32 num_classes = 4;
-  Tensor total_sums = test::AsTensor<float>(
-      {1, 5, 6, 7,
-       0, 0, 0, 0,
-       10, 8, 6, 9,      // this one
-       -1, -1, -1, -1},
-      {num_accumulators, num_classes});
+  Tensor total_sums =
+      test::AsTensor<float>({1, 5, 6, 7, 0, 0, 0, 0, 10, 8, 6, 9,  // this one
+                             -1, -1, -1, -1},
+                            {num_accumulators, num_classes});
   Tensor total_squares = test::AsTensor<float>(
-      {1, 5, 6, 7,
-       0, 0, 0, 0,
-       100, 50, 40, 45,      // this one
+      {1, 5, 6, 7, 0, 0, 0, 0, 100, 50, 40, 45,  // this one
        -1, -1, -1, -1},
       {num_accumulators, num_classes});
 
-  Tensor split_sums = test::AsTensor<float>(
-      {1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4,
-       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-       10, 8, 6, 9, 9, 8, 5, 9, 0, 0, 0, 0,      // this one
-       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
-      {num_accumulators, num_splits, num_classes});
+  Tensor split_sums =
+      test::AsTensor<float>({1,  2,  3,  4,  1,  2,  3,  4,  1,  2,  3,  4,  0,
+                             0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  10, 8,
+                             6,  9,  9,  8,  5,  9,  0,  0,  0,  0,  // this one
+                             -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+                            {num_accumulators, num_splits, num_classes});
 
   // lower the variance by lowering one of the squares just a little.
-  Tensor split_squares = test::AsTensor<float>(
-      {1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4,
-       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-       100, 50, 40, 45, 100, 50, 40, 43, 0, 0, 0, 0,    // this one
-       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
-      {num_accumulators, num_splits, num_classes});
+  Tensor split_squares =
+      test::AsTensor<float>(
+          {1,   2,  3,  4,  1,   2,  3,  4,  1,  2,  3,  4,
+           0,   0,  0,  0,  0,   0,  0,  0,  0,  0,  0,  0,
+           100, 50, 40, 45, 100, 50, 40, 43, 0,  0,  0,  0,  // this one
+           -1,  -1, -1, -1, -1,  -1, -1, -1, -1, -1, -1, -1},
+          {num_accumulators, num_splits, num_classes});
 
   EXPECT_EQ(BestFeatureRegression(total_sums, total_squares, split_sums,
-                                  split_squares, 2), 1);
+                                  split_squares, 2),
+            1);
 }
 
 TEST(BestFeatureRegression, NoWinner) {
@@ -170,37 +168,33 @@ TEST(BestFeatureRegression, NoWinner) {
   const int32 num_splits = 3;
   const int32 num_classes = 4;
   // when counts are all the same, the most reasonable thing to do is pick 0.
-  Tensor total_sums = test::AsTensor<float>(
-      {1, 5, 6, 7,
-       0, 0, 0, 0,
-       10, 8, 6, 9,      // this one
-       -1, -1, -1, -1},
-      {num_accumulators, num_classes});
+  Tensor total_sums =
+      test::AsTensor<float>({1, 5, 6, 7, 0, 0, 0, 0, 10, 8, 6, 9,  // this one
+                             -1, -1, -1, -1},
+                            {num_accumulators, num_classes});
   Tensor total_squares = test::AsTensor<float>(
-      {1, 5, 6, 7,
-       0, 0, 0, 0,
-       100, 50, 40, 45,      // this one
+      {1, 5, 6, 7, 0, 0, 0, 0, 100, 50, 40, 45,  // this one
        -1, -1, -1, -1},
       {num_accumulators, num_classes});
 
-  Tensor split_sums = test::AsTensor<float>(
-      {1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4,
-       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-       10, 8, 6, 9, 10, 8, 6, 9, 10, 8, 6, 9,      // this one
-       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
-      {num_accumulators, num_splits, num_classes});
+  Tensor split_sums =
+      test::AsTensor<float>({1,  2,  3,  4,  1,  2,  3,  4,  1,  2,  3,  4,  0,
+                             0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  10, 8,
+                             6,  9,  10, 8,  6,  9,  10, 8,  6,  9,  // this one
+                             -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+                            {num_accumulators, num_splits, num_classes});
 
   Tensor split_squares = test::AsTensor<float>(
-      {1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4,
-       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-       100, 50, 40, 45, 100, 50, 40, 45, 100, 50, 40, 45,    // this one
-       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+      {1,   2,  3,  4,  1,   2,  3,  4,  1,   2,  3,  4,
+       0,   0,  0,  0,  0,   0,  0,  0,  0,   0,  0,  0,
+       100, 50, 40, 45, 100, 50, 40, 45, 100, 50, 40, 45,  // this one
+       -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1},
       {num_accumulators, num_splits, num_classes});
 
   EXPECT_EQ(BestFeatureRegression(total_sums, total_squares, split_sums,
-                                  split_squares, 2), 0);
+                                  split_squares, 2),
+            0);
 }
 
 }  // namespace tensorforest
 }  // namespace tensorflow
-
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/candidate_graph_runner.cc b/tensorflow/contrib/tensor_forest/kernels/v4/candidate_graph_runner.cc
index 81e2a1b2a1b720574210e376fa786923367794a6..f4a7058ddb8bfdd6393a9369006aabc29d058d3b 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/candidate_graph_runner.cc
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/candidate_graph_runner.cc
@@ -14,8 +14,8 @@
 // =============================================================================
 #include "tensorflow/contrib/tensor_forest/kernels/v4/candidate_graph_runner.h"
 
-#include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/platform/env.h"
 
 namespace tensorflow {
@@ -58,8 +58,7 @@ CandidateGraphRunner::CandidateGraphRunner(
   // Features don't change, store them in a tensor.
   const auto& oblique = split.inequality_left_child_test().oblique();
   const int32 feat_size = oblique.features_size();
-  features_.reset(
-      new Tensor(tensorflow::DT_INT32, TensorShape({feat_size})));
+  features_.reset(new Tensor(tensorflow::DT_INT32, TensorShape({feat_size})));
   auto feat = features_->flat<int32>();
   int i = 0;
   for (const auto& id : oblique.features()) {
@@ -67,10 +66,10 @@ CandidateGraphRunner::CandidateGraphRunner(
   }
 }
 
-void CandidateGraphRunner::RunOp(
-    const string& name, const TensorNameValueList& inputs,
-    const std::vector<string>& output_tensor_names,
-    std::vector<Tensor>* outputs) {
+void CandidateGraphRunner::RunOp(const string& name,
+                                 const TensorNameValueList& inputs,
+                                 const std::vector<string>& output_tensor_names,
+                                 std::vector<Tensor>* outputs) {
   std::vector<string> op_name;
   if (name != kNoOp) {
     op_name.push_back(name);
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/decision-tree-resource.h b/tensorflow/contrib/tensor_forest/kernels/v4/decision-tree-resource.h
index cced26b9036ba8ba6c5994b7483261a062f80588..328af28725af016e90b30ae2d303ffba15c81c1f 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/decision-tree-resource.h
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/decision-tree-resource.h
@@ -26,7 +26,6 @@
 namespace tensorflow {
 namespace tensorforest {
 
-
 // Keep a tree ensemble in memory for efficient evaluation and mutation.
 class DecisionTreeResource : public ResourceBase {
  public:
@@ -35,15 +34,12 @@ class DecisionTreeResource : public ResourceBase {
 
   string DebugString() override {
     return strings::StrCat("DecisionTree[size=",
-                           decision_tree_->decision_tree().nodes_size(),
-                           "]");
+                           decision_tree_->decision_tree().nodes_size(), "]");
   }
 
   void MaybeInitialize();
 
-  const decision_trees::Model& decision_tree() const {
-    return *decision_tree_;
-  }
+  const decision_trees::Model& decision_tree() const { return *decision_tree_; }
 
   decision_trees::Model* mutable_decision_tree() {
     return decision_tree_.get();
@@ -59,9 +55,7 @@ class DecisionTreeResource : public ResourceBase {
 
   // Resets the resource and frees the proto.
   // Caller needs to hold the mutex lock while calling this.
-  void Reset() {
-    decision_tree_.reset(new decision_trees::Model());
-  }
+  void Reset() { decision_tree_.reset(new decision_trees::Model()); }
 
   mutex* get_mutex() { return &mu_; }
 
@@ -84,7 +78,6 @@ class DecisionTreeResource : public ResourceBase {
   std::vector<std::unique_ptr<DecisionNodeEvaluator>> node_evaluators_;
 };
 
-
 }  // namespace tensorforest
 }  // namespace tensorflow
 
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator.h b/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator.h
index 85ce7b825b11983307370bb3ac30eeec9b6b2c99..bf2b2aaa3c8f433ab4fc145217857112f7a0a579 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator.h
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator.h
@@ -22,7 +22,6 @@
 namespace tensorflow {
 namespace tensorforest {
 
-
 // Base class for evaluators of decision nodes that effectively copy proto
 // contents into C++ structures for faster execution.
 class DecisionNodeEvaluator {
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator_test.cc b/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator_test.cc
index 5c49b87443e7b1f4ef532256ae2efdc9fa985d8a..af5cf72a3c0bea0eef45c3446acf52ff389c6751 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator_test.cc
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator_test.cc
@@ -20,11 +20,11 @@
 namespace tensorflow {
 namespace {
 
+using tensorflow::decision_trees::InequalityTest;
+using tensorflow::decision_trees::MatchingValuesTest;
 using tensorflow::tensorforest::InequalityDecisionNodeEvaluator;
 using tensorflow::tensorforest::MatchingValuesDecisionNodeEvaluator;
 using tensorflow::tensorforest::ObliqueInequalityDecisionNodeEvaluator;
-using tensorflow::decision_trees::InequalityTest;
-using tensorflow::decision_trees::MatchingValuesTest;
 
 TEST(InequalityDecisionNodeEvaluatorTest, TestLessOrEqual) {
   InequalityTest test;
@@ -124,4 +124,3 @@ TEST(ObliqueDecisionNodeEvaluatorTest, Basic) {
 
 }  // namespace
 }  // namespace tensorflow
-
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/fertile-stats-resource.h b/tensorflow/contrib/tensor_forest/kernels/v4/fertile-stats-resource.h
index 0d6712e9e552d7045eb198f7e65d04eb42eff920..eea0be27caf0a022ba7acaacd359c75a2df4eedb 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/fertile-stats-resource.h
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/fertile-stats-resource.h
@@ -40,9 +40,7 @@ class FertileStatsResource : public ResourceBase {
     model_op_ = LeafModelOperatorFactory::CreateLeafModelOperator(params_);
   }
 
-  string DebugString() override {
-    return "FertileStats";
-  }
+  string DebugString() override { return "FertileStats"; }
 
   void ExtractFromProto(const FertileStats& stats);
 
@@ -50,8 +48,7 @@ class FertileStatsResource : public ResourceBase {
 
   // Resets the resource and frees the proto.
   // Caller needs to hold the mutex lock while calling this.
-  void Reset() {
-  }
+  void Reset() {}
 
   // Reset the stats for a node, but leave the leaf_stats intact.
   void ResetSplitStats(int32 node_id, int32 depth) {
@@ -84,7 +81,6 @@ class FertileStatsResource : public ResourceBase {
   // was found.
   bool BestSplit(int32 node_id, SplitCandidate* best, int32* depth);
 
-
  private:
   mutex mu_;
   std::shared_ptr<LeafModelOperator> model_op_;
@@ -94,7 +90,6 @@ class FertileStatsResource : public ResourceBase {
   void AllocateNode(int32 node_id, int32 depth);
 };
 
-
 }  // namespace tensorforest
 }  // namespace tensorflow
 
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats.cc b/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats.cc
index 3ce630e3a9691b87ad291a9f29616f741953dd84..da600d34eacdf27514709240723e5bb730cfe7f0 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats.cc
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats.cc
@@ -20,7 +20,6 @@
 #include "tensorflow/contrib/tensor_forest/kernels/v4/stat_utils.h"
 #include "tensorflow/core/lib/random/distribution_sampler.h"
 
-
 namespace tensorflow {
 namespace tensorforest {
 
@@ -454,14 +453,14 @@ void DenseClassificationGrowStats::PackToProto(FertileSlot* slot) const {
     class_stats->add_value()->set_float_value(total_counts_[i]);
   }
 
-  for (int split_num = 0;  split_num < num_splits(); ++split_num) {
+  for (int split_num = 0; split_num < num_splits(); ++split_num) {
     auto* cand = slot->add_candidates();
     *cand->mutable_split() = splits_[split_num];
     auto* left_stats = cand->mutable_left_stats()
                            ->mutable_classification()
                            ->mutable_dense_counts();
     for (int i = 0; i < num_outputs_; ++i) {
-       left_stats->add_value()->set_float_value(left_count(split_num, i));
+      left_stats->add_value()->set_float_value(left_count(split_num, i));
     }
   }
 }
@@ -546,7 +545,7 @@ void SparseClassificationGrowStats::PackToProto(FertileSlot* slot) const {
     (*class_stats)[entry.first] = val;
   }
 
-  for (int split_num = 0;  split_num < num_splits(); ++split_num) {
+  for (int split_num = 0; split_num < num_splits(); ++split_num) {
     auto* cand = slot->add_candidates();
     *cand->mutable_split() = splits_[split_num];
     auto* left_stats = cand->mutable_left_stats()
@@ -561,8 +560,8 @@ void SparseClassificationGrowStats::PackToProto(FertileSlot* slot) const {
   }
 }
 
-float SparseClassificationGrowStats::GiniScore(
-    int split, float* left_sum, float* right_sum) const {
+float SparseClassificationGrowStats::GiniScore(int split, float* left_sum,
+                                               float* right_sum) const {
   float left_square = 0, right_square = 0;
   *left_sum = 0;
   *right_sum = 0;
@@ -844,12 +843,11 @@ void LeastSquaresRegressionGrowStats::PackToProto(FertileSlot* slot) const {
     total_squares->add_value()->set_float_value(total_sum_squares_[i]);
   }
 
-  for (int split_num = 0;  split_num < num_splits(); ++split_num) {
+  for (int split_num = 0; split_num < num_splits(); ++split_num) {
     auto* cand = slot->add_candidates();
     *cand->mutable_split() = splits_[split_num];
-    auto* sums = cand->mutable_left_stats()
-                           ->mutable_regression()
-                           ->mutable_mean_output();
+    auto* sums =
+        cand->mutable_left_stats()->mutable_regression()->mutable_mean_output();
     auto* squares = cand->mutable_left_stats()
                         ->mutable_regression()
                         ->mutable_mean_output_squares();
@@ -891,20 +889,17 @@ float LeastSquaresRegressionGrowStats::SplitVariance(int split) const {
   float total_variance = 0;
   for (int i = 0; i < params_.num_outputs(); ++i) {
     // Left side
-    const float le_x =
-        left_sum(split, i) / left_counts_[split];
+    const float le_x = left_sum(split, i) / left_counts_[split];
 
-    const float le_x2 =
-        left_square(split, i) / left_counts_[split];
+    const float le_x2 = left_square(split, i) / left_counts_[split];
     total_variance += le_x2 - le_x * le_x;
 
     // Right side
     const float re_x = (total_sum_[i] - left_sum(split, i)) /
                        (weight_sum_ - left_counts_[split]);
 
-    const float re_x2 =
-        (total_sum_squares_[i] - left_square(split, i)) /
-        (weight_sum_ - left_counts_[split]);
+    const float re_x2 = (total_sum_squares_[i] - left_square(split, i)) /
+                        (weight_sum_ - left_counts_[split]);
     total_variance += re_x2 - re_x * re_x;
   }
   return total_variance;
@@ -937,8 +932,7 @@ bool LeastSquaresRegressionGrowStats::BestSplit(SplitCandidate* best) const {
   left->set_weight_sum(left_counts_[best_index]);
   auto* left_output_sum = left_reg_stats->mutable_mean_output();
   for (int i = 0; i < num_outputs; ++i) {
-    left_output_sum->add_value()->set_float_value(
-        left_sum(best_index, i));
+    left_output_sum->add_value()->set_float_value(left_sum(best_index, i));
   }
 
   // Right
@@ -947,8 +941,8 @@ bool LeastSquaresRegressionGrowStats::BestSplit(SplitCandidate* best) const {
   right->set_weight_sum(weight_sum_ - left_counts_[best_index]);
   auto* right_output_sum = right_reg_stats->mutable_mean_output();
   for (int i = 0; i < num_outputs; ++i) {
-    right_output_sum->add_value()->set_float_value(
-        total_sum_[i] - left_sum(best_index, i));
+    right_output_sum->add_value()->set_float_value(total_sum_[i] -
+                                                   left_sum(best_index, i));
   }
   return true;
 }
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats.h b/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats.h
index f938d08c84d72b4c5a71e8f7fb0f639aa70e3e49..04e6b0a735320dd024e326a94ef910593a326245 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats.h
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats.h
@@ -73,21 +73,15 @@ class GrowStats {
       const InputTarget* target, int example) {}
   void RemoveSplit(int split_num);
 
-  int num_splits() const {
-    return splits_.size();
-  }
+  int num_splits() const { return splits_.size(); }
 
-  float weight_sum() const {
-    return weight_sum_;
-  }
+  float weight_sum() const { return weight_sum_; }
 
   virtual bool IsInitialized() const {
     return weight_sum_ > 0 || splits_.size() == num_splits_to_consider_;
   }
 
-  int32 depth() const {
-    return depth_;
-  }
+  int32 depth() const { return depth_; }
 
  protected:
   GrowStats(const TensorForestParams& params, int32 depth);
@@ -206,8 +200,8 @@ class ClassificationStats : public GrowStats {
   virtual float left_count(int split, int class_num) const = 0;
   virtual float right_count(int split, int class_num) const = 0;
 
-  virtual void ClassificationAddLeftExample(
-      int split, int64 int_label, float weight) = 0;
+  virtual void ClassificationAddLeftExample(int split, int64 int_label,
+                                            float weight) = 0;
   virtual void ClassificationAddRightExample(int split, int64 int_label,
                                              float weight) {
     // Does nothing by default, but sub-classes can override.
@@ -316,7 +310,7 @@ class DenseClassificationGrowStats : public ClassificationStats {
   void PackToProto(FertileSlot* slot) const override;
 
   void InitLeafClassStats(int best_split_index, LeafStat* left_stats,
-                          LeafStat* right_stats) const;
+                          LeafStat* right_stats) const override;
 
  protected:
   void ClassificationAddSplitStats() override {
@@ -375,15 +369,13 @@ class SparseClassificationGrowStats : public ClassificationStats {
   SparseClassificationGrowStats(const TensorForestParams& params, int32 depth)
       : ClassificationStats(params, depth) {}
 
-  void Initialize() override {
-    Clear();
-  }
+  void Initialize() override { Clear(); }
 
   void ExtractFromProto(const FertileSlot& slot) override;
   void PackToProto(FertileSlot* slot) const override;
 
   void InitLeafClassStats(int best_split_index, LeafStat* left_stats,
-                          LeafStat* right_stats) const;
+                          LeafStat* right_stats) const override;
 
  protected:
   void ClassificationAddSplitStats() override {
@@ -562,9 +554,9 @@ class LeastSquaresRegressionGrowStats : public GrowStats {
   }
   void RemoveSplitStats(int split_num) override {
     left_sums_.erase(left_sums_.begin() + num_outputs_ * split_num,
-                       left_sums_.begin() + num_outputs_ * (split_num + 1));
+                     left_sums_.begin() + num_outputs_ * (split_num + 1));
     left_squares_.erase(left_squares_.begin() + num_outputs_ * split_num,
-                       left_squares_.begin() + num_outputs_ * (split_num + 1));
+                        left_squares_.begin() + num_outputs_ * (split_num + 1));
     left_counts_.erase(left_counts_.begin() + split_num,
                        left_counts_.begin() + (split_num + 1));
   }
@@ -605,7 +597,6 @@ class LeastSquaresRegressionGrowStats : public GrowStats {
   std::vector<int64> left_counts_;
 };
 
-
 }  // namespace tensorforest
 }  // namespace tensorflow
 
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats_test.cc b/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats_test.cc
index ceb58d2ead5c2f148c96d9cb9532a73688593d33..26e989928e00de1b2ae1646abf216adfbec2be4f 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats_test.cc
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats_test.cc
@@ -24,21 +24,21 @@
 namespace tensorflow {
 namespace {
 
-using tensorflow::tensorforest::GrowStats;
-using tensorflow::tensorforest::TestableInputTarget;
-using tensorflow::tensorforest::FertileSlot;
+using tensorflow::decision_trees::BinaryNode;
+using tensorflow::decision_trees::FeatureId;
+using tensorflow::decision_trees::InequalityTest;
 using tensorflow::tensorforest::DenseClassificationGrowStats;
-using tensorflow::tensorforest::SparseClassificationGrowStats;
+using tensorflow::tensorforest::FertileSlot;
 using tensorflow::tensorforest::FixedSizeClassStats;
 using tensorflow::tensorforest::FixedSizeSparseClassificationGrowStats;
+using tensorflow::tensorforest::GrowStats;
 using tensorflow::tensorforest::LeastSquaresRegressionGrowStats;
-using tensorflow::tensorforest::TensorForestParams;
+using tensorflow::tensorforest::SparseClassificationGrowStats;
 using tensorflow::tensorforest::SPLIT_FINISH_BASIC;
 using tensorflow::tensorforest::SPLIT_FINISH_DOMINATE_HOEFFDING;
 using tensorflow::tensorforest::SPLIT_PRUNE_HOEFFDING;
-using tensorflow::decision_trees::BinaryNode;
-using tensorflow::decision_trees::InequalityTest;
-using tensorflow::decision_trees::FeatureId;
+using tensorflow::tensorforest::TensorForestParams;
+using tensorflow::tensorforest::TestableInputTarget;
 
 BinaryNode MakeSplit(const string& feat, float val) {
   BinaryNode split;
@@ -52,8 +52,7 @@ BinaryNode MakeSplit(const string& feat, float val) {
   return split;
 }
 
-void RunBatch(GrowStats* stats,
-              const TestableInputTarget* target) {
+void RunBatch(GrowStats* stats, const TestableInputTarget* target) {
   std::unique_ptr<tensorflow::tensorforest::TensorDataSet> dataset(
       new tensorflow::tensorforest::TestableDataSet(
           {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}, 2));
@@ -102,18 +101,10 @@ class TestableRunningStats : public DenseClassificationGrowStats {
   TestableRunningStats(const TensorForestParams& params, int32 depth)
       : DenseClassificationGrowStats(params, depth) {}
 
-  float test_left_sum(int split) {
-    return get_left_gini()->sum(split);
-  }
-  float test_left_square(int split) {
-    return get_left_gini()->square(split);
-  }
-  float test_right_sum(int split) {
-    return get_right_gini()->sum(split);
-  }
-  float test_right_square(int split) {
-    return get_right_gini()->square(split);
-  }
+  float test_left_sum(int split) { return get_left_gini()->sum(split); }
+  float test_left_square(int split) { return get_left_gini()->square(split); }
+  float test_right_sum(int split) { return get_right_gini()->sum(split); }
+  float test_right_square(int split) { return get_right_gini()->square(split); }
 };
 
 TEST(GrowStatsDenseClassificationTest, BasicRunningStats) {
@@ -166,9 +157,7 @@ class TestableFinishEarly : public DenseClassificationGrowStats {
   int num_times_called_;
 
  protected:
-  void CheckFinishEarlyHoeffding() override {
-    ++num_times_called_;
-  }
+  void CheckFinishEarlyHoeffding() override { ++num_times_called_; }
 };
 
 TEST(GrowStatsDenseClassificationTest, TestFinishEarly) {
@@ -212,7 +201,6 @@ TEST(GrowStatsDenseClassificationTest, TestFinishEarly) {
   ASSERT_EQ(stat->num_times_called_, 9);
 }
 
-
 TEST(GrowStatsDenseClassificationTest, TestCheckPruneHoeffding) {
   TensorForestParams params;
   params.set_num_outputs(2);
@@ -224,7 +212,8 @@ TEST(GrowStatsDenseClassificationTest, TestCheckPruneHoeffding) {
   finish->set_type(SPLIT_FINISH_BASIC);
   finish->mutable_check_every_steps()->set_constant_value(100);
   params.mutable_pruning_type()->set_type(SPLIT_PRUNE_HOEFFDING);
-  params.mutable_pruning_type()->mutable_prune_every_samples()
+  params.mutable_pruning_type()
+      ->mutable_prune_every_samples()
       ->set_constant_value(1);
 
   // On each iteration, we add two examples, one of class 0 and one
@@ -234,8 +223,8 @@ TEST(GrowStatsDenseClassificationTest, TestCheckPruneHoeffding) {
   std::vector<float> weights = {1, 1};
   TestableInputTarget target(labels, weights, 1);
   std::unique_ptr<tensorflow::tensorforest::TensorDataSet> dataset(
-      new tensorflow::tensorforest::TestableDataSet(
-          {-1.0, -1.0, 1.0, -1.0}, 2));
+      new tensorflow::tensorforest::TestableDataSet({-1.0, -1.0, 1.0, -1.0},
+                                                    2));
 
   DenseClassificationGrowStats stats(params, 1);
   stats.Initialize();
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/input_data.cc b/tensorflow/contrib/tensor_forest/kernels/v4/input_data.cc
index 14cb19d36f33e478728aba3e28b7bea11b691d34..d43884481afbbbc988d6eb80e01e49663df6914b 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/input_data.cc
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/input_data.cc
@@ -21,8 +21,6 @@ namespace tensorflow {
 namespace tensorforest {
 namespace {
 
-const int32 SPARSE_DEFAULT = 0;
-
 bool DecideInequalityTest(const decision_trees::InequalityTest& test,
                           float value) {
   float bias = test.threshold().float_value();
@@ -111,10 +109,10 @@ void TensorDataSet::set_input_tensors(const Tensor& dense,
     dense_data_.reset(new DenseStorageType(dense.tensor<float, 2>()));
   }
   if (sparse_indices.shape().dims() == 2) {
-    sparse_indices_.reset(new SparseIndicesStorageType(
-        sparse_indices.tensor<int64, 2>()));
-    sparse_values_.reset(new SparseValuesStorageType(
-        sparse_values.tensor<float, 1>()));
+    sparse_indices_.reset(
+        new SparseIndicesStorageType(sparse_indices.tensor<int64, 2>()));
+    sparse_values_.reset(
+        new SparseValuesStorageType(sparse_values.tensor<float, 1>()));
     sparse_batch_size_ = sparse_shape.tensor<int64, 1>()(0);
   }
   original_dense_tensor_ = dense;
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/input_data.h b/tensorflow/contrib/tensor_forest/kernels/v4/input_data.h
index eafad6b591672f67ae816405ff603f9aaba30a1b..c544a8c75e9bfe8fe6bbea8913e7be17d868bfef 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/input_data.h
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/input_data.h
@@ -93,9 +93,7 @@ class TensorDataSet {
   // an int32 you can avoid the atoi32.
   virtual float GetExampleValue(int example, int32 feature_id) const;
 
-  int num_features() {
-    return available_features_.size();
-  }
+  int num_features() { return available_features_.size(); }
 
   const Tensor& original_tensor() const { return original_dense_tensor_; }
 
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/input_target.h b/tensorflow/contrib/tensor_forest/kernels/v4/input_target.h
index 44ec09c50ef3d092bd1bf7f051f492e1fffdd05b..d4402b6055a36d38042a0e6cfa07b532ec11c093 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/input_target.h
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/input_target.h
@@ -79,9 +79,7 @@ class TensorInputTarget : public StoredInputTarget<SingleDimStorageType> {
     return (*target_)(example_index * num_targets_ + target_index);
   }
 
-  const Tensor& original_tensor() const {
-    return original_tensor_;
-  }
+  const Tensor& original_tensor() const { return original_tensor_; }
 
  protected:
   Tensor original_tensor_;
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/leaf_model_operators.cc b/tensorflow/contrib/tensor_forest/kernels/v4/leaf_model_operators.cc
index d43c068e462ff78b114fb29bd8cf0ee0c6080fcd..83614a25314117ef9ba29b4dcf6ebee8f7f3e226 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/leaf_model_operators.cc
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/leaf_model_operators.cc
@@ -160,6 +160,5 @@ void RegressionLeafModelOperator::ExportModel(
   }
 }
 
-
 }  // namespace tensorforest
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/leaf_model_operators_test.cc b/tensorflow/contrib/tensor_forest/kernels/v4/leaf_model_operators_test.cc
index ffd92c01f9a59719e6bb2458c2f28253c364a2e8..ab4191809b6a7400114acf85991c74acfac55505 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/leaf_model_operators_test.cc
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/leaf_model_operators_test.cc
@@ -26,19 +26,19 @@ namespace {
 using tensorflow::decision_trees::Leaf;
 using tensorflow::tensorforest::DenseClassificationLeafModelOperator;
 using tensorflow::tensorforest::LeafModelOperator;
-using tensorflow::tensorforest::SparseClassificationLeafModelOperator;
-using tensorflow::tensorforest::SparseOrDenseClassificationLeafModelOperator;
 using tensorflow::tensorforest::LeafStat;
 using tensorflow::tensorforest::RegressionLeafModelOperator;
-using tensorflow::tensorforest::TestableInputTarget;
+using tensorflow::tensorforest::SparseClassificationLeafModelOperator;
+using tensorflow::tensorforest::SparseOrDenseClassificationLeafModelOperator;
 using tensorflow::tensorforest::TensorForestParams;
+using tensorflow::tensorforest::TestableInputTarget;
 
 const int32 kNumClasses = 3;
 
 constexpr char kRegressionStatProto[] =
-  "weight_sum: 3 "
-  "regression { "
-  "mean_output { "
+    "weight_sum: 3 "
+    "regression { "
+    "mean_output { "
     "value { "
     "  float_value: 27 "
     "} "
@@ -48,8 +48,8 @@ constexpr char kRegressionStatProto[] =
     "value { "
     "  float_value: 10 "
     "} "
-  "} "
-  "mean_output_squares { "
+    "} "
+    "mean_output_squares { "
     "value {"
     "  float_value: 245"
     "}"
@@ -59,8 +59,8 @@ constexpr char kRegressionStatProto[] =
     "value {"
     "  float_value: 46"
     "}"
-  "}"
-"}";
+    "}"
+    "}";
 
 void TestClassificationNormalUse(const std::unique_ptr<LeafModelOperator>& op) {
   Leaf l;
@@ -83,7 +83,6 @@ void TestClassificationNormalUse(const std::unique_ptr<LeafModelOperator>& op) {
   EXPECT_FLOAT_EQ(op->GetOutputValue(l, 1), 3.4);
 }
 
-
 TEST(DenseLeafModelOperatorsTest, NormalUse) {
   TensorForestParams params;
   params.set_num_outputs(kNumClasses);
@@ -182,7 +181,7 @@ TEST(SparseLeafModelOperatorsTest, InitWithExisting) {
 
   std::unique_ptr<Leaf> leaf(new Leaf);
 
-  op->ExportModel( *stat, leaf.get());
+  op->ExportModel(*stat, leaf.get());
 
   // Make sure it was initialized correctly.
   EXPECT_FLOAT_EQ(op->GetOutputValue(*leaf, 0), 1.1);
@@ -194,7 +193,6 @@ TEST(SparseLeafModelOperatorsTest, InitWithExisting) {
   EXPECT_EQ(leaf->sparse_vector().sparse_value().size(), kNumClasses);
 }
 
-
 TEST(RegressionLeafModelOperatorsTest, NormalUse) {
   TensorForestParams params;
   params.set_num_outputs(kNumClasses);
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/params.h b/tensorflow/contrib/tensor_forest/kernels/v4/params.h
index b0ed949424756cc498d4b7ad1fb1867fff11b265..7583e3d0402a3a1d07f3696727b285747dc887de 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/params.h
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/params.h
@@ -24,7 +24,6 @@ namespace tensorforest {
 // Return the value of the given depth-dependent parameter given a leaf's depth.
 float ResolveParam(const DepthDependentParam& param, int32 depth);
 
-
 }  // namespace tensorforest
 }  // namespace tensorflow
 
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/params_test.cc b/tensorflow/contrib/tensor_forest/kernels/v4/params_test.cc
index 801881af1368dc33f00b356d12bea07ae3161ef6..4010a71006d58df0bec6d3686a9c47433b46fdd4 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/params_test.cc
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/params_test.cc
@@ -71,5 +71,3 @@ TEST(ParamsTest, TestThreshold) {
 }
 
 }  // namespace
-
-
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/split_collection_operators.cc b/tensorflow/contrib/tensor_forest/kernels/v4/split_collection_operators.cc
index cdb1d80a4bbd47d1481ecde2348bef500bd125f1..b7b60d0ab8c2670cec8b029d1f42c5edd3690afe 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/split_collection_operators.cc
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/split_collection_operators.cc
@@ -52,8 +52,8 @@ std::unique_ptr<GrowStats> SplitCollectionOperator::CreateGrowStats(
           new SparseClassificationGrowStats(params_, depth));
 
     case STATS_LEAST_SQUARES_REGRESSION:
-      return std::unique_ptr<GrowStats>(new LeastSquaresRegressionGrowStats(
-          params_, depth));
+      return std::unique_ptr<GrowStats>(
+          new LeastSquaresRegressionGrowStats(params_, depth));
 
     case STATS_FIXED_SIZE_SPARSE_GINI:
       return std::unique_ptr<GrowStats>(
@@ -136,8 +136,7 @@ void SplitCollectionOperator::CreateAndInitializeCandidateWithExample(
   stats_.at(node_id)->AddSplit(split, input_data, target, example);
 }
 
-bool SplitCollectionOperator::BestSplit(int32 node_id,
-                                        SplitCandidate* best,
+bool SplitCollectionOperator::BestSplit(int32 node_id, SplitCandidate* best,
                                         int32* depth) const {
   auto* slot = stats_.at(node_id).get();
   *depth = slot->depth();
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/split_collection_operators.h b/tensorflow/contrib/tensor_forest/kernels/v4/split_collection_operators.h
index ad52f89faddb15be77644b5dc374aca73c46b149..c606ff98c67f411a5817f0282238fdaf3be03642 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/split_collection_operators.h
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/split_collection_operators.h
@@ -71,9 +71,7 @@ class SplitCollectionOperator {
   }
 
   // Perform any necessary cleanup for any tracked state for the slot.
-  virtual void ClearSlot(int32 node_id) {
-    stats_.erase(node_id);
-  }
+  virtual void ClearSlot(int32 node_id) { stats_.erase(node_id); }
 
   // Return true if slot is fully initialized.
   virtual bool IsInitialized(int32 node_id) const;
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/stat_utils.cc b/tensorflow/contrib/tensor_forest/kernels/v4/stat_utils.cc
index 0bec198e97e8215d2cfdb9ada5355dd5b0d2d97b..c749fbe69e17769c2f2b69bcf541eb0eb8b9e7e8 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/stat_utils.cc
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/stat_utils.cc
@@ -32,9 +32,9 @@ namespace tensorforest {
 //   smoothed_sum = stats.sum() + #_classes
 float GiniImpurity(const LeafStat& stats, int32 num_classes) {
   const float smoothed_sum = num_classes + stats.weight_sum();
-  return 1.0 - (
-      (stats.classification().gini().square()
-       + 2 * stats.weight_sum() + num_classes) / (smoothed_sum * smoothed_sum));
+  return 1.0 - ((stats.classification().gini().square() +
+                 2 * stats.weight_sum() + num_classes) /
+                (smoothed_sum * smoothed_sum));
 }
 
 float WeightedGiniImpurity(const LeafStat& stats, int32 num_classes) {
@@ -46,21 +46,20 @@ void UpdateGini(LeafStat* stats, float old_val, float weight) {
   // Equivalent to stats->square() - old_val * old_val + new_val * new_val,
   // (for new_val = old_val + weight), but more numerically stable.
   stats->mutable_classification()->mutable_gini()->set_square(
-      stats->classification().gini().square()
-      + weight * weight + 2 * old_val * weight);
+      stats->classification().gini().square() + weight * weight +
+      2 * old_val * weight);
 }
 
-
 float Variance(const LeafStat& stats, int output) {
   if (stats.weight_sum() == 0) {
     return 0;
   }
   const float e_x =
-      stats.regression().mean_output().value(output).float_value()
-      / stats.weight_sum();
+      stats.regression().mean_output().value(output).float_value() /
+      stats.weight_sum();
   const auto e_x2 =
-      stats.regression().mean_output_squares().value(output).float_value()
-      / stats.weight_sum();
+      stats.regression().mean_output_squares().value(output).float_value() /
+      stats.weight_sum();
   return e_x2 - e_x * e_x;
 }
 
@@ -75,8 +74,7 @@ float TotalVariance(const LeafStat& stats) {
 float SmoothedGini(float sum, float square, int num_classes) {
   // See comments for GiniImpurity above.
   const float smoothed_sum = num_classes + sum;
-  return 1.0 -
-         (square + 2 * sum + num_classes) / (smoothed_sum * smoothed_sum);
+  return 1.0 - (square + 2 * sum + num_classes) / (smoothed_sum * smoothed_sum);
 }
 
 float WeightedSmoothedGini(float sum, float square, int num_classes) {
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/test_utils.h b/tensorflow/contrib/tensor_forest/kernels/v4/test_utils.h
index 289c81e9d51dbc5d2023f7eabce8c2089748645d..38deb3e3cd816aae5fe66f26cd4b934316d03ce4 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/test_utils.h
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/test_utils.h
@@ -27,9 +27,7 @@ class TestableInputTarget : public StoredInputTarget<std::vector<float>> {
       : StoredInputTarget(new std::vector<float>(t), new std::vector<float>(w),
                           num_t) {}
 
-  int NumItems() const {
-    return target_->size();
-  }
+  int NumItems() const { return target_->size(); }
 
   int32 GetTargetAsClassIndex(int example_index,
                               int target_index) const override {
@@ -51,7 +49,6 @@ class TestableInputTarget : public StoredInputTarget<std::vector<float>> {
   }
 };
 
-
 class TestableDataSet : public TensorDataSet {
  public:
   TestableDataSet(const std::vector<float>& data, int num_features)
diff --git a/tensorflow/contrib/tensorboard/db/BUILD b/tensorflow/contrib/tensorboard/db/BUILD
index 6ff5a9e2b18ead9ea9f77f796b91b05d9b895489..4175d8adb58a85728519042a9870e8c4590232ba 100644
--- a/tensorflow/contrib/tensorboard/db/BUILD
+++ b/tensorflow/contrib/tensorboard/db/BUILD
@@ -40,7 +40,6 @@ cc_library(
     hdrs = ["summary_db_writer.h"],
     copts = tf_copts(),
     deps = [
-        ":schema",
         ":summary_converter",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
diff --git a/tensorflow/contrib/tensorrt/BUILD b/tensorflow/contrib/tensorrt/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..28f571e1f015435b33406d0b802bb8e059c6e5ee
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/BUILD
@@ -0,0 +1,45 @@
+# Description:
+#   Wrap NVIDIA TensorRT (http://developer.nvidia.com/tensorrt) with tensorflow.
+#   APIs are meant to change over time.
+
+package(default_visibility = ["//tensorflow:__subpackages__"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test")
+load(
+    "@local_config_tensorrt//:build_defs.bzl",
+    "if_tensorrt",
+)
+
+tf_cuda_cc_test(
+    name = "tensorrt_test_cc",
+    size = "small",
+    srcs = ["tensorrt_test.cc"],
+    tags = [
+        "manual",
+        "notap",
+    ],
+    deps = [
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ] + if_tensorrt([
+        "@local_config_cuda//cuda:cuda_headers",
+        "@local_config_tensorrt//:nv_infer",
+    ]),
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/contrib/tensorrt/tensorrt_test.cc b/tensorflow/contrib/tensorrt/tensorrt_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e11522ea5bda7f5a303d6ea332148dbd7b17f162
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/tensorrt_test.cc
@@ -0,0 +1,159 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/test.h"
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+#include "cuda/include/cuda.h"
+#include "cuda/include/cuda_runtime_api.h"
+#include "tensorrt/include/NvInfer.h"
+
+namespace tensorflow {
+namespace {
+
+class Logger : public nvinfer1::ILogger {
+ public:
+  void log(nvinfer1::ILogger::Severity severity, const char* msg) override {
+    switch (severity) {
+      case Severity::kINFO:
+        LOG(INFO) << msg;
+        break;
+      case Severity::kWARNING:
+        LOG(WARNING) << msg;
+        break;
+      case Severity::kINTERNAL_ERROR:
+      case Severity::kERROR:
+        LOG(ERROR) << msg;
+        break;
+      default:
+        break;
+    }
+  }
+};
+
+class ScopedWeights {
+ public:
+  ScopedWeights(float value) : value_(value) {
+    w.type = nvinfer1::DataType::kFLOAT;
+    w.values = &value_;
+    w.count = 1;
+  }
+  const nvinfer1::Weights& get() { return w; }
+
+ private:
+  float value_;
+  nvinfer1::Weights w;
+};
+
+const char* kInputTensor = "input";
+const char* kOutputTensor = "output";
+
+// Creates a network to compute y=2x+3.
+nvinfer1::IHostMemory* CreateNetwork() {
+  Logger logger;
+  nvinfer1::IBuilder* builder = nvinfer1::createInferBuilder(logger);
+  ScopedWeights weights(2.0);
+  ScopedWeights bias(3.0);
+
+  nvinfer1::INetworkDefinition* network = builder->createNetwork();
+  // Add the input.
+  auto input = network->addInput(kInputTensor, nvinfer1::DataType::kFLOAT,
+                                 nvinfer1::DimsCHW{1, 1, 1});
+  EXPECT_NE(input, nullptr);
+  // Add the hidden layer.
+  auto layer = network->addFullyConnected(*input, 1, weights.get(), bias.get());
+  EXPECT_NE(layer, nullptr);
+  // Mark the output.
+  auto output = layer->getOutput(0);
+  output->setName(kOutputTensor);
+  network->markOutput(*output);
+  // Build the engine
+  builder->setMaxBatchSize(1);
+  builder->setMaxWorkspaceSize(1 << 10);
+  auto engine = builder->buildCudaEngine(*network);
+  EXPECT_NE(engine, nullptr);
+  // Serialize the engine to create a model, then close everything.
+  nvinfer1::IHostMemory* model = engine->serialize();
+  network->destroy();
+  engine->destroy();
+  builder->destroy();
+  return model;
+}
+
+// Executes the network.
+void Execute(nvinfer1::IExecutionContext& context, const float* input,
+             float* output) {
+  const nvinfer1::ICudaEngine& engine = context.getEngine();
+
+  // We have two bindings: input and output.
+  ASSERT_EQ(engine.getNbBindings(), 2);
+  const int input_index = engine.getBindingIndex(kInputTensor);
+  const int output_index = engine.getBindingIndex(kOutputTensor);
+
+  // Create GPU buffers and a stream
+  void* buffers[2];
+  ASSERT_EQ(0, cudaMalloc(&buffers[input_index], sizeof(float)));
+  ASSERT_EQ(0, cudaMalloc(&buffers[output_index], sizeof(float)));
+  cudaStream_t stream;
+  ASSERT_EQ(0, cudaStreamCreate(&stream));
+
+  // Copy the input to the GPU, execute the network, and copy the output back.
+  //
+  // Note that since the host buffer was not created as pinned memory, these
+  // async copies are turned into sync copies. So the following synchronization
+  // could be removed.
+  ASSERT_EQ(0, cudaMemcpyAsync(buffers[input_index], input, sizeof(float),
+                               cudaMemcpyHostToDevice, stream));
+  context.enqueue(1, buffers, stream, nullptr);
+  ASSERT_EQ(0, cudaMemcpyAsync(output, buffers[output_index], sizeof(float),
+                               cudaMemcpyDeviceToHost, stream));
+  cudaStreamSynchronize(stream);
+
+  // Release the stream and the buffers
+  cudaStreamDestroy(stream);
+  ASSERT_EQ(0, cudaFree(buffers[input_index]));
+  ASSERT_EQ(0, cudaFree(buffers[output_index]));
+}
+
+TEST(TensorrtTest, BasicFunctions) {
+  // Create the network model.
+  nvinfer1::IHostMemory* model = CreateNetwork();
+  // Use the model to create an engine and then an execution context.
+  Logger logger;
+  nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(logger);
+  nvinfer1::ICudaEngine* engine =
+      runtime->deserializeCudaEngine(model->data(), model->size(), nullptr);
+  model->destroy();
+  nvinfer1::IExecutionContext* context = engine->createExecutionContext();
+
+  // Execute the network.
+  float input = 1234;
+  float output;
+  Execute(*context, &input, &output);
+  EXPECT_EQ(output, input * 2 + 3);
+
+  // Destroy the engine.
+  context->destroy();
+  engine->destroy();
+  runtime->destroy();
+}
+
+}  // namespace
+}  // namespace tensorflow
+
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/contrib/timeseries/examples/lstm.py b/tensorflow/contrib/timeseries/examples/lstm.py
index c7193cef6915f9d0caf5b52fc084129cbc736994..c834430b95d37f7c313c54c086b9c323cf3e921f 100644
--- a/tensorflow/contrib/timeseries/examples/lstm.py
+++ b/tensorflow/contrib/timeseries/examples/lstm.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import functools
 from os import path
 
 import numpy
@@ -80,18 +81,19 @@ class _LSTMModel(ts_model.SequentialTimeSeriesModel):
       input_statistics: A math_utils.InputStatistics object.
     """
     super(_LSTMModel, self).initialize_graph(input_statistics=input_statistics)
-    self._lstm_cell = tf.nn.rnn_cell.LSTMCell(num_units=self._num_units)
-    # Create templates so we don't have to worry about variable reuse.
-    self._lstm_cell_run = tf.make_template(
-        name_="lstm_cell",
-        func_=self._lstm_cell,
-        create_scope_now_=True)
-    # Transforms LSTM output into mean predictions.
-    self._predict_from_lstm_output = tf.make_template(
-        name_="predict_from_lstm_output",
-        func_=
-        lambda inputs: tf.layers.dense(inputs=inputs, units=self.num_features),
-        create_scope_now_=True)
+    with tf.variable_scope("", use_resource=True):
+      # Use ResourceVariables to avoid race conditions.
+      self._lstm_cell = tf.nn.rnn_cell.LSTMCell(num_units=self._num_units)
+      # Create templates so we don't have to worry about variable reuse.
+      self._lstm_cell_run = tf.make_template(
+          name_="lstm_cell",
+          func_=self._lstm_cell,
+          create_scope_now_=True)
+      # Transforms LSTM output into mean predictions.
+      self._predict_from_lstm_output = tf.make_template(
+          name_="predict_from_lstm_output",
+          func_=functools.partial(tf.layers.dense, units=self.num_features),
+          create_scope_now_=True)
 
   def get_start_state(self):
     """Return initial state for the time series model."""
diff --git a/tensorflow/contrib/tpu/BUILD b/tensorflow/contrib/tpu/BUILD
index 0199313bc8d0214a547498b97e9a1d83ee37b708..a7d54d8a0cc4598c26d1c7bd62f5b0aa1070701b 100644
--- a/tensorflow/contrib/tpu/BUILD
+++ b/tensorflow/contrib/tpu/BUILD
@@ -43,6 +43,7 @@ py_library(
     deps = [
         ":tpu_lib",
         ":tpu_py",
+        "//tensorflow/contrib/summary:summary_ops",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:control_flow_ops",
diff --git a/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc b/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc
index 1cded9f8cf01b931d1d535a54effd54459dd8e9a..b1ef9fde37fe0647965f0818895be37d2d56d207 100644
--- a/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc
+++ b/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc
@@ -29,6 +29,7 @@ limitations under the License.
 #include "tensorflow/contrib/tpu/profiler/version.h"
 #include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
 #include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/util/command_line_flags.h"
 
@@ -47,22 +48,40 @@ string GetCurrentTimeStampAsString() {
   return s;
 }
 
-ProfileResponse Profile(const string& service_addr, int duration_ms) {
+Status ValidateHostPortPair(const string& host_port) {
+  uint32 port;
+  std::vector<string> parts = str_util::Split(host_port, ':');
+  // Must be host:port, port must be a number, host must not contain a '/',
+  // host also must not be empty.
+  if (parts.size() != 2 || !strings::safe_strtou32(parts[1], &port) ||
+      parts[0].find("/") != string::npos || parts[0].empty()) {
+    return errors::InvalidArgument("Could not interpret \"", host_port,
+                                   "\" as a host-port pair.");
+  }
+  return Status::OK();
+}
+
+ProfileResponse Profile(const string& service_addr, int duration_ms,
+                        const ProfileOptions& opts) {
   ProfileRequest request;
   request.set_duration_ms(duration_ms);
   request.set_max_events(kMaxEvents);
   request.add_tools("input_pipeline");
   request.add_tools("overview_page");
+  *request.mutable_opts() = opts;
   std::cout << "Limiting the number of trace events to " << kMaxEvents
             << std::endl;
   ::grpc::ClientContext context;
   ::grpc::ChannelArguments channel_args;
   // TODO(ioeric): use `SetMaxReceiveMessageSize` instead once it's available.
+  // TODO(qiuminxu): use `NewHostPortGrpcChannel` instead once their
+  // `ValidateHostPortPair` checks for empty host string case.
   channel_args.SetInt(GRPC_ARG_MAX_MESSAGE_LENGTH,
                       std::numeric_limits<int32>::max());
   std::unique_ptr<TPUProfiler::Stub> stub =
       TPUProfiler::NewStub(::grpc::CreateCustomChannel(
-          service_addr, ::grpc::InsecureChannelCredentials(), channel_args));
+          "dns:///" + service_addr, ::grpc::InsecureChannelCredentials(),
+          channel_args));
   ProfileResponse response;
   TF_QCHECK_OK(FromGrpcStatus(stub->Profile(&context, request, &response)));
   return response;
@@ -76,13 +95,21 @@ int main(int argc, char** argv) {
   tensorflow::string FLAGS_service_addr;
   tensorflow::string FLAGS_logdir;
   int FLAGS_duration_ms = 2000;
+  int FLAGS_num_tracing_attempts = 3;
+  bool FLAGS_include_dataset_ops = true;
   std::vector<tensorflow::Flag> flag_list = {
       tensorflow::Flag("service_addr", &FLAGS_service_addr,
                        "Address of TPU profiler service e.g. localhost:8466"),
       tensorflow::Flag("logdir", &FLAGS_logdir,
-                       "Path of TensorBoard log directory e.g. /tmp/tb_log"),
+                       "Path of TensorBoard log directory e.g. /tmp/tb_log, "
+                       "gs://tb_bucket"),
       tensorflow::Flag("duration_ms", &FLAGS_duration_ms,
                        "Duration of tracing in ms. Default is 2000ms."),
+      tensorflow::Flag("num_tracing_attempts", &FLAGS_num_tracing_attempts,
+                       "Automatically retry N times when no trace event "
+                       "is collected. Default is 3."),
+      tensorflow::Flag("include_dataset_ops", &FLAGS_include_dataset_ops,
+                       "Set to false to profile longer TPU device traces."),
   };
 
   std::cout << "Welcome to the Cloud TPU Profiler v" << TPU_PROFILER_VERSION
@@ -91,14 +118,46 @@ int main(int argc, char** argv) {
   tensorflow::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
   bool parse_ok = tensorflow::Flags::Parse(&argc, argv, flag_list);
   if (!parse_ok || FLAGS_service_addr.empty() || FLAGS_logdir.empty()) {
-    std::printf("%s", usage.c_str());
+    std::cout << usage.c_str() << std::endl;
+    return 2;
+  }
+  tensorflow::Status status =
+      tensorflow::tpu::ValidateHostPortPair(FLAGS_service_addr);
+  if (!status.ok()) {
+    std::cout << status.error_message() << std::endl;
+    std::cout << usage.c_str() << std::endl;
     return 2;
   }
   tensorflow::port::InitMain(argv[0], &argc, &argv);
 
-  int duration_ms = FLAGS_duration_ms;
-  tensorflow::ProfileResponse response =
-      tensorflow::tpu::Profile(FLAGS_service_addr, duration_ms);
+  // Sets the minimum duration_ms and tracing attempts to one.
+  int duration_ms = std::max(FLAGS_duration_ms, 1);
+  int remaining_attempts = std::max(FLAGS_num_tracing_attempts, 1);
+  tensorflow::ProfileOptions opts;
+  opts.set_include_dataset_ops(FLAGS_include_dataset_ops);
+  tensorflow::ProfileResponse response;
+
+  while (true) {
+    std::cout << "Starting to profile TPU traces for " << duration_ms << " ms. "
+              << "Remaining attempt(s): " << remaining_attempts-- << std::endl;
+    response = tensorflow::tpu::Profile(FLAGS_service_addr, duration_ms, opts);
+    if (remaining_attempts <= 0 || !response.encoded_trace().empty()) break;
+    std::cout << "No trace event is collected. Automatically retrying."
+              << std::endl
+              << std::endl;
+  }
+
+  if (response.encoded_trace().empty()) {
+    std::cout << "No trace event is collected after "
+              << FLAGS_num_tracing_attempts << " attempt(s). "
+              << "Perhaps, you want to try again (with more attempts?)."
+              << std::endl
+              << "Tip: increase number of attempts with --num_tracing_attempts."
+              << std::endl;
+    // Don't dump profile data if no trace is collected.
+    return 0;
+  }
+
   // Use the current timestamp as the run name.
   tensorflow::string run = tensorflow::tpu::GetCurrentTimeStampAsString();
   TF_CHECK_OK(tensorflow::tpu::WriteTensorboardTPUProfile(
diff --git a/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc b/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc
index b842951eb2c22792a22d9a16c022d3122391f4e8..ebd6185faad28ae7a22eb33f6b358eb2344c9c22 100644
--- a/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc
+++ b/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc
@@ -151,10 +151,7 @@ Status WriteTensorboardTPUProfile(const string& logdir, const string& run,
   TF_RETURN_IF_ERROR(Env::Default()->RecursivelyCreateDir(profile_run_dir));
 
   // Ignore computation_graph for now.
-  const bool empty_trace = response.encoded_trace().empty();
-  if (empty_trace) {
-    *os << "No trace event is collected." << std::endl;
-  } else {
+  if (!response.encoded_trace().empty()) {
     LOG(INFO) << "Converting trace events to TraceViewer JSON.";
     TF_RETURN_IF_ERROR(
         DumpTraceToLogDirectory(profile_run_dir, response.encoded_trace(), os));
@@ -165,11 +162,9 @@ Status WriteTensorboardTPUProfile(const string& logdir, const string& run,
     TF_RETURN_IF_ERROR(DumpOpProfileToLogDirectory(profile_run_dir,
                                                    response.op_profile(), os));
   }
-  if (!empty_trace && !response.tool_data().empty()) {
-    for (const auto& tool_data : response.tool_data()) {
-      TF_RETURN_IF_ERROR(
-          DumpToolDataToLogDirectory(profile_run_dir, tool_data, os));
-    }
+  for (const auto& tool_data : response.tool_data()) {
+    TF_RETURN_IF_ERROR(
+        DumpToolDataToLogDirectory(profile_run_dir, tool_data, os));
   }
 
   return Status::OK();
diff --git a/tensorflow/contrib/tpu/profiler/dump_tpu_profile.h b/tensorflow/contrib/tpu/profiler/dump_tpu_profile.h
index 25b958bcfeab7e0cfd9c180b8af4057e9bdfc73b..29ef977bacfd61e163be49558c5b94277ed479c1 100644
--- a/tensorflow/contrib/tpu/profiler/dump_tpu_profile.h
+++ b/tensorflow/contrib/tpu/profiler/dump_tpu_profile.h
@@ -27,7 +27,10 @@ namespace tpu {
 // The following tools are supported:
 //   - Trace viewer
 //   - Op profile
-//   - HLO computation graph
+//   - Input pipeline analyzer
+//   - Overview page
+// Note: this function creates a directory even when all fields in
+// ProfileResponse are unset/empty.
 Status WriteTensorboardTPUProfile(const string& logdir, const string& run,
                                   const ProfileResponse& response,
                                   std::ostream* os);
diff --git a/tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py b/tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py
index 846db1332991e8c84f51dc7e6bcc3592a955991e..78d237e6a201541b6095b101311db48b447cc477 100644
--- a/tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py
+++ b/tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py
@@ -17,6 +17,7 @@
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
+from absl import flags
 
 import os
 import subprocess
@@ -24,13 +25,21 @@ import sys
 
 import tensorflow as tf
 
-tf.flags.DEFINE_string('service_addr', '',
-                       'Address of TPU profiler service e.g. localhost:8466')
-tf.flags.DEFINE_string('logdir', '',
-                       'Path of TensorBoard log directory e.g. /tmp/tb_log')
-tf.flags.DEFINE_integer('duration_ms', 2000, 'Duration of tracing in ms.')
+flags.DEFINE_string(
+    'service_addr', None, 'Address of TPU profiler service e.g. '
+    'localhost:8466')
+flags.DEFINE_string(
+    'logdir', None, 'Path of TensorBoard log directory e.g. /tmp/tb_log, '
+    'gs://tb_bucket')
+flags.DEFINE_integer('duration_ms', 2000, 'Duration of tracing in ms.')
+flags.DEFINE_integer(
+    'num_tracing_attempts', 3, 'Automatically retry N times when no trace '
+    'event is collected.')
+flags.DEFINE_boolean(
+    'include_dataset_ops', True, 'Set to false to profile longer TPU '
+    'device traces.')
 
-FLAGS = tf.flags.FLAGS
+FLAGS = flags.FLAGS
 EXECUTABLE = 'data/capture_tpu_profile'
 
 
@@ -47,6 +56,8 @@ def main(unused_argv=None):
   cmd.append('--logdir='+logdir)
   cmd.append('--service_addr='+FLAGS.service_addr)
   cmd.append('--duration_ms='+str(FLAGS.duration_ms))
+  cmd.append('--num_tracing_attempts='+str(FLAGS.num_tracing_attempts))
+  cmd.append('--include_dataset_ops='+str(FLAGS.include_dataset_ops).lower())
   subprocess.call(cmd)
 
 
diff --git a/tensorflow/contrib/tpu/profiler/pip_package/setup.py b/tensorflow/contrib/tpu/profiler/pip_package/setup.py
index 92196638318f4a551619d04ba730ac66a58d596e..cb6198479908943a546710b94f059d27d9e41a84 100644
--- a/tensorflow/contrib/tpu/profiler/pip_package/setup.py
+++ b/tensorflow/contrib/tpu/profiler/pip_package/setup.py
@@ -20,7 +20,7 @@ from __future__ import print_function
 
 from setuptools import setup
 
-_VERSION = '1.4.3-a2'
+_VERSION = '1.6.0-rc0'
 
 CONSOLE_SCRIPTS = [
     'capture_tpu_profile=cloud_tpu_profiler.main:run_main',
diff --git a/tensorflow/contrib/tpu/profiler/tf_op_stats.proto b/tensorflow/contrib/tpu/profiler/tf_op_stats.proto
index 5440bbbfdd75207bd209c19d5cc42dc69504d39b..2094294baad63ae73712c8648b588accd4551ef8 100644
--- a/tensorflow/contrib/tpu/profiler/tf_op_stats.proto
+++ b/tensorflow/contrib/tpu/profiler/tf_op_stats.proto
@@ -61,6 +61,11 @@ message OpMetricsResult {
 message OpMetricsDbResult {
   // A bunch of OpMetricsResults.
   repeated OpMetricsResult metrics_db = 1;
+  // The total host infeed-enqueue duration in picoseconds.
+  optional uint64 total_host_infeed_enq_duration_ps = 2;
+  // The total of the difference between the start times of two
+  // consecutive infeed-enqueues (per host) in picoseconds.
+  optional uint64 total_host_infeed_enq_start_timestamp_ps_diff = 3;
 }
 
 // Result proto for StepInfo.
diff --git a/tensorflow/contrib/tpu/profiler/tpu_profiler.proto b/tensorflow/contrib/tpu/profiler/tpu_profiler.proto
index bf30d2ce091302eaf361a0018464d3b7de94ea6d..f3f3302ceb3d27dbb21bdce753aeb2d7fcd77448 100644
--- a/tensorflow/contrib/tpu/profiler/tpu_profiler.proto
+++ b/tensorflow/contrib/tpu/profiler/tpu_profiler.proto
@@ -13,6 +13,14 @@ service TPUProfiler {
   }
 }
 
+message ProfileOptions {
+  // We don't collect the dataset ops by default for better trace-viewer
+  // scalability. The caller can mannually set this field to include the ops.
+  bool include_dataset_ops = 1;
+
+  // next-field: 2
+}
+
 message ProfileRequest {
   // In future, the caller will be able to customize when profiling starts and
   // stops. For now, it collects `duration_ms` milliseconds worth of data.
@@ -25,10 +33,13 @@ message ProfileRequest {
   // required profiling tools name such as "input_pipeline_analyzer" etc
   repeated string tools = 3;
 
+  // Optional profiling options that control how a TF session will be profiled.
+  ProfileOptions opts = 4;
+
   // In future, the caller will indicate which TF session is being profiled, and
   // only data relating to that program will be returned. For now, we assume
   // all activity during the profiling period is relevant.
-  // next-field: 4
+  // next-field: 5
 }
 
 message ProfileToolData {
diff --git a/tensorflow/contrib/tpu/profiler/version.h b/tensorflow/contrib/tpu/profiler/version.h
index 0f645a549296b0f05acfb7ae564be1daf37925f8..dc6a934891138018d32d511750120453bdf290cf 100644
--- a/tensorflow/contrib/tpu/profiler/version.h
+++ b/tensorflow/contrib/tpu/profiler/version.h
@@ -16,6 +16,6 @@ limitations under the License.
 #ifndef TENSORFLOW_CONTRIB_TPU_PROFILER_VERSION_H_
 #define TENSORFLOW_CONTRIB_TPU_PROFILER_VERSION_H_
 
-#define TPU_PROFILER_VERSION "1.4.3"
+#define TPU_PROFILER_VERSION "1.5.0"
 
 #endif  // TENSORFLOW_CONTRIB_TPU_PROFILER_VERSION_H_
diff --git a/tensorflow/contrib/tpu/python/ops/tpu_ops.py b/tensorflow/contrib/tpu/python/ops/tpu_ops.py
index a49a3dcf2999053d9b0d5ffcb6411e693702d785..1c970655d0e464912d9b0a514fca0523bc604726 100644
--- a/tensorflow/contrib/tpu/python/ops/tpu_ops.py
+++ b/tensorflow/contrib/tpu/python/ops/tpu_ops.py
@@ -47,7 +47,7 @@ if platform.system() != "Windows":
   # types are supported.
 
   _SUPPORTED_INFEED_DTYPES = set([
-      dtypes.int32, dtypes.bfloat16, dtypes.float32
+      dtypes.bool, dtypes.int32, dtypes.bfloat16, dtypes.float32
   ])
 
   def infeed_dequeue(dtype, shape, name=None):
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu.py b/tensorflow/contrib/tpu/python/tpu/tpu.py
index 8fec379aad8a90d06cd05f4858d25656384a12b2..d5f54ff4fd278f0c84f79e0079bfb7a409dfba8d 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu.py
@@ -153,10 +153,11 @@ class TPUReplicateContext(control_flow_ops.XLAControlFlowContext):
       raise NotImplementedError(
           "Non-resource Variables are not supported inside TPU computations "
           "(operator name: %s)" % op.name)
-    # pylint: enable=protected-access
     if _TPU_REPLICATE_ATTR in op.node_def.attr:
       raise ValueError("TPU computations cannot be nested")
-    op.node_def.attr[_TPU_REPLICATE_ATTR].s = compat.as_bytes(self._name)
+    op._set_attr(_TPU_REPLICATE_ATTR,
+                 attr_value_pb2.AttrValue(s=compat.as_bytes(self._name)))
+    # pylint: enable=protected-access
     op.graph.prevent_feeding(op)
     op.graph.prevent_fetching(op)
 
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_config.py b/tensorflow/contrib/tpu/python/tpu/tpu_config.py
index 0c2580211ab7674d841ca1953c9327df9488bb8e..188db6e2f0d12ed441c043674df9e2e6bec7cc14 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_config.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_config.py
@@ -53,7 +53,8 @@ class TPUConfig(
     num_shards: The number of TPU shards in the system.
     per_host_input_for_training: If `True`, `input_fn` is invoked Per-Host
       rather than Per-Core. With Per-Host input pipeline deployment, `input_fn`
-      is invoked once on each host. To be precise, with a global batch size
+      is invoked once on each host. With Per-Core input pipeline deployment, it
+      is invoked once for each core. To be precise, with a global batch size
       `train_batch_size` in `TPUEstimator` constructor, the batch size for each
       shard is `train_batch_size` // #hosts. With Per-Core input pipeline
       deployment, the shard batch size is `train_batch_size` // #cores.
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
index b6d685b3fca22a14c6f97d2d3b7c5668ebf4e297..c7008533f3a776dab54fcb16ffffabb48481f43a 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ===================================================================
-
 """TPUEstimator class."""
 
 from __future__ import absolute_import
@@ -22,13 +21,16 @@ from __future__ import print_function
 import collections
 from contextlib import contextmanager
 import copy
+import signal
 import threading
 import time
+import traceback
 
 import six
 from six.moves import queue as Queue  # pylint: disable=redefined-builtin
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
+from tensorflow.contrib.summary import summary_ops as contrib_summary
 from tensorflow.contrib.tpu.python.ops import tpu_ops
 from tensorflow.contrib.tpu.python.tpu import tpu
 from tensorflow.contrib.tpu.python.tpu import tpu_config
@@ -39,6 +41,7 @@ from tensorflow.contrib.tpu.python.tpu import util as util_lib
 from tensorflow.core.framework.summary_pb2 import Summary
 from tensorflow.core.protobuf import config_pb2
 
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.estimator import estimator as estimator_lib
 from tensorflow.python.estimator import model_fn as model_fn_lib
 from tensorflow.python.estimator import util
@@ -60,7 +63,6 @@ from tensorflow.python.training import session_run_hook
 from tensorflow.python.training import training
 from tensorflow.python.training import training_util
 
-
 _INITIAL_LOSS = 1e7
 _ZERO_LOSS = 0.
 _TPU_ESTIMATOR = 'tpu_estimator'
@@ -69,7 +71,12 @@ _BATCH_SIZE_KEY = 'batch_size'
 _CROSS_REPLICA_SUM_OP = 'CrossReplicaSum'
 _RESERVED_PARAMS_KEYS = [_BATCH_SIZE_KEY]
 
-# TODO(b/65703635): Flip the value and remove all dead code.
+
+# TODO(b/65703635): Flip the value and remove all dead code. Currently, this is
+# only used for per-core based deployments. For per-host based pipelines, if a
+# user returns a Dataset instance it will be automatically wrapped in a
+# tf.while_loop (This can be disabled by returning features and labels
+# explicitly).
 _WRAP_INPUT_FN_INTO_WHILE_LOOP = False
 
 
@@ -86,8 +93,7 @@ def _create_global_step(graph):
         initializer=init_ops.zeros_initializer(),
         trainable=False,
         use_resource=True,
-        collections=[ops.GraphKeys.GLOBAL_VARIABLES,
-                     ops.GraphKeys.GLOBAL_STEP])
+        collections=[ops.GraphKeys.GLOBAL_VARIABLES, ops.GraphKeys.GLOBAL_STEP])
 
 
 def _create_or_get_iterations_per_loop():
@@ -100,8 +106,8 @@ def _create_or_get_iterations_per_loop():
     raise RuntimeError('Multiple iterations_per_loop_var in collection.')
 
   with ops.colocate_with(training_util.get_global_step()):
-    with variable_scope.variable_scope(_TPU_ESTIMATOR,
-                                       reuse=variable_scope.AUTO_REUSE):
+    with variable_scope.variable_scope(
+        _TPU_ESTIMATOR, reuse=variable_scope.AUTO_REUSE):
       return variable_scope.get_variable(
           _ITERATIONS_PER_LOOP_VAR,
           initializer=init_ops.zeros_initializer(),
@@ -215,16 +221,11 @@ class _TPUContext(object):
   def is_running_on_cpu(self):
     """Determines whether the input_fn and model_fn should be invoked on CPU."""
     mode = self._assert_mode()
-    return ((not self._use_tpu) or mode == model_fn_lib.ModeKeys.PREDICT or
-            (mode == model_fn_lib.ModeKeys.EVAL and
-             self._eval_batch_size is None))
+    return (not self._use_tpu) or mode == model_fn_lib.ModeKeys.PREDICT
 
   @property
   def global_batch_size(self):
     mode = self._assert_mode()
-    if mode == model_fn_lib.ModeKeys.EVAL and self._eval_batch_size is None:
-      raise RuntimeError('Internal error, EVAL on TPU is not enabled, but '
-                         '`global_batch_size` is called.')
     return (self._train_batch_size
             if mode == model_fn_lib.ModeKeys.TRAIN else self._eval_batch_size)
 
@@ -232,9 +233,6 @@ class _TPUContext(object):
   def batch_size_for_input_fn(self):
     """Returns the shard batch size for `input_fn`."""
     mode = self._assert_mode()
-    # Special case for eval.
-    if mode == model_fn_lib.ModeKeys.EVAL and self._eval_batch_size is None:
-      return None
     if self.is_running_on_cpu():
       if mode == model_fn_lib.ModeKeys.TRAIN:
         return self._train_batch_size
@@ -242,9 +240,9 @@ class _TPUContext(object):
         return self._eval_batch_size
       return None
 
-    global_batch_size = (self._train_batch_size if
-                         mode == model_fn_lib.ModeKeys.TRAIN
-                         else self._eval_batch_size)
+    global_batch_size = (
+        self._train_batch_size
+        if mode == model_fn_lib.ModeKeys.TRAIN else self._eval_batch_size)
     # On TPU
     if self.is_input_sharded_per_core():
       return global_batch_size // self.num_cores
@@ -255,9 +253,6 @@ class _TPUContext(object):
   def batch_size_for_model_fn(self):
     """Returns the shard batch size for `model_fn`."""
     mode = self._assert_mode()
-    # Special case for eval.
-    if mode == model_fn_lib.ModeKeys.EVAL and self._eval_batch_size is None:
-      return None
     if self.is_running_on_cpu():
       if mode == model_fn_lib.ModeKeys.TRAIN:
         return self._train_batch_size
@@ -291,8 +286,9 @@ class _TPUContext(object):
     # The tpu job is determined by the run_config. Right now, this method is
     # required as tpu_config is not part of the RunConfig.
     mode = self._assert_mode()
-    master = (run_config.evaluation_master if mode == model_fn_lib.ModeKeys.EVAL
-              else run_config.master)
+    master = (
+        run_config.evaluation_master
+        if mode == model_fn_lib.ModeKeys.EVAL else run_config.master)
     if master in _LOCAL_MASTERS:
       return None
 
@@ -319,6 +315,7 @@ class _TPUContext(object):
   def tpu_host_placement_function(self):
     """Returns the TPU host place function."""
     master = self.master_job
+
     def _placement_function(_sentinal=None, core_id=None, host_id=None):  # pylint: disable=invalid-name
       assert _sentinal is None
       if core_id is not None and host_id is not None:
@@ -333,19 +330,23 @@ class _TPUContext(object):
         if core_id is not None:
           host_id = core_id / 8
         return '/job:%s/task:%d/device:CPU:0' % (master, host_id)
+
     return _placement_function
 
   @property
   def tpu_device_placement_function(self):
     master = self.master_job
     job_device = '' if master is None else ('/job:%s' % master)
+
     def _placement_function(i):
       return '%s/task:%d/device:TPU:%d' % (job_device, i / 8, i % 8)
+
     return _placement_function
 
   @property
   def tpu_ordinal_function(self):
     """Returns the TPU ordinal fn."""
+
     def _tpu_ordinal_function(index):
       """Return the TPU ordinal associated with a shard.
 
@@ -358,6 +359,7 @@ class _TPUContext(object):
         The ordinal of the TPU device the shard's infeed should be placed on.
       """
       return index % 8
+
     return _tpu_ordinal_function
 
 
@@ -371,14 +373,17 @@ class _SIGNAL(object):
   STOP = -2
 
 
-class TPUEstimatorSpec(collections.namedtuple('TPUEstimatorSpec', [
-    'mode',
-    'predictions',
-    'loss',
-    'train_op',
-    'eval_metrics',
-    'export_outputs',
-    'scaffold_fn'])):
+class TPUEstimatorSpec(
+    collections.namedtuple('TPUEstimatorSpec', [
+        'mode',
+        'predictions',
+        'loss',
+        'train_op',
+        'eval_metrics',
+        'export_outputs',
+        'scaffold_fn',
+        'host_call'
+    ])):
   """Ops and objects returned from a `model_fn` and passed to `TPUEstimator`.
 
   See `EstimatorSpec` for `mode`, 'predictions, 'loss', 'train_op', and
@@ -403,6 +408,15 @@ class TPUEstimatorSpec(collections.namedtuple('TPUEstimatorSpec', [
 
   `scaffold_fn` is a function running on CPU to generate the `Scaffold`. This
   function should not capture any Tensors in `model_fn`.
+
+  `host_call` is a tuple of a `function` and a list or dictionary of `tensors`
+  to pass to that function and returns a list of Tensors. `host_call` currently
+  works for train() and evaluate(). The Tensors returned by the function is
+  executed on the CPU on every step, so there is communication overhead when
+  sending tensors from TPU to CPU. To reduce the overhead, try reducing the
+  size of the tensors. The `tensors` are concatenated along their major (batch)
+  dimension, and so must be >= rank 1. The `host_call` is useful for writing
+  summaries with @{tf.contrib.summary.create_file_writer}.
   """
 
   def __new__(cls,
@@ -412,115 +426,154 @@ class TPUEstimatorSpec(collections.namedtuple('TPUEstimatorSpec', [
               train_op=None,
               eval_metrics=None,
               export_outputs=None,
-              scaffold_fn=None):
+              scaffold_fn=None,
+              host_call=None):
     """Creates a validated `TPUEstimatorSpec` instance."""
+    host_calls = {}
     if eval_metrics is not None:
-      _EvalMetrics.validate(eval_metrics)
-    return super(TPUEstimatorSpec, cls).__new__(cls,
-                                                mode=mode,
-                                                predictions=predictions,
-                                                loss=loss,
-                                                train_op=train_op,
-                                                eval_metrics=eval_metrics,
-                                                export_outputs=export_outputs,
-                                                scaffold_fn=scaffold_fn)
+      host_calls['eval_metrics'] = eval_metrics
+    if host_call is not None:
+      host_calls['host_call'] = host_call
+    _OutfeedHostCall.validate(host_calls)
+    return super(TPUEstimatorSpec, cls).__new__(
+        cls,
+        mode=mode,
+        predictions=predictions,
+        loss=loss,
+        train_op=train_op,
+        eval_metrics=eval_metrics,
+        export_outputs=export_outputs,
+        scaffold_fn=scaffold_fn,
+        host_call=host_call)
 
   def as_estimator_spec(self):
     """Creates an equivalent `EstimatorSpec` used by CPU train/eval."""
-    eval_metric_ops = _EvalMetrics.to_metric_metric_ops_for_cpu(
-        self.eval_metrics)
+    host_calls = {}
+    if self.eval_metrics is not None:
+      host_calls['eval_metrics'] = self.eval_metrics
+    if self.host_call is not None:
+      host_calls['host_call'] = self.host_call
+    host_call_ret = _OutfeedHostCall.create_cpu_hostcall(host_calls)
+    eval_metric_ops = None
+    if self.eval_metrics is not None:
+      eval_metric_ops = host_call_ret['eval_metrics']
+    hooks = None
+    if self.host_call is not None:
+      hooks = [_OutfeedHostCallHook(host_call_ret['host_call'])]
     scaffold = self.scaffold_fn() if self.scaffold_fn else None
-    return model_fn_lib.EstimatorSpec(mode=self.mode,
-                                      predictions=self.predictions,
-                                      loss=self.loss,
-                                      train_op=self.train_op,
-                                      eval_metric_ops=eval_metric_ops,
-                                      export_outputs=self.export_outputs,
-                                      scaffold=scaffold)
+    return model_fn_lib.EstimatorSpec(
+        mode=self.mode,
+        predictions=self.predictions,
+        loss=self.loss,
+        train_op=self.train_op,
+        eval_metric_ops=eval_metric_ops,
+        export_outputs=self.export_outputs,
+        scaffold=scaffold,
+        training_hooks=hooks,
+        evaluation_hooks=hooks,
+        prediction_hooks=hooks)
+
+
+class _OpQueueContext(object):
+  """Manages work queue and thread for a infeed/outfeed thread."""
+
+  def __init__(self, name, target, args):
+    self._name = name
+    self._queue = Queue.Queue()
+    args = (self,) + args
+    self._thread = threading.Thread(name=name, target=target, args=args)
+    self._thread.daemon = True
+    self._thread.start()
+
+  def stop(self):
+    self._queue.put(_SIGNAL.STOP)
+
+  def send_next_batch_signal(self, iterations):
+    self._queue.put(iterations)
+
+  def read_iteration_counts(self):
+    while True:
+      iterations = self._queue.get(block=True)
+      logging.debug('%s read iterations %s', self._name, iterations)
+      if iterations == _SIGNAL.STOP:
+        logging.info('%s received shutdown signal, stopping.', self._name)
+        return
+      yield iterations
+
+  def join(self):
+    logging.info('Shutting down %s thread.' % self._name)
+    self.stop()
+    self._thread.join()
+
 
+class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook):
+  """A Session hook setting up the TPU initialization, infeed, and outfeed.
+
+  This hook does two major things:
+  1. initialize and shutdown TPU system.
+  2. launch and join the threads for infeed enqueue and (optional) outfeed
+     dequeue.
+  """
 
-class _InfeedOutfeedThreadBaseController(object):
-  """This wraps the infeed/outfeed thread and stops when Estimator finishes."""
+  def __init__(self,
+               ctx,
+               enqueue_ops,
+               dequeue_ops,
+               run_infeed_loop_on_coordinator=True):
+    self._master_job = ctx.master_job
+    self._enqueue_ops = enqueue_ops
+    self._dequeue_ops = dequeue_ops
 
-  def __init__(self, thd):
-    self._signal_queue = Queue.Queue()
-    thd.daemon = True
-    thd.start()
-    self._thd = thd
+    self._run_infeed_loop_on_coordinator = run_infeed_loop_on_coordinator
+    self._initial_infeed_sleep_secs = (
+        ctx.config.tpu_config.initial_infeed_sleep_secs)
 
-  def block_and_get_signal(self):
-    return self._signal_queue.get()
+    self._session_cancel_timer = None
 
-  def send_next_batch_signal(self, signal=_SIGNAL.NEXT_BATCH):
-    self._signal_queue.put(signal)
+    self._feed_error = None
+    self._finished = False
 
-  def join(self):
-    self._signal_queue.put(_SIGNAL.STOP)
-    self._thd.join()
+  def begin(self):
+    logging.info('TPU job name %s', self._master_job)
+    self._iterations_per_loop_var = _create_or_get_iterations_per_loop()
+    self._init_ops = [tpu.initialize_system(job=self._master_job)]
+    self._finalize_ops = [tpu.shutdown_system(job=self._master_job)]
 
+    summary_writer_init_ops = contrib_summary.summary_writer_initializer_op()
+    self._init_ops.extend(summary_writer_init_ops)
+    # Get all the writer resources from the initializer, so we know what to
+    # flush.
+    for op in summary_writer_init_ops:
+      self._finalize_ops.append(contrib_summary.flush(writer=op.inputs[0]))
 
-class _OutfeedThreadController(_InfeedOutfeedThreadBaseController):
-  """This wraps the outfeed thread and stops when Estimator finishes."""
+  def _log_error(self, session, error):
+    """Log an infeed or outfeed error.
 
-  def __init__(self, session, dequeue_ops):
-    super(_OutfeedThreadController, self).__init__(
-        threading.Thread(target=self._execute_dequeue_ops,
-                         args=(session, dequeue_ops)))
+    This logs a short error message immediately, and schedules a timer to
+    emit the full stack trace and error message after a short period of time.
+    If the main session has terminated by the time the timer triggers, we
+    assume the real source of the error was from the main session and avoid
+    emitting a stack trace for the infeed.
 
-  def _execute_dequeue_ops(self, session, dequeue_ops):
-    count = 0
-    while True:
-      signal = self.block_and_get_signal()
-      if signal == _SIGNAL.STOP:
-        logging.info('Stop outfeed thread.')
-        return
+    Args:
+      session: `tf.Session`, session to be terminated error: exception that
+        triggered logging.
+      error: the Exception to log.
+    """
+    logging.warning(
+        '\n\n'
+        'Error occurred during infeed/outfeed.  This may be due to a compile '
+        'error in the main session.  Waiting for a short time for the main '
+        'session to come back.\n\n%s', error)
 
-      iterations = signal
-      for i in range(iterations):
-        logging.debug('Outfeed dequeue for iteration (%d, %d)', count, i)
-        session.run(dequeue_ops)
-      count += 1
+    self._feed_error = traceback.format_exc()
 
-  def join(self):
-    logging.info('Waiting for Outfeed Thread to exit.')
-    super(_OutfeedThreadController, self).join()
-
-
-class _InfeedThreadController(_InfeedOutfeedThreadBaseController):
-  """This wraps the infeed thread and stops when Estimator finishes."""
-
-  def __init__(self, session, enqueue_ops, initial_infeed_sleep_secs):
-    super(_InfeedThreadController, self).__init__(
-        threading.Thread(
-            target=self._input_thread_fn_for_loading,
-            args=(session, enqueue_ops, initial_infeed_sleep_secs)))
-
-  def _input_thread_fn_for_loading(self, session, enqueue_ops,
-                                   initial_infeed_sleep_secs):
-    count = 0
-    if initial_infeed_sleep_secs:
-      logging.info('Infeed thread sleeping for %d seconds.',
-                   initial_infeed_sleep_secs)
-      time.sleep(initial_infeed_sleep_secs)
-      logging.info('Infeed thread starting after sleep')
-    try:
-      while True:
-        signal = self._signal_queue.get()
-        if signal == _SIGNAL.STOP:
-          logging.info('Stop Infeed input thread.')
-          return
-
-        if _WRAP_INPUT_FN_INTO_WHILE_LOOP:
-          # Enqueue batches for next loop.
-          session.run(enqueue_ops)
-        else:
-          iterations = signal
-          for i in range(iterations):
-            logging.debug('Infeed enqueue for iteration (%d, %d)', count, i)
-            session.run(enqueue_ops)
-          count += 1
+    # If we've already encountered a feed error, don't schedule another
+    # cancellation op.
+    if self._session_cancel_timer:
+      return
 
-    except Exception:  # pylint: disable=broad-except
+    def _cancel_session():
       # Close the session to avoid the main thread from hanging. If input
       # pipeline triggers any error, the infeed thread dies but the main thread
       # for TPU computation waits for the infeed enqueue forever. Close the
@@ -535,80 +588,93 @@ class _InfeedThreadController(_InfeedOutfeedThreadBaseController):
       # exception in the main thread, instead of the expected compile error.
       # User code that depends on having the proper exception type will
       # therefore be confused.
-      logging.error(
-          'Failed running infeed, closing session.\n'
-          'You may see an exception from your main session after this. '
-          'Sleep for 2 minutes before close Session from infeed thread to '
-          'allow the main thread returning an error first, if any.',
-          exc_info=1
-      )
-      time.sleep(120)
-      logging.error('Closing the failed session.')
-      session.close()
-
-  def join(self):
-    logging.info('Waiting for Infeed Thread to exit.')
-    super(_InfeedThreadController, self).join()
-
-
-class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook):
-  """A Session hook setting up the TPU initialization, infeed, and outfeed.
-
-  This hook does two major things:
-  1. initialize and shutdown TPU system.
-  2. launch and join the threads for infeed enqueue and (optional) outfeed
-     dequeue.
-  """
+      time.sleep(5)
+
+      # If the main session is still running, the infeed/outfeed errors are
+      # legitimate, and should be logged.
+      if not self._finished:
+        logging.error('Feed error: %s', self._feed_error)
+        logging.error('Closing session.  A RuntimeError should follow.')
+        session.close()
+
+    self._session_cancel_timer = threading.Thread(target=_cancel_session)
+    self._session_cancel_timer.daemon = True
+    self._session_cancel_timer.start()
+
+  def _run_infeed(self, queue_ctx, session):
+    logging.info('Starting infeed thread controller.')
+    if self._initial_infeed_sleep_secs:
+      logging.info('%s thread sleeping for %d seconds.', self._name,
+                   self._initial_infeed_sleep_secs)
+      time.sleep(self._initial_infeed_sleep_secs)
+      logging.info('%s thread starting after sleep', self._name)
 
-  def __init__(self, ctx, enqueue_ops, dequeue_ops=None):
-    self._master_job = ctx.master_job
-    self._enqueue_ops = enqueue_ops
-    self._dequeue_ops = dequeue_ops
-    self._initial_infeed_sleep_secs = (
-        ctx.config.tpu_config.initial_infeed_sleep_secs)
-
-  def begin(self):
-    logging.info('TPU job name %s', self._master_job)
-    self._iterations_per_loop_var = _create_or_get_iterations_per_loop()
-    self._init_op = [tpu.initialize_system(job=self._master_job)]
-    self._finalize_op = [tpu.shutdown_system(job=self._master_job)]
+    try:
+      if self._run_infeed_loop_on_coordinator:
+        for count, steps in enumerate(queue_ctx.read_iteration_counts()):
+          for i in xrange(steps):
+            logging.debug('Infeed enqueue for iteration (%d, %d)', count, i)
+            session.run(self._enqueue_ops)
+      else:
+        for _ in queue_ctx.read_iteration_counts():
+          session.run(self._enqueue_ops)
+      logging.info('Infeed thread finished, shutting down.')
+    except Exception as e:  # pylint: disable=broad-except
+      self._log_error(session, e)
+
+  def _run_outfeed(self, queue_ctx, session):
+    logging.info('Starting outfeed thread controller.')
+    try:
+      for count, steps in enumerate(queue_ctx.read_iteration_counts()):
+        for i in xrange(steps):
+          logging.debug('Outfeed dequeue for iteration (%d, %d)', count, i)
+          session.run(self._dequeue_ops)
+      logging.info('Outfeed thread finished, shutting down.')
+    except Exception as e:  # pylint: disable=broad-except
+      self._log_error(session, e)
 
   def after_create_session(self, session, coord):
     logging.info('Init TPU system')
-    session.run(self._init_op,
-                options=config_pb2.RunOptions(timeout_in_ms=5*60*1000))
+    session.run(self._init_ops,
+                options=config_pb2.RunOptions(timeout_in_ms=5 * 60 * 1000))
 
     logging.info('Start infeed thread controller')
-    self._infeed_thd_controller = _InfeedThreadController(
-        session, self._enqueue_ops, self._initial_infeed_sleep_secs)
+    self._infeed_controller = _OpQueueContext(
+        name='InfeedController', target=self._run_infeed, args=(session,))
 
-    if self._dequeue_ops is not None:
-      logging.info('Start outfeed thread controller')
-      self._outfeed_thd_controller = _OutfeedThreadController(
-          session, self._dequeue_ops)
+    logging.info('Start outfeed thread controller')
+    self._outfeed_controller = _OpQueueContext(
+        name='OutfeedController', target=self._run_outfeed, args=(session,))
 
   def before_run(self, run_context):
+    if self._feed_error:
+      logging.warning('Feed error occurred, terminating session.')
+      run_context.request_stop()
+      return
+
     iterations = run_context.session.run(self._iterations_per_loop_var)
 
     logging.info('Enqueue next (%d) batch(es) of data to infeed.', iterations)
+    self._infeed_controller.send_next_batch_signal(iterations)
 
-    self._infeed_thd_controller.send_next_batch_signal(iterations)
-    if self._dequeue_ops is not None:
-      # TODO(xiejw): Refactor the outfeed dequeue into tf.while_loop.
-      logging.info(
-          'Dequeue next (%d) batch(es) of data from outfeed.', iterations)
-      self._outfeed_thd_controller.send_next_batch_signal(iterations)
+    logging.info('Dequeue next (%d) batch(es) of data from outfeed.',
+                 iterations)
+    self._outfeed_controller.send_next_batch_signal(iterations)
 
   def end(self, session):
+    if self._session_cancel_timer:
+      logging.warning('Feed error occurred; waiting for message.')
+      self._session_cancel_timer.join()
+
+    self._finished = True
     logging.info('Stop infeed thread controller')
-    self._infeed_thd_controller.join()
+    self._infeed_controller.join()
 
-    if self._dequeue_ops is not None:
-      logging.info('Stop output thread controller')
-      self._outfeed_thd_controller.join()
+    logging.info('Stop output thread controller')
+    self._outfeed_controller.join()
 
     logging.info('Shutdown TPU system.')
-    session.run(self._finalize_op)
+    session.run(self._finalize_ops)
 
 
 class _TPUStopAtStepHook(session_run_hook.SessionRunHook):
@@ -676,8 +742,8 @@ class _TPUStopAtStepHook(session_run_hook.SessionRunHook):
       run_context.request_stop()
     else:
       iterations = self._next_iterations(global_step, self._last_step)
-      self._iterations_per_loop_var.load(iterations,
-                                         session=run_context.session)
+      self._iterations_per_loop_var.load(
+          iterations, session=run_context.session)
 
 
 class _SetEvalIterationsHook(session_run_hook.SessionRunHook):
@@ -698,8 +764,8 @@ class _SetEvalIterationsHook(session_run_hook.SessionRunHook):
     self._iterations_per_loop_var.load(self._num_steps, session=session)
 
 
-def generate_per_core_enqueue_ops_fn_for_host(
-    ctx, input_fn, inputs_structure_recorder):
+def generate_per_core_enqueue_ops_fn_for_host(ctx, input_fn,
+                                              inputs_structure_recorder):
   """Generates infeed enqueue ops for per-core input_fn on a single host."""
   captured_infeed_queue = _CapturedObject()
 
@@ -709,11 +775,14 @@ def generate_per_core_enqueue_ops_fn_for_host(
     per_host_sharded_inputs = []
     for core_ordinal in range(num_cores_per_host):
       with ops.name_scope('ordinal_%d' % (core_ordinal)):
-        inputs = input_fn()
-        if isinstance(inputs, tuple):
-          features, labels = inputs
-        else:
-          features, labels = inputs, None
+        inputs = _Inputs.from_input_fn(input_fn())
+        if inputs.is_dataset:
+          raise TypeError(
+              '`input_fn` returning `Dataset`  is not yet supported in '
+              'per-Core input pipeline deployment yet. Please set '
+              'TPUConfig.per_host_input_for_training to True or return '
+              '`features` and `labels` from `input_fn`')
+        features, labels = inputs.features_and_labels()
 
         inputs_structure_recorder.validate_and_record_structure(
             features, labels)
@@ -729,9 +798,9 @@ def generate_per_core_enqueue_ops_fn_for_host(
         per_host_sharded_inputs)
 
     per_host_enqueue_ops = infeed_queue.generate_enqueue_ops(
-        per_host_sharded_inputs,
-        tpu_ordinal_function=ctx.tpu_ordinal_function)
+        per_host_sharded_inputs, tpu_ordinal_function=ctx.tpu_ordinal_function)
     return per_host_enqueue_ops
+
   return enqueue_ops_fn, captured_infeed_queue
 
 
@@ -740,16 +809,24 @@ def generate_per_host_enqueue_ops_fn_for_host(
   """Generates infeed enqueue ops for per-host input_fn on a single host."""
   captured_infeed_queue = _CapturedObject()
 
+  hooks = []
+
+  with ops.device(device):
+    inputs = _Inputs.from_input_fn(input_fn())
+
+    is_dataset = inputs.is_dataset
+    if is_dataset:
+      hooks.append(inputs.dataset_initializer_hook())
+
   def enqueue_ops_fn():
     with ops.device(device):
       num_cores_per_host = ctx.num_of_cores_per_host
-      inputs = input_fn()
-      if isinstance(inputs, tuple):
-        features, labels = inputs
-      else:
-        features, labels = inputs, None
-      inputs_structure_recorder.validate_and_record_structure(
-          features, labels)
+      # Convert user input to features and labels.  If the user returns a
+      # dataset, it is initialized and the features and labels extracted via
+      # `dataset.iterator.get_next()`
+      features, labels = inputs.features_and_labels()
+
+      inputs_structure_recorder.validate_and_record_structure(features, labels)
       unsharded_tensor_list = (
           inputs_structure_recorder.flatten_features_and_labels(
               features, labels))
@@ -763,10 +840,10 @@ def generate_per_host_enqueue_ops_fn_for_host(
 
       per_host_enqueue_ops = (
           infeed_queue.split_inputs_and_generate_enqueue_ops(
-              unsharded_tensor_list,
-              placement_function=lambda x: device))
+              unsharded_tensor_list, placement_function=lambda x: device))
       return per_host_enqueue_ops
-  return enqueue_ops_fn, captured_infeed_queue
+
+  return enqueue_ops_fn, captured_infeed_queue, hooks, is_dataset
 
 
 class _InputPipeline(object):
@@ -815,6 +892,7 @@ class _InputPipeline(object):
 
     def validate_and_record_structure(self, features, labels):
       """Validates and records the structure of features` and `labels`."""
+
       def _extract_key_names(tensor_or_dict):
         if tensor_or_dict is None:
           return []
@@ -842,8 +920,8 @@ class _InputPipeline(object):
       flattened_inputs = []
       if self._feature_names:
         # We need a fixed ordering for enqueueing and dequeueing.
-        flattened_inputs.extend([features[name]
-                                 for name in self._feature_names])
+        flattened_inputs.extend(
+            [features[name] for name in self._feature_names])
       else:
         flattened_inputs.append(features)
 
@@ -870,11 +948,11 @@ class _InputPipeline(object):
         ValueError: If the number of expected tensors from `flattened_inputs`
           mismatches the recorded structure.
       """
-      expected_num_features = (len(self._feature_names) if self._feature_names
-                               else 1)
+      expected_num_features = (
+          len(self._feature_names) if self._feature_names else 1)
       if self._has_labels:
-        expected_num_labels = (len(self._label_names) if self._label_names
-                               else 1)
+        expected_num_labels = (
+            len(self._label_names) if self._label_names else 1)
       else:
         expected_num_labels = 0
 
@@ -895,13 +973,13 @@ class _InputPipeline(object):
       if expected_num_labels == 0:
         unflattened_label = None
       elif self._label_names:
-        unflattened_label = dict(zip(self._label_names,
-                                     flattened_inputs[expected_num_features:]))
+        unflattened_label = dict(
+            zip(self._label_names, flattened_inputs[expected_num_features:]))
       else:
         # Single tensor case.
         unflattened_label = flattened_inputs[expected_num_features]
 
-      return unflattened_features, unflattened_label
+      return _Inputs(unflattened_features, unflattened_label)
 
   def __init__(self, input_fn, batch_axis, ctx):
     """Constructor.
@@ -929,7 +1007,8 @@ class _InputPipeline(object):
     # While tf.while_loop is called, the body function, which invokes
     # `enqueue_fn` passed in, is called to construct the graph. So, input_fn
     # structure is recorded.
-    enqueue_ops = self._invoke_input_fn_and_record_structure()
+    enqueue_ops, all_hooks, run_infeed_loop_on_coordinator = (
+        self._invoke_input_fn_and_record_structure())
 
     self._validate_input_pipeline()
 
@@ -940,14 +1019,18 @@ class _InputPipeline(object):
       return self._inputs_structure_recorder.unflatten_features_and_labels(
           values)
 
-    return (enqueue_ops, dequeue_fn)
+    return (enqueue_ops, dequeue_fn, all_hooks, run_infeed_loop_on_coordinator)
 
   def _invoke_input_fn_and_record_structure(self):
     """Deploys the input pipeline and record input structure."""
     enqueue_ops = []
     infeed_queues = []
+    all_hooks = []
     num_hosts = self._ctx.num_hosts
     tpu_host_placement_fn = self._ctx.tpu_host_placement_function
+
+    run_infeed_loop_on_coordinator = True
+
     if self._sharded_per_core:
       # Per-Core input pipeline deployment.
       # Invoke input pipeline for each core and placed on the corresponding
@@ -961,8 +1044,10 @@ class _InputPipeline(object):
                     self._ctx, self._input_fn, self._inputs_structure_recorder))
 
             if _WRAP_INPUT_FN_INTO_WHILE_LOOP:
-              enqueue_ops.append(_wrap_computation_in_while_loop(
-                  device=host_device, op_fn=enqueue_ops_fn))
+              run_infeed_loop_on_coordinator = False
+              enqueue_ops.append(
+                  _wrap_computation_in_while_loop(
+                      device=host_device, op_fn=enqueue_ops_fn))
             else:
               enqueue_ops.append(enqueue_ops_fn())
             # Infeed_queue_getter must be called after enqueue_ops_fn is called.
@@ -973,14 +1058,29 @@ class _InputPipeline(object):
         host_device = tpu_host_placement_fn(host_id=host_id)
         with ops.device(host_device):
           with ops.name_scope('input_pipeline_task%d' % (host_id)):
-            enqueue_ops_fn, captured_infeed_queue = (
+            enqueue_ops_fn, captured_infeed_queue, hooks, is_dataset = (
                 generate_per_host_enqueue_ops_fn_for_host(
                     self._ctx, self._input_fn, self._inputs_structure_recorder,
                     self._batch_axis, host_device))
-
-            if _WRAP_INPUT_FN_INTO_WHILE_LOOP:
-              enqueue_ops.append(_wrap_computation_in_while_loop(
-                  device=host_device, op_fn=enqueue_ops_fn))
+            all_hooks.extend(hooks)
+
+            # NOTE(xiejw): We dispatch here based on the return type of the
+            # users `input_fn`.
+            #
+            # 1. If input_fn returns a Dataset instance, we initialize the
+            # iterator outside of tf.while_loop, and call the iterator.get_next
+            # inside tf.while_loop.  This should be always safe.
+            #
+            # 2. If input_fn returns (features, labels), it is too late to wrap
+            # them inside tf.while_loop, as resource initialization cannot be
+            # handled in TF control flow properly. In this case, we will use
+            # python loop to enqueue the data into TPU system.  This may be
+            # slow compared to the previous case.
+            if is_dataset:
+              run_infeed_loop_on_coordinator = False
+              enqueue_ops.append(
+                  _wrap_computation_in_while_loop(
+                      device=host_device, op_fn=enqueue_ops_fn))
             else:
               enqueue_ops.append(enqueue_ops_fn())
             infeed_queues.append(captured_infeed_queue.get())
@@ -988,7 +1088,7 @@ class _InputPipeline(object):
     # dequeue is dtypes and types. So, any one can be used. Here, grab the
     # first one.
     self._infeed_queue = infeed_queues[0]
-    return enqueue_ops
+    return enqueue_ops, all_hooks, run_infeed_loop_on_coordinator
 
   def _validate_input_pipeline(self):
     # Perform some sanity checks to log user friendly information. We should
@@ -1048,25 +1148,32 @@ class _ModelFnWrapper(object):
       A Fn representing the train step for TPU.
     """
 
+    host_call = _OutfeedHostCall(self._ctx)
     captured_scaffold_fn = _CapturedObject()
 
     def train_step(loss):
       """Training step function for use inside a while loop."""
       del loss  # unused; required in function signature.
-      features, labels = dequeue_fn()
+      inputs = dequeue_fn()
+      features, labels = inputs.features_and_labels()
 
       estimator_spec = self._verify_estimator_spec(
           self._call_model_fn(features, labels))
       loss, train_op = estimator_spec.loss, estimator_spec.train_op
 
+      host_call_outfeed_ops = []
       if isinstance(estimator_spec, TPUEstimatorSpec):
         captured_scaffold_fn.capture(estimator_spec.scaffold_fn)
+        if estimator_spec.host_call is not None:
+          host_call.record({'host_call': estimator_spec.host_call})
+          host_call_outfeed_ops = host_call.create_enqueue_op()
       else:
         captured_scaffold_fn.capture(None)
 
-      with ops.control_dependencies([train_op]):
+      with ops.control_dependencies([train_op] + host_call_outfeed_ops):
         return array_ops.identity(loss)
-    return train_step, captured_scaffold_fn
+
+    return train_step, host_call, captured_scaffold_fn
 
   def convert_to_single_tpu_eval_step(self, dequeue_fn):
     """Converts user provided model_fn` as a single eval step on TPU.
@@ -1092,14 +1199,15 @@ class _ModelFnWrapper(object):
 
     Returns:
       A tuple of eval_fn and eval_metrics. The eval_fn representing the eval
-      step for TPU. and eval_metrics is an `_EvalMetrics` instance.
+      step for TPU. and eval_metrics is an `_OutfeedHostCall` instance.
     """
-    eval_metrics = _EvalMetrics(self._ctx)
+    host_calls = _OutfeedHostCall(self._ctx)
     captured_scaffold_fn = _CapturedObject()
 
     def eval_step(total_loss):
       """Evaluation step function for use inside a while loop."""
-      features, labels = dequeue_fn()
+      inputs = dequeue_fn()
+      features, labels = inputs.features_and_labels()
 
       tpu_estimator_spec = self._call_model_fn(features, labels)
       if not isinstance(tpu_estimator_spec, TPUEstimatorSpec):
@@ -1109,12 +1217,16 @@ class _ModelFnWrapper(object):
 
       loss = tpu_estimator_spec.loss
       captured_scaffold_fn.capture(tpu_estimator_spec.scaffold_fn)
-      eval_metrics.record(tpu_estimator_spec)
-      outfeed_ops = tpu_ops.outfeed_enqueue_tuple(eval_metrics.outfeed_tensors)
+      to_record = {}
+      to_record['eval_metrics'] = tpu_estimator_spec.eval_metrics
+      if tpu_estimator_spec.host_call is not None:
+        to_record['host_call'] = tpu_estimator_spec.host_call
+      host_calls.record(to_record)
 
-      with ops.control_dependencies([outfeed_ops]):
+      with ops.control_dependencies(host_calls.create_enqueue_op()):
         return math_ops.add(total_loss, loss)
-    return eval_step, eval_metrics, captured_scaffold_fn
+
+    return eval_step, host_calls, captured_scaffold_fn
 
   def _call_model_fn(self, features, labels):
     """Calls the model_fn with required parameters."""
@@ -1138,10 +1250,9 @@ class _ModelFnWrapper(object):
       kwargs['params'] = params
 
     if 'params' not in model_fn_args:
-      raise ValueError(
-          'model_fn ({}) does not include params argument, '
-          'required by TPUEstimator to pass batch size as '
-          'params[\'batch_size\']'.format(self._model_fn))
+      raise ValueError('model_fn ({}) does not include params argument, '
+                       'required by TPUEstimator to pass batch size as '
+                       'params[\'batch_size\']'.format(self._model_fn))
 
     batch_size_for_model_fn = self._ctx.batch_size_for_model_fn
     if batch_size_for_model_fn is not None:
@@ -1175,158 +1286,206 @@ class _ModelFnWrapper(object):
     return estimator_spec
 
 
-class _EvalMetrics(object):
-  """Class wraps TPUEstimator.eval_metrics."""
+class _OutfeedHostCall(object):
+  """Support for `eval_metrics` and `host_call` in TPUEstimatorSpec."""
 
   def __init__(self, ctx):
     self._ctx = ctx
-    self._metric_fn = None
-    self._is_dict = False
-    self._tensor_keys = []
-    self._tensors = []
-    self._tensor_dtypes = []
-    self._tensor_shapes = []
-    self._recorded = False
+    self._names = []
+    # All of these are dictionaries of lists keyed on the name.
+    self._host_fns = {}
+    self._tensor_keys = collections.defaultdict(list)
+    self._tensors = collections.defaultdict(list)
+    self._tensor_dtypes = collections.defaultdict(list)
+    self._tensor_shapes = collections.defaultdict(list)
 
   @staticmethod
-  def validate(eval_metrics):
-    """Validates the `eval_metrics` in `TPUEstimatorSpec`."""
-
-    if not isinstance(eval_metrics, (tuple, list)):
-      raise ValueError('eval_metrics should be tuple or list')
-    if len(eval_metrics) != 2:
-      raise ValueError('eval_metrics should have two elements.')
-    if not callable(eval_metrics[0]):
-      raise TypeError('eval_metrics[0] should be callable.')
-    if not isinstance(eval_metrics[1], (tuple, list, dict)):
-      raise ValueError('eval_metrics[1] should be tuple or list, or dict.')
-
-    if isinstance(eval_metrics[1], (tuple, list)):
-      fn_args = util.fn_args(eval_metrics[0])
-      if len(eval_metrics[1]) != len(fn_args):
-        raise RuntimeError(
-            'In TPUEstimatorSpec.eval_metrics, length of tensors does not '
-            'match method args of metric_fn.')
+  def validate(host_calls):
+    """Validates the `eval_metrics` and `host_call` in `TPUEstimatorSpec`."""
+
+    for name, host_call in host_calls.items():
+      if not isinstance(host_call, (tuple, list)):
+        raise ValueError('{} should be tuple or list'.format(name))
+      if len(host_call) != 2:
+        raise ValueError('{} should have two elements.'.format(name))
+      if not callable(host_call[0]):
+        raise TypeError('{}[0] should be callable.'.format(name))
+      if not isinstance(host_call[1], (tuple, list, dict)):
+        raise ValueError('{}[1] should be tuple or list, or dict.'.format(name))
+
+      if isinstance(host_call[1], (tuple, list)):
+        fn_args = util.fn_args(host_call[0])
+        if len(host_call[1]) != len(fn_args):
+          raise RuntimeError(
+              'In TPUEstimatorSpec.{}, length of tensors does not '
+              'match method args of metric_fn.'.format(name))
 
   @staticmethod
-  def to_metric_metric_ops_for_cpu(eval_metrics):
-    """Converts `TPUEstimatorSpec.eval_metrics` to `eval_metric_ops` for CPU."""
-    if not eval_metrics:
-      return None
-
-    _EvalMetrics.validate(eval_metrics)
-
-    metric_fn, tensors = eval_metrics
+  def create_cpu_hostcall(host_calls):
+    """Runs on the host_call on CPU instead of TPU when use_tpu=False."""
+
+    _OutfeedHostCall.validate(host_calls)
+    ret = {}
+    for name, host_call in host_calls.items():
+      host_fn, tensors = host_call
+      if isinstance(tensors, (tuple, list)):
+        ret[name] = host_fn(*tensors)
+      else:
+        # Must be dict.
+        try:
+          ret[name] = host_fn(**tensors)
+        except TypeError as e:
+          logging.warning(
+              'Exception while calling %s: %s. It is likely the tensors '
+              '(%s[1]) do not match the '
+              'function\'s arguments', name, e, name)
+          raise e
+    return ret
+
+  def record(self, host_calls):
+    """Records the host_call structure."""
+
+    for name, host_call in host_calls.items():
+      host_fn, tensor_list_or_dict = host_call
+      self._names.append(name)
+      self._host_fns[name] = host_fn
+
+      if isinstance(tensor_list_or_dict, dict):
+        for (key, tensor) in six.iteritems(tensor_list_or_dict):
+          self._tensor_keys[name].append(key)
+          self._tensors[name].append(tensor)
+          self._tensor_dtypes[name].append(tensor.dtype)
+          self._tensor_shapes[name].append(tensor.shape)
+      else:
+        # List or tuple.
+        self._tensor_keys[name] = None
+        for tensor in tensor_list_or_dict:
+          self._tensors[name].append(tensor)
+          self._tensor_dtypes[name].append(tensor.dtype)
+          self._tensor_shapes[name].append(tensor.shape)
 
-    if isinstance(tensors, (tuple, list)):
-      return metric_fn(*tensors)
-    else:
-      # Must be dict.
-      try:
-        return metric_fn(**tensors)
-      except TypeError as e:
-        logging.warning(
-            'Exception while calling metric_fn for evalution: %s. '
-            'It is likely the tensors (eval_metrics[1]) do not match the '
-            'metric_fn arguments', e)
-        raise e
-
-  def record(self, spec):
-    """Records the eval_metrics structure in `spec`."""
-    if self._recorded:
-      raise RuntimeError('Eval metrics have been recorded already.')
-
-    self._metric_fn, tensor_list_or_dict = spec.eval_metrics
-
-    if isinstance(tensor_list_or_dict, dict):
-      self._is_dict = True
-      for (key, tensor) in six.iteritems(tensor_list_or_dict):
-        self._tensor_keys.append(key)
-        self._tensors.append(tensor)
-        self._tensor_dtypes.append(tensor.dtype)
-        self._tensor_shapes.append(tensor.shape)
-    else:
-      # List or tuple.
-      self._is_dict = False
-      self._tensors = tensor_list_or_dict
-      for tensor in tensor_list_or_dict:
-        self._tensor_dtypes.append(tensor.dtype)
-        self._tensor_shapes.append(tensor.shape)
-    self._recorded = True
+  def create_enqueue_op(self):
+    """Create the op to enqueue the recorded host_calls.
 
-  @property
-  def outfeed_tensors(self):
-    if not self._recorded:
-      raise RuntimeError('Eval metrics have not been recorded yet')
-    return self._tensors
+    Returns:
+      A list of enqueue ops, which is empty if there are no host calls.
+    """
+    if not self._names:
+      return []
 
-  def to_metric_metric_ops_for_tpu(self, dummy_update_op):
-    """Creates the eval_metric_ops now based on the TPU outfeed.
+    tensors = []
+    # TODO(jhseu): Consider deduping tensors.
+    for name in self._names:
+      tensors.extend(self._tensors[name])
+    return [tpu_ops.outfeed_enqueue_tuple(tensors)]
 
-    `eval_metric_ops` is defined in `EstimatorSpec`. From all shards, tensors
-    are dequeued from outfeed and then concatenated (along batch size dimension)
-    to form  global-like tensors. All global-like tensors are passed to the
-    metric fn.
+  def create_tpu_hostcall(self):
+    """Sends the tensors through outfeed and runs the host_fn on CPU.
 
-    Args:
-      dummy_update_op: A dummy update op.
+    The tensors are concatenated along dimension 0 to form a global tensor
+    across all shards. The concatenated function is passed to the host_fn and
+    executed on the first host.
 
     Returns:
-      A tuple of (`eval_metric_ops` and `update_ops`), where `update_ops` should
-      be invoked in Outfeed dequeue thread, which drive the outfeed dequeue and
-      update the state of metrics.
+      A dictionary mapping name to the return type of the host_call by that
+      name.
 
     Raises:
       RuntimeError: If outfeed tensor is scalar.
     """
+    if not self._names:
+      return []
 
-    num_cores = self._ctx.num_cores
-
+    ret = {}
     # For each i, dequeue_ops[i] is a list containing the tensors from all
     # shards. This list is concatenated later.
     dequeue_ops = []
-    for i in xrange(len(self._tensors)):
-      dequeue_ops.append([])
-
-    # Outfeed ops execute on each JF node.
+    tensor_dtypes = []
+    tensor_shapes = []
+    for name in self._names:
+      for _ in self._tensors[name]:
+        dequeue_ops.append([])
+      for dtype in self._tensor_dtypes[name]:
+        tensor_dtypes.append(dtype)
+      for shape in self._tensor_shapes[name]:
+        tensor_shapes.append(shape)
+
+    # Outfeed ops execute on each JF node. Note: we must constraint it such that
+    # we have at most one outfeed dequeue and enqueue.
     tpu_device_placement_fn = self._ctx.tpu_device_placement_function
-    for i in xrange(num_cores):
+    for i in xrange(self._ctx.num_cores):
       with ops.device(tpu_device_placement_fn(i)):
         outfeed_tensors = tpu_ops.outfeed_dequeue_tuple(
-            dtypes=self._tensor_dtypes, shapes=self._tensor_shapes)
+            dtypes=tensor_dtypes, shapes=tensor_shapes)
         for j, item in enumerate(outfeed_tensors):
           dequeue_ops[j].append(item)
 
-    # It is assumed evaluation always happends on single host TPU system. So,
+    # Deconstruct dequeue ops.
+    dequeue_ops_by_name = {}
+    pos = 0
+    for name in self._names:
+      dequeue_ops_by_name[name] = dequeue_ops[pos:pos+len(self._tensors[name])]
+      pos += len(self._tensors[name])
+
+    # It is assumed evaluation always happens on single host TPU system. So,
     # place all ops on tpu host if possible.
+    #
+    # TODO(jhseu): Evaluate whether this is right for summaries.
     with ops.device(self._ctx.tpu_host_placement_function(core_id=0)):
-      for i, item in enumerate(dequeue_ops):
-        if dequeue_ops[i][0].shape.ndims == 0:
-          raise RuntimeError(
-              'All tensors outfed from TPU should preseve batch size '
-              'dimension, but got scalar {}'.format(dequeue_ops[i][0]))
-        # TODO(xiejw): Allow users to specify the axis for batch size dimension.
-        dequeue_ops[i] = array_ops.concat(dequeue_ops[i], axis=0)
+      for name in self._names:
+        dequeue_ops = dequeue_ops_by_name[name]
+        for i, item in enumerate(dequeue_ops):
+          if dequeue_ops[i][0].shape.ndims == 0:
+            raise RuntimeError(
+                'All tensors outfed from TPU should preserve batch size '
+                'dimension, but got scalar {}'.format(dequeue_ops[i][0]))
+          # TODO(xiejw): Allow users to specify the axis for batch size
+          # dimension.
+          dequeue_ops[i] = array_ops.concat(dequeue_ops[i], axis=0)
+
+        if self._tensor_keys[name] is not None:
+          # The user-provided eval_metrics[1] is a dict.
+          dequeue_ops = dict(zip(self._tensor_keys[name], dequeue_ops))
+          try:
+            ret[name] = self._host_fns[name](**dequeue_ops)
+          except TypeError as e:
+            logging.warning(
+                'Exception while calling %s: %s. It is likely the tensors '
+                '(%s[1]) do not match the '
+                'function\'s arguments', name, e, name)
+            raise e
+        else:
+          ret[name] = self._host_fns[name](*dequeue_ops)
 
-      if self._is_dict:
-        dequeue_ops = dict(zip(self._tensor_keys, dequeue_ops))
-        try:
-          eval_metric_ops = self._metric_fn(**dequeue_ops)
-        except TypeError as e:
-          logging.warning(
-              'Exception while calling metric_fn for evalution: %s. '
-              'It is likely the tensors (eval_metrics[1]) do not match the '
-              'metric_fn arguments', e)
-          raise e
-      else:
-        eval_metric_ops = self._metric_fn(*dequeue_ops)
+    return ret
+
+
+class _OutfeedHostCallHook(session_run_hook.SessionRunHook):
+  """Hook to run host calls when use_tpu=False."""
+
+  def __init__(self, tensors):
+    self._tensors = tensors
 
-    eval_update_ops = []
-    for k, v in eval_metric_ops.items():
-      eval_metric_ops[k] = (v[0], dummy_update_op)
-      eval_update_ops.append(v[1])
+  def begin(self):
+    # We duplicate this code from the TPUInfeedOutfeedSessionHook rather than
+    # create a separate hook to guarantee execution order, because summaries
+    # need to be initialized before the outfeed thread starts.
+    # TODO(jhseu): Make a wrapper hook instead?
+    self._init_ops = contrib_summary.summary_writer_initializer_op()
+    # Get all the writer resources from the initializer, so we know what to
+    # flush.
+    self._finalize_ops = []
+    for op in self._init_ops:
+      self._finalize_ops.append(contrib_summary.flush(writer=op.inputs[0]))
+
+  def after_create_session(self, session, coord):
+    session.run(self._init_ops)
 
-    return eval_metric_ops, eval_update_ops
+  def before_run(self, run_context):
+    return basic_session_run_hooks.SessionRunArgs(self._tensors)
+
+  def end(self, session):
+    session.run(self._finalize_ops)
 
 
 class ExamplesPerSecondHook(basic_session_run_hooks.StepCounterHook):
@@ -1348,12 +1507,30 @@ class ExamplesPerSecondHook(basic_session_run_hooks.StepCounterHook):
   def _log_and_record(self, elapsed_steps, elapsed_time, global_step):
     examples_per_sec = self._batch_size * elapsed_steps / elapsed_time
     if self._summary_writer is not None:
-      example_summary = Summary(value=[Summary.Value(
-          tag='examples_sec', simple_value=examples_per_sec)])
+      example_summary = Summary(value=[
+          Summary.Value(tag='examples_sec', simple_value=examples_per_sec)
+      ])
       self._summary_writer.add_summary(example_summary, global_step)
     logging.info('examples/sec: %g', examples_per_sec)
 
 
+class InstallSignalHandlerHook(session_run_hook.SessionRunHook):
+  """Change SIGINT (CTRL^C) handler to force quit the process.
+
+  The default behavior often results in hanging processes.
+  The original handler is restored after training/evaluation.
+  """
+
+  def __init__(self):
+    self._signal_fn = signal.getsignal(signal.SIGINT)
+
+  def before_run(self, run_context):
+    signal.signal(signal.SIGINT, signal.SIG_DFL)
+
+  def end(self, session):
+    signal.signal(signal.SIGINT, self._signal_fn)
+
+
 class TPUEstimator(estimator_lib.Estimator):
   """Estimator with TPU support.
 
@@ -1361,30 +1538,28 @@ class TPUEstimator(estimator_lib.Estimator):
   replicating inputs and models for each core, and returning to host
   periodically to run hooks.
 
-  If `use_tpu` is false, all training, evaluation, and predict are executed on
-  CPU.
-
-  For training, TPUEstimator transforms a global batch size in params to a
-  per-shard batch size when calling the `input_fn` and `model_fn`. Users should
-  specify `train_batch_size` in constructor, and then get the batch size for
-  each shard in `input_fn` and `model_fn` by `params['batch_size']`. If
-  `TPUConfig.per_host_input_for_training` is `True`, `input_fn` is invoked per
-  host rather than per core. In this case, a global batch size is transformed a
-  per-host batch size in params for `input_fn`, but `model_fn` still gets
-  per-core batch size.
-
-  For evaluation, if `eval_batch_size` is None, it is executed on CPU, even if
-  `use_tpu` is `True`. If `eval_batch_size` is not `None`, it is executed on
-  TPU, which is an experimental feature. In this case, `model_fn` should return
-  `TPUEstimatorSpec` instead of `EstimatorSpec`, which expects the
-  `eval_metrics` for TPU evaluation.
-
+  TPUEstimator transforms a global batch size in params to a per-shard batch
+  size when calling the `input_fn` and `model_fn`. Users should specify
+  global batch size in constructor, and then get the batch size for each shard
+  in `input_fn` and `model_fn` by `params['batch_size']`.
+  For training, `model_fn` gets per-core batch size; `input_fn` may get
+  per-core or per-host batch size depending on
+  `per_host_input_for_training` in `TPUConfig`.
+  For evaluation, `model_fn` gets per-core batch size and `input_fn` get
+  per-host batch size.
+
+  `model_fn` should return `TPUEstimatorSpec`, which expects the `eval_metrics`
+  for TPU evaluation.
   `TPUEstimatorSpec.eval_metrics` is a tuple of `metric_fn` and `tensors`, where
   `tensors` could be a list of `Tensor`s or dict of names to `Tensor`s. (See
   `TPUEstimatorSpec` for details).  `metric_fn` takes the `tensors` and returns
   a dict from metric string name to the result of calling a metric function,
   namely a `(metric_tensor, update_op)` tuple.
 
+  One can set `use_tpu` to `False` for testing. All training, evaluation, and
+  predict will be executed on CPU. `input_fn` and `model_fn` will receive
+  `train_batch_size` or `eval_batch_size` unmodified as `params['batch_size']`.
+
   Current limitations:
 
   1. TPU evaluation only works on single host.
@@ -1457,8 +1632,7 @@ class TPUEstimator(estimator_lib.Estimator):
         basic python types. There are reserved keys for `TPUEstimator`,
         including 'batch_size'.
       use_tpu: A bool indicating whether TPU support is enabled. Currently,
-        - TPU training respects this bit.
-        - If true, see `eval_batch_size` for evaluate support.
+        - TPU training and evaluation respect this bit.
         - Predict still happens on CPU.
       train_batch_size: An int representing the global training batch size.
         TPUEstimator transforms this global batch size to a per-shard batch
@@ -1466,9 +1640,7 @@ class TPUEstimator(estimator_lib.Estimator):
         Cannot be `None` if `use_tpu` is `True`. Must be divisible by
         `config.tpu_config.num_shards`.
       eval_batch_size: An int representing the global training batch size.
-        Currently, if `None`, evaluation is still executed on CPU (even when
-        `use_tpu` is True). In near future, `use_tpu` will be the only option to
-        switch between TPU/CPU evaluation.
+        Must be divisible by `config.tpu_config.num_shards`.
       batch_axis: A python tuple of int values describing how each tensor
         produced by the Estimator `input_fn` should be split across the TPU
         compute shards. For example, if your input_fn produced (images, labels)
@@ -1488,9 +1660,8 @@ class TPUEstimator(estimator_lib.Estimator):
           '`config` must be provided with type `tpu_config.RunConfig`')
 
     if params is not None and any(k in params for k in _RESERVED_PARAMS_KEYS):
-      raise ValueError(
-          '{} are reserved keys but existed in params {}.'.format(
-              _RESERVED_PARAMS_KEYS, params))
+      raise ValueError('{} are reserved keys but existed in params {}.'.format(
+          _RESERVED_PARAMS_KEYS, params))
 
     if use_tpu:
       if train_batch_size is None:
@@ -1509,10 +1680,10 @@ class TPUEstimator(estimator_lib.Estimator):
             .format(train_batch_size, config.tpu_config.num_shards))
 
       if eval_batch_size is not None:
-        if config.tpu_config.num_shards > 8:
-          raise NotImplementedError(
-              'TPU evaluation is only supported with one host.')
-
+        if not isinstance(eval_batch_size, int):
+          raise ValueError('`eval_batch_size` must be an int')
+        if eval_batch_size < 1:
+          raise ValueError('`eval_batch_size` must be positive')
         if eval_batch_size % config.tpu_config.num_shards != 0:
           raise ValueError(
               'eval batch size {} must be divisible by number of shards {}'
@@ -1571,8 +1742,9 @@ class TPUEstimator(estimator_lib.Estimator):
     if max_steps is not None:
       util_lib.check_positive_integer(max_steps, 'Train max_steps')
 
-    return [_TPUStopAtStepHook(self._iterations_per_training_loop, steps,
-                               max_steps)]
+    return [
+        _TPUStopAtStepHook(self._iterations_per_training_loop, steps, max_steps)
+    ]
 
   def _convert_eval_steps_to_hooks(self, steps):
     with self._ctx.with_mode(model_fn_lib.ModeKeys.EVAL) as ctx:
@@ -1584,6 +1756,14 @@ class TPUEstimator(estimator_lib.Estimator):
 
     util_lib.check_positive_integer(steps, 'Eval steps')
 
+    if self._config.tpu_config.num_shards > 8:
+      raise NotImplementedError(
+          'TPU evaluation is only supported with one host.')
+
+    if self._ctx._eval_batch_size is None:  # pylint: disable=protected-access
+      raise ValueError('`eval_batch_size` cannot be `None`'
+                       'if evaluate() is called on TPU.')
+
     return [
         evaluation._StopAfterNEvalsHook(  # pylint: disable=protected-access
             num_evals=steps),
@@ -1640,6 +1820,7 @@ class TPUEstimator(estimator_lib.Estimator):
       # `features` in `model_fn` signature.
       def _input_fn():
         return input_fn(**kwargs)
+
       return _input_fn
 
   def _augment_model_fn(self, model_fn, batch_axis):
@@ -1661,22 +1842,31 @@ class TPUEstimator(estimator_lib.Estimator):
         input_fn = features
 
         input_holders = _InputPipeline(input_fn, batch_axis, ctx)
-        enqueue_ops, dequeue_fn = (
+        enqueue_ops, dequeue_fn, input_hooks, run_infeed_loop_on_coordinator = (
             input_holders.generate_infeed_enqueue_ops_and_dequeue_fn())
 
         if mode == model_fn_lib.ModeKeys.TRAIN:
-          loss, scaffold = (
+          loss, host_call, scaffold = (
               _train_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn))
+          host_ops = host_call.create_tpu_hostcall()
+          if host_ops is None:
+            host_ops = []
           hooks = [
-              TPUInfeedOutfeedSessionHook(ctx, enqueue_ops),
+              TPUInfeedOutfeedSessionHook(
+                  ctx,
+                  enqueue_ops,
+                  host_ops,
+                  run_infeed_loop_on_coordinator=(
+                      run_infeed_loop_on_coordinator)),
               ExamplesPerSecondHook(ctx.global_batch_size),
+              InstallSignalHandlerHook(),
               training.LoggingTensorHook(
                   {
                       'loss': array_ops.identity(loss),
                       'step': training.get_global_step()
                   },
                   every_n_secs=30)
-          ]
+          ] + input_hooks
           summary.scalar(model_fn_lib.LOSS_METRIC_KEY, loss)
           with ops.control_dependencies([loss]):
             update_ops = _sync_variables_ops()
@@ -1692,12 +1882,13 @@ class TPUEstimator(estimator_lib.Estimator):
               scaffold=scaffold)
 
         # Now eval.
-        total_loss, eval_metric_ops, scaffold = _eval_on_tpu_system(
+        total_loss, host_calls, scaffold = _eval_on_tpu_system(
             ctx, model_fn_wrapper, dequeue_fn)
         iterations_per_loop_var = _create_or_get_iterations_per_loop()
-        mean_loss = math_ops.div(
-            total_loss,
-            math_ops.cast(iterations_per_loop_var, dtype=total_loss.dtype))
+        mean_loss = math_ops.div(total_loss,
+                                 math_ops.cast(
+                                     iterations_per_loop_var,
+                                     dtype=total_loss.dtype))
 
         # Creates a dummy metric update_op for all metrics. Estimator expects
         # all metrics in eval_metric_ops have update_op and calls them one by
@@ -1713,11 +1904,24 @@ class TPUEstimator(estimator_lib.Estimator):
           with ops.control_dependencies(internal_ops_to_run):
             dummy_update_op = control_flow_ops.no_op()
 
-        eval_metric_ops, eval_update_ops = (
-            eval_metric_ops.to_metric_metric_ops_for_tpu(dummy_update_op))
+        host_call_ret = host_calls.create_tpu_hostcall()
+        eval_metric_ops = {}
+        eval_update_ops = []
+        for k, v in host_call_ret['eval_metrics'].items():
+          eval_metric_ops[k] = (v[0], dummy_update_op)
+          eval_update_ops.append(v[1])
+
+        if 'host_call' not in host_call_ret:
+          host_ops = []
+        else:
+          host_ops = host_call_ret['host_call']
         hooks = [
-            TPUInfeedOutfeedSessionHook(ctx, enqueue_ops, eval_update_ops),
-        ]
+            TPUInfeedOutfeedSessionHook(
+                ctx,
+                enqueue_ops,
+                eval_update_ops + host_ops,
+                run_infeed_loop_on_coordinator=run_infeed_loop_on_coordinator),
+        ] + input_hooks
 
         return model_fn_lib.EstimatorSpec(
             mode,
@@ -1725,6 +1929,7 @@ class TPUEstimator(estimator_lib.Estimator):
             evaluation_hooks=hooks,
             eval_metric_ops=eval_metric_ops,
             scaffold=scaffold)
+
     return _model_fn
 
 
@@ -1733,22 +1938,23 @@ def _eval_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn):
   num_cores = ctx.num_cores
   iterations_per_loop_var = _create_or_get_iterations_per_loop()
 
-  single_tpu_eval_step, eval_metric_ops, captured_scaffold_fn = (
+  single_tpu_eval_step, host_calls, captured_scaffold_fn = (
       model_fn_wrapper.convert_to_single_tpu_eval_step(dequeue_fn))
 
   def multi_tpu_eval_steps_on_single_shard():
-    return training_loop.repeat(iterations_per_loop_var,
-                                single_tpu_eval_step,
-                                [_ZERO_LOSS],
-                                name='loop')
+    return training_loop.repeat(
+        iterations_per_loop_var,
+        single_tpu_eval_step, [_ZERO_LOSS],
+        name='loop')
 
-  (loss,) = tpu.shard(multi_tpu_eval_steps_on_single_shard,
-                      inputs=[],
-                      num_shards=num_cores,
-                      outputs_from_all_shards=False)
+  (loss,) = tpu.shard(
+      multi_tpu_eval_steps_on_single_shard,
+      inputs=[],
+      num_shards=num_cores,
+      outputs_from_all_shards=False)
 
   scaffold = _get_scaffold(captured_scaffold_fn)
-  return loss, eval_metric_ops, scaffold
+  return loss, host_calls, scaffold
 
 
 def _train_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn):
@@ -1756,27 +1962,28 @@ def _train_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn):
   num_cores = ctx.num_cores
   iterations_per_loop_var = _create_or_get_iterations_per_loop()
 
-  single_tpu_train_step, captured_scaffold_fn = (
+  single_tpu_train_step, host_call, captured_scaffold_fn = (
       model_fn_wrapper.convert_to_single_tpu_train_step(dequeue_fn))
 
   def multi_tpu_train_steps_on_single_shard():
     return training_loop.repeat(
         iterations_per_loop_var,
-        single_tpu_train_step,
-        [_INITIAL_LOSS],
+        single_tpu_train_step, [_INITIAL_LOSS],
         name=b'loop')
 
-  (loss,) = tpu.shard(multi_tpu_train_steps_on_single_shard,
-                      inputs=[],
-                      num_shards=num_cores,
-                      outputs_from_all_shards=False)
+  (loss,) = tpu.shard(
+      multi_tpu_train_steps_on_single_shard,
+      inputs=[],
+      num_shards=num_cores,
+      outputs_from_all_shards=False)
 
   scaffold = _get_scaffold(captured_scaffold_fn)
-  return loss, scaffold
+  return loss, host_call, scaffold
 
 
 def _wrap_computation_in_while_loop(device, op_fn):
   """Wraps the ops generated by `op_fn` in tf.while_loop."""
+
   def computation(i):
     with ops.control_dependencies(op_fn()):
       return i + 1
@@ -1788,7 +1995,8 @@ def _wrap_computation_in_while_loop(device, op_fn):
     iterations = array_ops.identity(iterations_per_loop_var)
     return control_flow_ops.while_loop(
         lambda i: i < iterations,
-        computation, [constant_op.constant(0)], parallel_iterations=1)
+        computation, [constant_op.constant(0)],
+        parallel_iterations=1)
 
 
 def _validate_tpu_training_graph():
@@ -1801,8 +2009,9 @@ def _validate_tpu_training_graph():
 
   # Check if there is atleast one CrossReplicaSum operation in the graph
   # This should be introduced by using the CrossShardOptimizer wrapper
-  cross_replica_sum_ops = [o for o in operations
-                           if o.type == _CROSS_REPLICA_SUM_OP]
+  cross_replica_sum_ops = [
+      o for o in operations if o.type == _CROSS_REPLICA_SUM_OP
+  ]
   if not cross_replica_sum_ops:
     raise ValueError(
         'CrossShardOptimizer must be used for model training on TPUs.')
@@ -1849,9 +2058,11 @@ def _get_scaffold(captured_scaffold_fn):
 
   if scaffold:
     wrapped_finalize = scaffold.finalize
+
     def _finalize():
       with _CapturingContext('Inside Scaffold.finalize'):
         wrapped_finalize()
+
     scaffold.finalize = _finalize
   return scaffold
 
@@ -1866,9 +2077,8 @@ class _CapturingContext(control_flow_ops.ControlFlowContext):
   def AddOp(self, op):  # pylint: disable=invalid-name
     for c in op.inputs:
       if tpu._TPU_REPLICATE_ATTR in c.op.node_def.attr:  # pylint: disable=protected-access
-        raise ValueError(
-            '{}: Op {} depends on TPU computation {}, '
-            'which is not allowed.'.format(self._message, op, c))
+        raise ValueError('{}: Op {} depends on TPU computation {}, '
+                         'which is not allowed.'.format(self._message, op, c))
 
   def __enter__(self):
     # pylint: disable=protected-access
@@ -1879,3 +2089,60 @@ class _CapturingContext(control_flow_ops.ControlFlowContext):
 
   def __exit__(self, _, __, ___):  # pylint: disable=invalid-name
     self._g._set_control_flow_context(self._old)  # pylint: disable=protected-access
+
+
+# TODO(xiejw): Extend this to support internal signal.
+class _Inputs(object):
+  """A data structure representing the input_fn returned values.
+
+  This also supports the returned value from input_fn as `Dataset`.
+  """
+
+  def __init__(self, features=None, labels=None, dataset=None):
+    if dataset is not None and (features is not None or labels is not None):
+      raise RuntimeError('Internal Error: Either (features and labels) or '
+                         'dataset should be provided, not both. Please file '
+                         'bug')
+
+    self._features = features
+    self._labels = labels
+
+    self._dataset = dataset
+    self._iterator = None
+
+  @staticmethod
+  def from_input_fn(return_values):
+    """Returns an `_Inputs` instance according to `input_fn` return value."""
+    if isinstance(return_values, dataset_ops.Dataset):
+      dataset = return_values
+      return _Inputs(dataset=dataset)
+
+    if isinstance(return_values, tuple):
+      features, labels = return_values
+    else:
+      features, labels = return_values, None
+    return _Inputs(features, labels)
+
+  @property
+  def is_dataset(self):
+    """Returns True if the return value from input_fn is Dataset."""
+    return self._dataset is not None
+
+  def dataset_initializer_hook(self):
+    """Returns a `SessionRunHook` to initialize this dataset.
+
+    This must be called before `features_and_labels`.
+    """
+    iterator = self._dataset.make_initializable_iterator()
+    # pylint: disable=protected-access
+    hook = estimator_lib._DatasetInitializerHook(iterator)
+    self._iterator = iterator
+    return hook
+
+  def features_and_labels(self):
+    """Gets `features` and `labels`."""
+    if self.is_dataset:
+      return (_Inputs.from_input_fn(
+          self._iterator.get_next()).features_and_labels())
+
+    return (self._features, self._labels)
diff --git a/tensorflow/contrib/training/BUILD b/tensorflow/contrib/training/BUILD
index cccaa2b833ee764921508a5b6d6affe0b8822ede..6db373d2d5e20ea7da449530b2730403c3bb64cc 100644
--- a/tensorflow/contrib/training/BUILD
+++ b/tensorflow/contrib/training/BUILD
@@ -26,6 +26,7 @@ py_library(
         "python/training/resample.py",
         "python/training/sampling_ops.py",
         "python/training/sequence_queueing_state_saver.py",
+        "python/training/tensor_queue_dataset.py",
         "python/training/training.py",
         "python/training/tuner.py",
     ],
@@ -285,6 +286,28 @@ py_test(
     ],
 )
 
+py_test(
+    name = "tensor_queue_dataset_test",
+    size = "large",
+    srcs = ["python/training/tensor_queue_dataset_test.py"],
+    srcs_version = "PY2AND3",
+    tags = ["notsan"],
+    deps = [
+        ":training_py",
+        "//tensorflow/contrib/data/python/kernel_tests:dataset_serialization_test",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:gradients",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:random_seed",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/data",
+        "//third_party/py/numpy",
+    ],
+)
+
 filegroup(
     name = "all_files",
     srcs = glob(
diff --git a/tensorflow/contrib/training/python/training/batch_sequences_with_states_test.py b/tensorflow/contrib/training/python/training/batch_sequences_with_states_test.py
index 2a0ef0e6b3750b4f0464f1f4390819e1fc2c7872..dbdbb08a8252c799924812c83fff7f0631424761 100644
--- a/tensorflow/contrib/training/python/training/batch_sequences_with_states_test.py
+++ b/tensorflow/contrib/training/python/training/batch_sequences_with_states_test.py
@@ -53,7 +53,7 @@ class BatchSequencesWithStatesTest(test.TestCase):
     sp_tensor1 = sparse_tensor.SparseTensor(
         array_ops.constant(ind1, dtypes.int64),
         array_ops.constant(val1, dtypes.int64),
-        array_ops.constant(shape1, dtypes.int64))
+        array_ops.placeholder_with_default(shape1, shape=[2]))
     ind2 = np.array([
         [0, 0, 1],
         [0, 1, 0],
@@ -68,7 +68,7 @@ class BatchSequencesWithStatesTest(test.TestCase):
     sp_tensor2 = sparse_tensor.SparseTensor(
         array_ops.constant(ind2, dtypes.int64),
         array_ops.constant(val2, dtypes.int64),
-        array_ops.constant(shape2, dtypes.int64))
+        array_ops.placeholder_with_default(shape2, shape=[3]))
     sp_tensor3 = sparse_tensor.SparseTensor(
         array_ops.constant([[1, 9], [2, 2], [2, 10]], dtypes.int64),
         array_ops.constant([7, 15, 2], dtypes.int64),
@@ -320,6 +320,18 @@ class BatchSequencesWithStatesTest(test.TestCase):
   def testNotAMultiple(self):
     num_unroll = 3  # Not a divisor of value_length -
     # so padding would have been necessary.
+
+    # Use placeholder_with_default in sequences to make sure we get runtime
+    # error instead of shape inference error
+    sequences = {
+        "seq1": array_ops.placeholder_with_default(self.sequences["seq1"],
+                                                   shape=(None, 5)),
+        "seq2": array_ops.placeholder_with_default(self.sequences["seq2"],
+                                                   shape=(None, 4, 2)),
+        "seq3": self.sequences["seq3"],
+        "seq4": self.sequences["seq4"],
+    }
+
     with self.test_session() as sess:
       with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
                                    ".*should be a multiple of: 3, but saw "
@@ -330,7 +342,7 @@ class BatchSequencesWithStatesTest(test.TestCase):
           with coord.stop_on_exception():
             next_batch = sqss.batch_sequences_with_states(
                 input_key=self.key,
-                input_sequences=self.sequences,
+                input_sequences=sequences,
                 input_context=self.context,
                 input_length=3,
                 initial_states=self.initial_states,
@@ -493,6 +505,18 @@ class BatchSequencesWithStatesTest(test.TestCase):
         expected_seq4_batch2=expected_seq4_batch2)
 
 
+class BatchSequencesWithStatesTestWithCApi(BatchSequencesWithStatesTest):
+
+  def setUp(self):
+    self._prev_value = ops._USE_C_API
+    ops._USE_C_API = True
+    super(BatchSequencesWithStatesTestWithCApi, self).setUp()
+
+  def tearDown(self):
+    super(BatchSequencesWithStatesTestWithCApi, self).tearDown()
+    ops._USE_C_API = self._prev_value
+
+
 class PaddingTest(test.TestCase):
 
   def testPaddingInvalidLengths(self):
diff --git a/tensorflow/contrib/training/python/training/tensor_queue_dataset.py b/tensorflow/contrib/training/python/training/tensor_queue_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..409aba817c1ec37003eb98f000f6cf8918234c5d
--- /dev/null
+++ b/tensorflow/contrib/training/python/training/tensor_queue_dataset.py
@@ -0,0 +1,200 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Python wrappers for Datasets and Iterators."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.util import nest
+from tensorflow.python.data.util import sparse
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import gen_dataset_ops
+from tensorflow.python.util import nest as tf_nest
+
+
+class _PrependFromQueueAndPaddedBatchDataset(dataset_ops.Dataset):
+  """A `Dataset` that prepends a queue to another `Dataset`.
+
+  A vector of handles to the queue is returned as the first component of
+  the associated iterator.  This vector can be passed to
+  `enqueue_in_queue_dataset` to add new elements to the queue.
+  """
+
+  def __init__(self, input_dataset, batch_size, padded_shapes, padding_values):
+    """Initialize `PrependFromQueueAndPaddedBatchDataset`."""
+    super(_PrependFromQueueAndPaddedBatchDataset, self).__init__()
+    if sparse.any_sparse(input_dataset.output_classes):
+      raise TypeError(
+          "Batching of padded sparse tensors is not currently supported")
+    self._input_dataset = input_dataset
+    self._batch_size = ops.convert_to_tensor(
+        batch_size, dtype=dtypes.int64, name="batch_size")
+    # pylint: disable=protected-access
+    if padded_shapes is None:
+      self._padded_shapes = nest.map_structure(
+          dataset_ops._partial_shape_to_tensor, input_dataset.output_shapes)
+    else:
+      self._padded_shapes = nest.map_structure_up_to(
+          input_dataset.output_shapes, dataset_ops._partial_shape_to_tensor,
+          padded_shapes)
+    padding_values = (
+        padding_values if padding_values is not None else
+        dataset_ops._default_padding(input_dataset))
+    self._padding_values = nest.map_structure_up_to(
+        input_dataset.output_shapes, dataset_ops._padding_value_to_tensor,
+        padding_values, input_dataset.output_types)
+    # pylint: enable=protected-access
+
+  def _as_variant_tensor(self):
+    # pylint: disable=protected-access
+    return gen_dataset_ops.prepend_from_queue_and_padded_batch_dataset(
+        self._input_dataset._as_variant_tensor(),
+        batch_size=self._batch_size,
+        padded_shapes=[
+            ops.convert_to_tensor(s, dtype=dtypes.int64)
+            for s in nest.flatten(self._padded_shapes)
+        ],
+        padding_values=nest.flatten(self._padding_values),
+        output_shapes=nest.flatten(
+            sparse.as_dense_shapes(self.output_shapes, self.output_classes)))
+    # pylint: enable=protected-access
+
+  @property
+  def output_classes(self):
+    return (ops.Tensor, self._input_dataset.output_classes)
+
+  def _as_batch_shape(self, shape_like):
+    return tensor_shape.vector(None).concatenate(
+        tensor_util.constant_value_as_shape(shape_like))
+
+  @property
+  def output_shapes(self):
+    # First output is a variant representing the Queue
+    return (tensor_shape.vector(None),
+            nest.map_structure(self._as_batch_shape, self._padded_shapes))
+
+  @property
+  def output_types(self):
+    # First output is a variant representing the Queue
+    return (dtypes.variant, self._input_dataset.output_types)
+
+
+def prepend_from_queue_and_padded_batch_dataset(batch_size,
+                                                padding_values=None,
+                                                padded_shapes=None):
+  """A transformation that prepends a queue to a `Dataset` and batches results.
+
+  A vector of handles to the queue is returned as the first component of the
+  associated iterator.  This vector can be passed to `enqueue_in_queue_dataset`
+  to add new elements to the queue.
+
+  Below is an example of how this dataset might be used to split incoming
+  variable-length sequences into "head" and "rest" parts, where "rest" parts
+  are re-enqueued back into the dataset.  A more realistic example would
+  perform some calculation on the "head" and modify some components of "rest"
+  with the result (before re-enqueueing).
+
+  ```python
+  dataset = tf.data.Dataset.from_tensor_slices([2*x for x in range(10)])
+  # Make a dataset of variable-length vectors and their lengths.
+  dataset = dataset.map(lambda count: (count, tf.ones((count,))))
+  # Emit a queue we can prepend to, and counts/values as padded batch.
+  dataset = dataset.apply(
+      tf.contrib.training.prepend_from_queue_and_padded_batch_dataset(
+        batch_size=10))
+  dataset = dataset.prefetch(1)
+
+  iterator = dataset.make_one_shot_iterator()
+  queue, (count, padded_value) = iterator.get_next()
+
+  # Split the padded_value into two pieces: head and rest
+  rest_indices = tf.squeeze(tf.where(count > 3), axis=1)
+  bound = tf.minimum(3, tf.reduce_max(count))
+  value_head = padded_value[:, :bound]
+  count_rest = tf.gather(count - 3, rest_indices)
+  value_rest = tf.gather(padded_value[:, bound:], rest_indices)
+  queue_rest = tf.gather(queue, rest_indices)
+  enqueue_rest_op = tf.contrib.training.enqueue_in_queue_dataset(
+    queue_rest, (count_rest, value_rest))
+  with tf.control_dependencies([enqueue_rest_op]):
+    calculation = fn(value_head)
+
+  while True:  # Will raise OutOfRange when finished with all pieces.
+    session.run(calculation)
+  ```
+
+  Args:
+    batch_size: `int64` scalar tensor.  The batch size to use when performing
+      padded batching.
+    padding_values: (optional) Nested tuple of scalar tensors.  If provided,
+      the structure and dtypes of padding_values should match that of
+      incoming dataset's `output_types`.
+    padded_shapes: (optional) Nested tuple of `int64` vector tensors.
+      If provided, the structure must match that of the incoming dataset's
+      `output_types`.  If not provided, the incoming dataset's `output_shapes`
+      is used.  Any unknown (`None` or `-1`) dimensions in the shapes are
+      treated as being unique per-batch: for each batch time, an unknown
+      dimension is replaced with the maximum given value of this dimension
+      across all tensors for the given component in the batch.
+
+  Returns:
+    A `Dataset` transformation function, which can be passed to
+    @{tf.data.Dataset.apply}.
+  """
+
+  def _apply_fn(dataset):
+    return _PrependFromQueueAndPaddedBatchDataset(
+        dataset,
+        batch_size=batch_size,
+        padding_values=padding_values,
+        padded_shapes=padded_shapes)
+
+  return _apply_fn
+
+
+def enqueue_in_queue_dataset(queue, components):
+  """Enqueue components into queue from `PrependFromQueueAndPaddedBatchDataset`.
+
+  The components' dtypes and shapes must be compatible with the `output_shapes`
+  attribute of the `dataset` created by
+  `prepend_from_queue_and_padded_batch_dataset`.  This operation supports both
+  non-batched and batched modes.
+
+  For more details, see the example in the docstring for
+  `prepend_from_queue_and_padded_batch_dataset`.
+
+  Args:
+    queue: `variant` scalar or vector tensor.
+      The tensor emitted by the first component of the iterator associated with
+      `prepend_from_queue_and_padded_batch_dataset`.  If this is a scalar,
+      then the `components` input tensors should not have a prepended batch
+      dimension.
+    components: Nested tuple of tensors, each with a leading batch dimension
+      if `queue` is a vector.  The structure, dtypes, and shapes
+      (excluding batch dimension) must match the nested tuples
+      `dataset.output_types[1]` and `dataset.output_shapes[1]` (the non-queue
+      output types and shapes) of the `dataset` emitted by
+      the original `prepend_from_queue_and_padded_batch_dataset` call.
+
+  Returns:
+    An `Operation` that enqueues `components` into the dataset(s) associated
+    with entries of `queue`.
+  """
+  return gen_dataset_ops.enqueue_in_queue_dataset(
+      queue=queue, components=tf_nest.flatten(components))
diff --git a/tensorflow/contrib/training/python/training/tensor_queue_dataset_test.py b/tensorflow/contrib/training/python/training/tensor_queue_dataset_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..0338f409a203c232e63e99534a8f6d6a43fa661e
--- /dev/null
+++ b/tensorflow/contrib/training/python/training/tensor_queue_dataset_test.py
@@ -0,0 +1,355 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for TensorQueueDataset."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.data.python.kernel_tests import dataset_serialization_test_base
+from tensorflow.contrib.training.python.training import tensor_queue_dataset as tqd
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import string_ops
+from tensorflow.python.platform import test
+
+
+class PrependFromQueueAndPaddedBatchDatasetTest(test.TestCase):
+
+  def testNoEnqueue(self):
+    dataset = dataset_ops.Dataset.from_tensor_slices([0, 1, 2])
+    dataset = dataset.apply(
+        tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=1))
+    self.assertEqual((dtypes.variant, dtypes.int32), dataset.output_types)
+    self.assertAllEqual(([None],) * 2,
+                        [x.as_list() for x in dataset.output_shapes])
+    iterator = dataset.make_one_shot_iterator()
+    _, value = iterator.get_next()
+    self.assertEqual([0], self.evaluate(value))
+    self.assertEqual([1], self.evaluate(value))
+    self.assertEqual([2], self.evaluate(value))
+    with self.assertRaisesOpError("End of sequence"):
+      self.evaluate(value)
+
+  def testBatchedNoEnqueue(self):
+    dataset = dataset_ops.Dataset.from_tensor_slices([0, 1, 2])
+    dataset = dataset.apply(
+        tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=2))
+    iterator = dataset.make_one_shot_iterator()
+    _, value = iterator.get_next()
+    self.assertAllEqual([0, 1], self.evaluate(value))
+    self.assertAllEqual([2], self.evaluate(value))
+    with self.assertRaisesOpError("End of sequence"):
+      self.evaluate(value)
+
+  def testBatchedWithBiggerPaddingNoEnqueue(self):
+    dataset = dataset_ops.Dataset.from_tensor_slices([[0], [1], [2]])
+    dataset = dataset.apply(
+        tqd.prepend_from_queue_and_padded_batch_dataset(
+            batch_size=2, padded_shapes=[3]))
+    iterator = dataset.make_one_shot_iterator()
+    _, value = iterator.get_next()
+    self.assertAllEqual([[0, 0, 0], [1, 0, 0]], self.evaluate(value))
+    self.assertAllEqual([[2, 0, 0]], self.evaluate(value))
+    with self.assertRaisesOpError("End of sequence"):
+      self.evaluate(value)
+
+  def testBatchedWithBiggerPaddingOneEnqueue(self):
+    dataset = dataset_ops.Dataset.from_tensor_slices([[0], [1], [2]])
+    dataset = dataset.apply(
+        tqd.prepend_from_queue_and_padded_batch_dataset(
+            batch_size=1, padded_shapes=[3]))
+    iterator = dataset.make_one_shot_iterator()
+    queue_handle, value = iterator.get_next()
+    enqueue_negative = tqd.enqueue_in_queue_dataset(queue_handle, -value)
+    with self.test_session() as sess:
+      self.assertAllEqual([[0, 0, 0]], sess.run(value))
+      value_1, _ = sess.run([value, enqueue_negative])
+      self.assertAllEqual([[1, 0, 0]], value_1)
+      value_2, _ = sess.run([value, enqueue_negative])
+      self.assertAllEqual([[-1, 0, 0]], value_2)
+      value_3 = sess.run(value)
+      self.assertAllEqual([[1, 0, 0]], value_3)
+      value_4, _ = sess.run([value, enqueue_negative])
+      self.assertAllEqual([[2, 0, 0]], value_4)
+      value_5 = sess.run(value)
+      self.assertAllEqual([[-2, 0, 0]], value_5)
+      with self.assertRaisesOpError("End of sequence"):
+        sess.run(value)
+
+  def testOneEnqueue(self):
+    dataset = dataset_ops.Dataset.from_tensor_slices([0, 1, 2])
+    dataset = dataset.apply(
+        tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=1))
+    iterator = dataset.make_one_shot_iterator()
+    queue_handle, value = iterator.get_next()
+    enqueue_negative = tqd.enqueue_in_queue_dataset(queue_handle, -value)
+    with self.test_session() as sess:
+      self.assertEqual([0], sess.run(value))
+      value_1, _ = sess.run([value, enqueue_negative])
+      self.assertEqual([1], value_1)
+      value_2, _ = sess.run([value, enqueue_negative])
+      self.assertEqual([-1], value_2)
+      value_3 = sess.run(value)
+      self.assertEqual([1], value_3)
+      value_4, _ = sess.run([value, enqueue_negative])
+      self.assertEqual([2], value_4)
+      value_5 = sess.run(value)
+      self.assertEqual([-2], value_5)
+      with self.assertRaisesOpError("End of sequence"):
+        sess.run(value)
+
+  def testBatchedOneEnqueue(self):
+    dataset = dataset_ops.Dataset.from_tensor_slices([0, 1, 2])
+    dataset = dataset.apply(
+        tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=2))
+    iterator = dataset.make_one_shot_iterator()
+    queue_handle, value = iterator.get_next()
+    enqueue_negative = tqd.enqueue_in_queue_dataset(queue_handle, -value)
+    enqueue_zeroth = tqd.enqueue_in_queue_dataset([queue_handle[0]],
+                                                  array_ops.expand_dims(
+                                                      value[0], axis=0))
+    with self.test_session() as sess:
+      value_0, _ = sess.run([value, enqueue_negative])
+      self.assertAllEqual([0, 1], value_0)
+      value_1, _ = sess.run([value, enqueue_zeroth])
+      self.assertAllEqual([0, -1], value_1)
+      value_2, _ = sess.run([value, enqueue_negative])
+      self.assertAllEqual([0, 2], value_2)
+      self.assertAllEqual([0, -2], sess.run(value))
+      with self.assertRaisesOpError("End of sequence"):
+        sess.run(value)
+
+  def testManyEnqueue(self):
+    dataset = dataset_ops.Dataset.from_tensor_slices([0, 1])
+    dataset = dataset.apply(
+        tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=1))
+    iterator = dataset.make_one_shot_iterator()
+    queue_handle, value = iterator.get_next()
+    enqueue_many_more = [
+        tqd.enqueue_in_queue_dataset(queue_handle, value + 100 + i)
+        for i in range(1000)
+    ]
+    with self.test_session() as sess:
+      value_0, _ = sess.run((value, enqueue_many_more))
+      self.assertEqual([0], value_0)
+      rest = []
+      for _ in range(1000):
+        rest.append(sess.run(value))
+      self.assertEquals([[100 + i] for i in range(1000)], sorted(rest))
+      # Going back to the original input.
+      value_1, _ = sess.run((value, enqueue_many_more))
+      self.assertEqual(1, value_1)
+      rest = []
+      for _ in range(1000):
+        rest.append(sess.run(value))
+      self.assertEquals([[100 + i + 1] for i in range(1000)], sorted(rest))
+      with self.assertRaisesOpError("End of sequence"):
+        sess.run(value)
+
+  def testEnqueueWithPrefetch(self):
+    dataset = dataset_ops.Dataset.from_tensor_slices([0])
+    dataset = dataset.apply(
+        tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=1))
+    # Prefetching will request additional values before they are
+    # available to the queue.
+    dataset = dataset.prefetch(buffer_size=3)
+    iterator = dataset.make_one_shot_iterator()
+    queue_handle, value = iterator.get_next()
+    enqueue = tqd.enqueue_in_queue_dataset(queue_handle, value + 1)
+    with self.test_session() as sess:
+      i = 0
+      while i < 4:
+        received, _ = sess.run((value, enqueue))
+        if received.size > 0:
+          self.assertAllEqual([i], received)
+          i += 1
+      received_last = False
+      while True:
+        try:
+          received = sess.run(value)
+          if received.size > 0:
+            self.assertAllEqual([4], received)
+            received_last = True
+        except errors.OutOfRangeError:
+          break
+      self.assertTrue(received_last)
+
+  def testDatasetWithPaddedShapeSmallerThanInputFails(self):
+    dataset = dataset_ops.Dataset.from_tensor_slices([[0, 0, 0]]).repeat(None)
+    dataset = dataset.apply(
+        tqd.prepend_from_queue_and_padded_batch_dataset(
+            batch_size=1, padded_shapes=[2]))
+    iterator = dataset.make_one_shot_iterator()
+    _, value = iterator.get_next()
+    with self.test_session() as sess:
+      with self.assertRaisesOpError(
+          r"Incompatible input shapes at component 0 between "
+          r"input dataset this dataset: \[3\] vs. \[2\]"):
+        sess.run(value)
+
+  def testEnqueueWithIncompatibleInputsFailsWithInformativeError(self):
+    dataset = dataset_ops.Dataset.from_tensor_slices([0]).repeat(None)
+    dataset = dataset.apply(
+        tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=1))
+    iterator = dataset.make_one_shot_iterator()
+    queue_handle, value = iterator.get_next()
+
+    enqueue_bad_structure = tqd.enqueue_in_queue_dataset(
+        queue_handle, (value, value))
+    enqueue_bad_dtype = tqd.enqueue_in_queue_dataset(queue_handle,
+                                                     np.array(
+                                                         [1.0],
+                                                         dtype=np.float32))
+    enqueue_bad_shape_no_batch_dim = tqd.enqueue_in_queue_dataset(
+        queue_handle, ([1],))
+    enqueue_bad_shape = tqd.enqueue_in_queue_dataset(queue_handle,
+                                                     np.array(
+                                                         [[1]], dtype=np.int32))
+
+    with self.test_session() as sess:
+      with self.assertRaisesOpError(
+          "mismatched number of tensors.  Queue expects 1 tensors but "
+          "tried to insert 2"):
+        sess.run(enqueue_bad_structure)
+      with self.assertRaisesOpError(r"Expected component 0 to have batched "
+                                    r"shape \[1,...\], but saw shape: \[\]"):
+        sess.run(enqueue_bad_shape_no_batch_dim)
+      with self.assertRaisesOpError(
+          r"mismatched shapes at component 0.  Attempted to insert tensor "
+          r"with shape \[1\] but queue expected shape: \[\]"):
+        sess.run(enqueue_bad_shape)
+      with self.assertRaisesOpError(
+          r"mismatched dtypes at component 0.  Attempted to insert tensor "
+          r"of type float but queue expected type: int32"):
+        sess.run(enqueue_bad_dtype)
+
+  def testEnqueueWithPaddedBatchFailsWithInformativeError(self):
+    dataset = dataset_ops.Dataset.from_tensor_slices([0, 1, 2])
+    dataset = dataset.apply(
+        tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=1))
+    with self.assertRaisesRegexp(
+        TypeError, r"Unable to create padding for field of type 'variant'"):
+      dataset.padded_batch(batch_size=10, padded_shapes=[1])
+
+  def testOneEnqueueWithPadding(self):
+    dataset = dataset_ops.Dataset.from_tensor_slices([0, 2, 4, 6])
+    # Make a dataset of variable-length vectors and their lengths.
+    dataset = dataset.map(
+        lambda c: (c, c * array_ops.ones((c,), dtype=c.dtype)))
+    # Emit a queue we can prepend to, and counts/values as padded
+    # batch.
+    dataset = dataset.apply(
+        tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=3))
+
+    iterator = dataset.make_one_shot_iterator()
+    queue, (count, padded_value) = iterator.get_next()
+
+    # Split the padded_value into two pieces: head and rest
+    rest_indices = array_ops.squeeze(array_ops.where(count > 2), axis=1)
+    bound = math_ops.minimum(2, math_ops.reduce_max(count))
+    value_head = padded_value[:, :bound]
+    count_rest = array_ops.gather(count - 2, rest_indices)
+    value_rest = array_ops.gather(padded_value, rest_indices)[:, bound:]
+    queue_rest = array_ops.gather(queue, rest_indices)
+    enqueue_rest_op = tqd.enqueue_in_queue_dataset(queue_rest,
+                                                   (count_rest, value_rest))
+    with ops.control_dependencies([enqueue_rest_op]):
+      calc = array_ops.identity(value_head)
+
+    with self.test_session() as sess:
+      self.assertAllEqual([[0, 0], [2, 2], [4, 4]], sess.run(calc))
+      self.assertAllEqual([[4, 4], [6, 6]], sess.run(calc))
+      self.assertAllEqual([[6, 6]], sess.run(calc))
+      self.assertAllEqual([[6, 6]], sess.run(calc))
+      # Get some final batches due to prefetching.
+      for _ in range(3):
+        try:
+          self.assertAllEqual(
+              np.empty(shape=(0, 0), dtype=np.int32), sess.run(calc))
+        except errors.OutOfRangeError as e:
+          self.assertTrue(str(e).startswith("End of sequence"))
+
+  def testNonstandardPadding(self):
+    dataset = dataset_ops.Dataset.from_tensor_slices([0, 2, 4, 6])
+    # Make a dataset of variable-length vectors and their lengths.
+    dataset = dataset.map(
+        lambda c: (c, c * array_ops.ones((c,), dtype=c.dtype)))
+    # Emit a queue we can prepend to, and counts/values as padded
+    # batch.
+    dataset = dataset.apply(
+        tqd.prepend_from_queue_and_padded_batch_dataset(
+            batch_size=3, padding_values=(
+                0,
+                -1,
+            )))
+
+    iterator = dataset.make_one_shot_iterator()
+    _, (unused_count, padded_value) = iterator.get_next()
+
+    with self.test_session() as sess:
+      self.assertAllEqual([[-1, -1, -1, -1], [2, 2, -1, -1], [4, 4, 4, 4]],
+                          sess.run(padded_value))
+      self.assertAllEqual([[6] * 6], sess.run(padded_value))
+      with self.assertRaisesOpError("End of sequence"):
+        sess.run(padded_value)
+
+
+# TODO(ebrevdo): Figure out how to use run_core_tests to test state
+# saving of an iterator that's had some tensors enqueued into its queue.
+class PrependFromQueueAndPaddedBatchDatasetSerializationTest(
+    dataset_serialization_test_base.DatasetSerializationTestBase):
+
+  def testPrependFromQueueAndPaddedBatch(self):
+
+    def build_dataset(seq_lens):
+      return dataset_ops.Dataset.from_tensor_slices(seq_lens).map(
+          lambda x: array_ops.fill([x], x)).apply(
+              tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=4))
+
+    seq_lens1 = np.random.randint(1, 20, size=(32,)).astype(np.int32)
+    seq_lens2 = np.random.randint(21, 40, size=(32,)).astype(np.int32)
+    self.run_core_tests(lambda: build_dataset(seq_lens1),
+                        lambda: build_dataset(seq_lens2), 8)
+
+  def testPrependFromQueueAndPaddedBatchNonDefaultPadding(self):
+
+    def build_dataset(seq_lens):
+
+      def fill_tuple(x):
+        filled = array_ops.fill([x], x)
+        return (filled, string_ops.as_string(filled))
+
+      padded_shape = [-1]
+      return dataset_ops.Dataset.from_tensor_slices(seq_lens).map(
+          fill_tuple).apply(
+              tqd.prepend_from_queue_and_padded_batch_dataset(
+                  batch_size=4,
+                  padded_shapes=(padded_shape, padded_shape),
+                  padding_values=(-1, "<end>")))
+
+    seq_lens1 = np.random.randint(1, 20, size=(32,)).astype(np.int32)
+    seq_lens2 = np.random.randint(21, 40, size=(32,)).astype(np.int32)
+    self.run_core_tests(lambda: build_dataset(seq_lens1),
+                        lambda: build_dataset(seq_lens2), 8)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/util/convert_graphdef_memmapped_format_lib.cc b/tensorflow/contrib/util/convert_graphdef_memmapped_format_lib.cc
index 2992a61ea8186caada394208e9c27ddffe896dd1..9675428e56e93c9669753371dbca47d56325b0c4 100644
--- a/tensorflow/contrib/util/convert_graphdef_memmapped_format_lib.cc
+++ b/tensorflow/contrib/util/convert_graphdef_memmapped_format_lib.cc
@@ -142,9 +142,9 @@ Status ConvertConstantsToImmutable(const string& in_graph_filename,
   const auto load_graph_status =
       ReadBinaryProto(default_env, in_graph_filename, &graph_def);
   if (!load_graph_status.ok()) {
-    return tensorflow::errors::NotFound("Failed to load graph at '",
-                                        in_graph_filename, "' : ",
-                                        load_graph_status.error_message());
+    return tensorflow::errors::NotFound(
+        "Failed to load graph at '", in_graph_filename,
+        "' : ", load_graph_status.error_message());
   }
 
   NodeConverter node_converter;
diff --git a/tensorflow/contrib/util/inspect_checkpoint.cc b/tensorflow/contrib/util/inspect_checkpoint.cc
index 39088aeaad68e26344b2e89ce10ae6da8026e481..9b578ceb07548b8d198f64bc859d31c92774a286 100644
--- a/tensorflow/contrib/util/inspect_checkpoint.cc
+++ b/tensorflow/contrib/util/inspect_checkpoint.cc
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/util/tensor_slice_reader.h"
 
 namespace tensorflow {
diff --git a/tensorflow/contrib/verbs/verbs_server_lib.cc b/tensorflow/contrib/verbs/verbs_server_lib.cc
index 47ed83f521c5e6165c906ea557e74faf27df2112..1a0b5028febb7b11f979abd179a3227a2615252d 100644
--- a/tensorflow/contrib/verbs/verbs_server_lib.cc
+++ b/tensorflow/contrib/verbs/verbs_server_lib.cc
@@ -49,8 +49,8 @@ VerbsServer::~VerbsServer() {
 Status VerbsServer::ChannelCacheFactory(const ServerDef& server_def,
                                         GrpcChannelCache** channel_cache) {
   string name_prefix =
-      strings::StrCat("/job:", server_def.job_name(), "/replica:0", "/task:",
-                      server_def.task_index());
+      strings::StrCat("/job:", server_def.job_name(), "/replica:0",
+                      "/task:", server_def.task_index());
 
   GrpcChannelSpec channel_spec;
   TF_RETURN_IF_ERROR(ParseChannelSpec(server_def, &channel_spec));
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 94973a0e520e494ce2ccc947a803e10681ff5e21..a8a8c34846a6bc0ccf46bec70c88b42b7abba01f 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -381,7 +381,6 @@ cc_library(
     srcs = ["platform/stacktrace_handler.cc"],
     hdrs = ["platform/stacktrace_handler.h"],
     deps = [
-        ":abi",
         ":lib",
         ":lib_platform",
     ],
@@ -611,6 +610,7 @@ tf_gen_op_libs(
         "list_ops",
         "lookup_ops",
         "logging_ops",
+        "manip_ops",
         "math_ops",
         "nn_ops",
         "no_op",
@@ -693,6 +693,7 @@ cc_library(
         ":list_ops_op_lib",
         ":logging_ops_op_lib",
         ":lookup_ops_op_lib",
+        ":manip_ops_op_lib",
         ":math_ops_op_lib",
         ":nn_ops_op_lib",
         ":no_op_op_lib",
@@ -830,6 +831,7 @@ cc_library(
         "//tensorflow/core/kernels:list_kernels",
         "//tensorflow/core/kernels:lookup",
         "//tensorflow/core/kernels:logging",
+        "//tensorflow/core/kernels:manip",
         "//tensorflow/core/kernels:math",
         "//tensorflow/core/kernels:multinomial_op",
         "//tensorflow/core/kernels:nn",
@@ -1152,6 +1154,7 @@ cc_library(
     deps = [
         ":protos_all_cc_impl",
         "//third_party/eigen3",
+        "@nsync//:nsync_cpp",
         "@protobuf_archive//:protobuf",
     ],
     alwayslink = 1,
@@ -1352,6 +1355,13 @@ tf_pyclif_proto_library(
     visibility = ["//visibility:public"],
 )
 
+tf_pyclif_proto_library(
+    name = "protobuf/device_properties_pyclif",
+    proto_lib = ":protos_all_cc",
+    proto_srcfile = "protobuf/device_properties.proto",
+    visibility = ["//visibility:public"],
+)
+
 # -----------------------------------------------------------------------------
 # Internal targets
 
@@ -1896,6 +1906,14 @@ cc_library(
     ],
 )
 
+tf_cuda_library(
+    name = "cuda_device_functions",
+    hdrs = ["util/cuda_device_functions.h"],
+    cuda_deps = ["//third_party_gpus/cuda:cuda_headers"],
+    visibility = ["//visibility:public"],
+    deps = [":framework_lite"],
+)
+
 # TODO(josh11b): Is this needed, or can we just use ":protos_all_cc"?
 cc_library(
     name = "protos_cc",
@@ -2406,7 +2424,6 @@ cc_library(
     deps = [
         ":lib",
         ":lib_internal",
-        ":stacktrace_handler",
         ":test",  # buildcleaner: keep
         "//tensorflow/core/platform/default/build_config:test_main",
     ],
diff --git a/tensorflow/core/api_def/base_api/api_def_AssignAddVariableOp.pbtxt b/tensorflow/core/api_def/base_api/api_def_AssignAddVariableOp.pbtxt
index 5d21d7bab699ff481c65ed44eb9bf66ec14ea387..ac05b54eea95f70e4a6db843aab13adf7b94602c 100644
--- a/tensorflow/core/api_def/base_api/api_def_AssignAddVariableOp.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_AssignAddVariableOp.pbtxt
@@ -20,10 +20,7 @@ END
   }
   summary: "Adds a value to the current value of a variable."
   description: <<END
-Any ReadVariableOp which depends directly or indirectly on this assign is
-guaranteed to see the incremented value or a subsequent newer one.
-
-Outputs the incremented value, which can be used to totally order the
-increments to this variable.
+Any ReadVariableOp with a control dependency on this op is guaranteed to
+see the incremented value or a subsequent newer one.
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_AssignSubVariableOp.pbtxt b/tensorflow/core/api_def/base_api/api_def_AssignSubVariableOp.pbtxt
index 102201c4cb07b080c46a28a91af8a4176034f6e6..9dd28f8711222754da531a961e1029f5bcf35ce8 100644
--- a/tensorflow/core/api_def/base_api/api_def_AssignSubVariableOp.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_AssignSubVariableOp.pbtxt
@@ -20,10 +20,7 @@ END
   }
   summary: "Subtracts a value from the current value of a variable."
   description: <<END
-Any ReadVariableOp which depends directly or indirectly on this assign is
-guaranteed to see the incremented value or a subsequent newer one.
-
-Outputs the incremented value, which can be used to totally order the
-increments to this variable.
+Any ReadVariableOp with a control dependency on this op is guaranteed to
+see the decremented value or a subsequent newer one.
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_EnqueueInQueueDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_EnqueueInQueueDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9722f5ede30cb0b893171bfc36a0eb8c1ab3c7e2
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_EnqueueInQueueDataset.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "EnqueueInQueueDataset"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_FusedResizeAndPadConv2D.pbtxt b/tensorflow/core/api_def/base_api/api_def_FusedResizeAndPadConv2D.pbtxt
index a72f2bfe5fc90ed7055a0d5354af81f8eee6a7d8..118d0e2178ada7c7b217cd43297830d2bae84f1c 100644
--- a/tensorflow/core/api_def/base_api/api_def_FusedResizeAndPadConv2D.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_FusedResizeAndPadConv2D.pbtxt
@@ -30,9 +30,8 @@ END
   attr {
     name: "resize_align_corners"
     description: <<END
-If true, rescale input by (new_height - 1) / (height - 1),
-which exactly aligns the 4 corners of images and resized images. If false, rescale
-by new_height / height. Treat similarly the width dimension.
+If true, the centers of the 4 corner pixels of the input and output tensors are
+aligned, preserving the values at the corner pixels. Defaults to false.
 END
   }
   attr {
diff --git a/tensorflow/core/api_def/base_api/api_def_MatchingFiles.pbtxt b/tensorflow/core/api_def/base_api/api_def_MatchingFiles.pbtxt
index 8da76684e5d360dd642167100b04543e93beed0a..97fd39f6478edd87cdbeac0c08928a4c4d451ed5 100644
--- a/tensorflow/core/api_def/base_api/api_def_MatchingFiles.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_MatchingFiles.pbtxt
@@ -16,5 +16,6 @@ END
   description: <<END
 Note that this routine only supports wildcard characters in the
 basename portion of the pattern, not in the directory portion.
+Note also that the order of filenames returned can be non-deterministic.
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_PrependFromQueueAndPaddedBatchDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_PrependFromQueueAndPaddedBatchDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d4549340fac6d59cc994050e65f5a0016f2d52ab
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_PrependFromQueueAndPaddedBatchDataset.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "PrependFromQueueAndPaddedBatchDataset"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_QuantizedResizeBilinear.pbtxt b/tensorflow/core/api_def/base_api/api_def_QuantizedResizeBilinear.pbtxt
index 6b3ba72e530043746a33a9467594ed8fb49dd2e5..a08ed710b72709e3bed8e682d53076eb66ac783e 100644
--- a/tensorflow/core/api_def/base_api/api_def_QuantizedResizeBilinear.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_QuantizedResizeBilinear.pbtxt
@@ -23,9 +23,8 @@ END
   attr {
     name: "align_corners"
     description: <<END
-If true, rescale input by (new_height - 1) / (height - 1), which
-exactly aligns the 4 corners of images and resized images. If false, rescale
-by new_height / height. Treat similarly the width dimension.
+If true, the centers of the 4 corner pixels of the input and output tensors are
+aligned, preserving the values at the corner pixels. Defaults to false.
 END
   }
   summary: "Resize quantized `images` to `size` using quantized bilinear interpolation."
diff --git a/tensorflow/core/api_def/base_api/api_def_ResizeArea.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResizeArea.pbtxt
index 6dc321a54496ea3f91b7efe0d28c8596cd18fc1c..317ad263ccac28f654ad6e44baec9f99b882d455 100644
--- a/tensorflow/core/api_def/base_api/api_def_ResizeArea.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ResizeArea.pbtxt
@@ -23,9 +23,8 @@ END
   attr {
     name: "align_corners"
     description: <<END
-If true, rescale input by (new_height - 1) / (height - 1), which
-exactly aligns the 4 corners of images and resized images. If false, rescale
-by new_height / height. Treat similarly the width dimension.
+If true, the centers of the 4 corner pixels of the input and output tensors are
+aligned, preserving the values at the corner pixels. Defaults to false.
 END
   }
   summary: "Resize `images` to `size` using area interpolation."
diff --git a/tensorflow/core/api_def/base_api/api_def_ResizeBicubic.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResizeBicubic.pbtxt
index 06e645e3ee937f81e3b546d24250f1c1d6ad2680..d4f8233d25be656c0440b4547104b2bacd3d776e 100644
--- a/tensorflow/core/api_def/base_api/api_def_ResizeBicubic.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ResizeBicubic.pbtxt
@@ -23,9 +23,8 @@ END
   attr {
     name: "align_corners"
     description: <<END
-If true, rescale input by (new_height - 1) / (height - 1), which
-exactly aligns the 4 corners of images and resized images. If false, rescale
-by new_height / height. Treat similarly the width dimension.
+If true, the centers of the 4 corner pixels of the input and output tensors are
+aligned, preserving the values at the corner pixels. Defaults to false.
 END
   }
   summary: "Resize `images` to `size` using bicubic interpolation."
diff --git a/tensorflow/core/api_def/base_api/api_def_ResizeBicubicGrad.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResizeBicubicGrad.pbtxt
index bf5201d82e4b7b6fb463e17138e92f3033c6992e..eeb0680ab8f88b6f8543d06c30ab5dadc42f5545 100644
--- a/tensorflow/core/api_def/base_api/api_def_ResizeBicubicGrad.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ResizeBicubicGrad.pbtxt
@@ -25,9 +25,8 @@ END
   attr {
     name: "align_corners"
     description: <<END
-If true, rescale grads by (orig_height - 1) / (height - 1), which
-exactly aligns the 4 corners of grads and original_image. If false, rescale by
-orig_height / height. Treat similarly the width dimension.
+If true, the centers of the 4 corner pixels of the input and grad tensors are
+aligned. Defaults to false.
 END
   }
   summary: "Computes the gradient of bicubic interpolation."
diff --git a/tensorflow/core/api_def/base_api/api_def_ResizeBilinear.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResizeBilinear.pbtxt
index 0768e437fa00a9adeec00498e968986125602822..0673baa703946b40d9294e1f381ad162c06091d9 100644
--- a/tensorflow/core/api_def/base_api/api_def_ResizeBilinear.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ResizeBilinear.pbtxt
@@ -23,9 +23,8 @@ END
   attr {
     name: "align_corners"
     description: <<END
-If true, rescale input by (new_height - 1) / (height - 1), which
-exactly aligns the 4 corners of images and resized images. If false, rescale
-by new_height / height. Treat similarly the width dimension.
+If true, the centers of the 4 corner pixels of the input and output tensors are
+aligned, preserving the values at the corner pixels. Defaults to false.
 END
   }
   summary: "Resize `images` to `size` using bilinear interpolation."
diff --git a/tensorflow/core/api_def/base_api/api_def_ResizeBilinearGrad.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResizeBilinearGrad.pbtxt
index fba64203c236399e79a051206e936ec3ebb27b14..9a1a5fb69a5bfd0d537ddeede2f5af3856856f9b 100644
--- a/tensorflow/core/api_def/base_api/api_def_ResizeBilinearGrad.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ResizeBilinearGrad.pbtxt
@@ -25,9 +25,8 @@ END
   attr {
     name: "align_corners"
     description: <<END
-If true, rescale grads by (orig_height - 1) / (height - 1), which
-exactly aligns the 4 corners of grads and original_image. If false, rescale by
-orig_height / height. Treat similarly the width dimension.
+If true, the centers of the 4 corner pixels of the input and grad tensors are
+aligned. Defaults to false.
 END
   }
   summary: "Computes the gradient of bilinear interpolation."
diff --git a/tensorflow/core/api_def/base_api/api_def_ResizeNearestNeighbor.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResizeNearestNeighbor.pbtxt
index a74db4c9dc340b90817567751da110ef8989850f..e6f8dc1941be24d457bb455f8c20131fe84d173c 100644
--- a/tensorflow/core/api_def/base_api/api_def_ResizeNearestNeighbor.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ResizeNearestNeighbor.pbtxt
@@ -23,9 +23,8 @@ END
   attr {
     name: "align_corners"
     description: <<END
-If true, rescale input by (new_height - 1) / (height - 1), which
-exactly aligns the 4 corners of images and resized images. If false, rescale
-by new_height / height. Treat similarly the width dimension.
+If true, the centers of the 4 corner pixels of the input and output tensors are
+aligned, preserving the values at the corner pixels. Defaults to false.
 END
   }
   summary: "Resize `images` to `size` using nearest neighbor interpolation."
diff --git a/tensorflow/core/api_def/base_api/api_def_ResizeNearestNeighborGrad.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResizeNearestNeighborGrad.pbtxt
index 4ef1547eb4fab02392bc2b98a21ef01340b621f3..8d52ca833479d63dc70884b8a6cc06762d16edb7 100644
--- a/tensorflow/core/api_def/base_api/api_def_ResizeNearestNeighborGrad.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ResizeNearestNeighborGrad.pbtxt
@@ -24,9 +24,8 @@ END
   attr {
     name: "align_corners"
     description: <<END
-If true, rescale grads by (orig_height - 1) / (height - 1), which
-exactly aligns the 4 corners of grads and original_image. If false, rescale by
-orig_height / height. Treat similarly the width dimension.
+If true, the centers of the 4 corner pixels of the input and grad tensors are
+aligned. Defaults to false.
 END
   }
   summary: "Computes the gradient of nearest neighbor interpolation."
diff --git a/tensorflow/core/api_def/base_api/api_def_Roll.pbtxt b/tensorflow/core/api_def/base_api/api_def_Roll.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b308ad1f9d2f9d500cec4314b32b87541fe2348f
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Roll.pbtxt
@@ -0,0 +1,52 @@
+op {
+  graph_op_name: "Roll"
+  in_arg {
+    name: "shift"
+    description: <<END
+Dimension must be 0-D or 1-D. `shift[i]` specifies the number of places by which
+elements are shifted positively (towards larger indices) along the dimension
+specified by `axis[i]`. Negative shifts will roll the elements in the opposite
+direction.
+END
+  }
+  in_arg {
+    name: "axis"
+    description: <<END
+Dimension must be 0-D or 1-D. `axis[i]` specifies the dimension that the shift
+`shift[i]` should occur. If the same axis is referenced more than once, the
+total shift for that axis will be the sum of all the shifts that belong to that
+axis.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+Has the same shape and size as the input. The elements are shifted
+positively (towards larger indices) by the offsets of `shift` along the
+dimensions of `axis`.
+END
+  }
+  summary: "Rolls the elements of a tensor along an axis."
+  description: <<END
+The elements are shifted positively (towards larger indices) by the offset of
+`shift` along the dimension of `axis`. Negative `shift` values will shift
+elements in the opposite direction. Elements that roll passed the last position
+will wrap around to the first and vice versa. Multiple shifts along multiple
+axes may be specified.
+
+For example:
+
+```
+# 't' is [0, 1, 2, 3, 4]
+roll(t, shift=2, axis=0) ==> [3, 4, 0, 1, 2]
+
+# shifting along multiple dimensions
+# 't' is [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]
+roll(t, shift=[1, -2], axis=[0, 1]) ==> [[7, 8, 9, 5, 6], [2, 3, 4, 0, 1]]
+
+# shifting along the same axis multiple times
+# 't' is [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]
+roll(t, shift=[2, -3], axis=[1, 1]) ==> [[1, 2, 3, 4, 0], [6, 7, 8, 9, 5]]
+```
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_UnravelIndex.pbtxt b/tensorflow/core/api_def/base_api/api_def_UnravelIndex.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..97c380700a2a951a46a95f5b8abe432e759a32bf
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_UnravelIndex.pbtxt
@@ -0,0 +1,32 @@
+op {
+  graph_op_name: "UnravelIndex"
+  in_arg {
+    name: "indices"
+    description: <<END
+An 0-D or 1-D `int` Tensor whose elements are indices into the
+flattened version of an array of dimensions dims.
+END
+  }
+  in_arg {
+    name: "dims"
+    description: <<END
+An 1-D `int` Tensor. The shape of the array to use for unraveling
+indices.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+An 2-D (or 1-D if indices is 0-D) tensor where each row has the
+same shape as the indices array.
+END
+  }
+  summary: "Converts a flat index or array of flat indices into a tuple of"
+  description: <<END
+coordinate arrays.
+
+@compatibility(numpy)
+Equivalent to np.unravel_index
+@end_compatibility
+END
+}
diff --git a/tensorflow/core/common_runtime/accumulate_n_optimizer.cc b/tensorflow/core/common_runtime/accumulate_n_optimizer.cc
index a1e3b21e4f2d6af1b7e3c68d82a77f96bd34e613..832a55f2556f46efe6a94fb62d0420330917faac 100644
--- a/tensorflow/core/common_runtime/accumulate_n_optimizer.cc
+++ b/tensorflow/core/common_runtime/accumulate_n_optimizer.cc
@@ -13,11 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-
 #include "tensorflow/core/common_runtime/optimization_registry.h"
 #include "tensorflow/core/graph/node_builder.h"
 
-
 namespace tensorflow {
 namespace {
 
@@ -44,7 +42,6 @@ Tensor make_zeros(const DataType& dtype, const TensorShapeProto& shape) {
 // third-party libraries aren't currently supported.
 class AccumulateNV2RemovePass : public GraphOptimizationPass {
  public:
-
   Status Run(const GraphOptimizationPassOptions& options) override {
     // TODO(freiss.oss@gmail.com): Substantial shared code with
     // ParallelConcatRemovePass::Run(). Consider refactoring if someone makes
diff --git a/tensorflow/core/common_runtime/bfc_allocator.h b/tensorflow/core/common_runtime/bfc_allocator.h
index 3dd011a58e4724a8db34703ec68055c3a3a26fa3..93539977538a3be450e1d747561f9c02384e86d0 100644
--- a/tensorflow/core/common_runtime/bfc_allocator.h
+++ b/tensorflow/core/common_runtime/bfc_allocator.h
@@ -127,10 +127,10 @@ class BFCAllocator : public VisitableAllocator {
     string DebugString(BFCAllocator* a,
                        bool recurse) NO_THREAD_SAFETY_ANALYSIS {
       string dbg;
-      strings::StrAppend(&dbg, "  Size: ", strings::HumanReadableNumBytes(size),
-                         " | Requested Size: ",
-                         strings::HumanReadableNumBytes(requested_size),
-                         " | in_use: ", in_use());
+      strings::StrAppend(
+          &dbg, "  Size: ", strings::HumanReadableNumBytes(size),
+          " | Requested Size: ", strings::HumanReadableNumBytes(requested_size),
+          " | in_use: ", in_use());
       if (recurse && prev != BFCAllocator::kInvalidChunkHandle) {
         Chunk* p = a->ChunkFromHandle(prev);
         strings::StrAppend(&dbg, ", prev: ", p->DebugString(a, false));
diff --git a/tensorflow/core/common_runtime/device_set_test.cc b/tensorflow/core/common_runtime/device_set_test.cc
index 0507076c8c3734083ac0ef7ffea0edebf180ad1a..fd9c4222a7afd4914415c9c62e1ced118ea75d1f 100644
--- a/tensorflow/core/common_runtime/device_set_test.cc
+++ b/tensorflow/core/common_runtime/device_set_test.cc
@@ -88,7 +88,9 @@ TEST_F(DeviceSetTest, PrioritizedDeviceTypeList) {
   // D3 is prioritized below D1.
   AddDevice("d3", "/job:a/replica:0/task:0/device:d3:0");
   EXPECT_EQ((std::vector<DeviceType>{
-                DeviceType("d2"), DeviceType("d1"), DeviceType("d3"),
+                DeviceType("d2"),
+                DeviceType("d1"),
+                DeviceType("d3"),
             }),
             types());
 }
diff --git a/tensorflow/core/common_runtime/direct_session.cc b/tensorflow/core/common_runtime/direct_session.cc
index e9bdd922bae74cab225551d9c57febe158b0f0cd..df6f4b88773fb1a72100d1c223276a06b857a908 100644
--- a/tensorflow/core/common_runtime/direct_session.cc
+++ b/tensorflow/core/common_runtime/direct_session.cc
@@ -61,7 +61,6 @@ limitations under the License.
 #include "tensorflow/core/util/device_name_utils.h"
 #include "tensorflow/core/util/env_var.h"
 
-
 namespace tensorflow {
 
 namespace {
@@ -472,9 +471,9 @@ Status DirectSession::Run(const RunOptions& run_options,
   Executor::Args args;
   args.step_id = step_id_counter_.fetch_add(1);
 
-  TF_RETURN_IF_ERROR(
-      GetOrCreateExecutors(input_tensor_names, output_names, target_nodes,
-                           &executors_and_keys, &run_state_args));
+  TF_RETURN_IF_ERROR(GetOrCreateExecutors(input_tensor_names, output_names,
+                                          target_nodes, &executors_and_keys,
+                                          &run_state_args));
   const int64 executor_step_count = executors_and_keys->step_count.fetch_add(1);
 
   std::unique_ptr<DebuggerStateInterface> debugger_state;
@@ -1143,8 +1142,8 @@ Status DirectSession::GetOrCreateExecutors(
     options.debug_options = run_state_args->debug_options;
   }
 
-  std::shared_ptr<ExecutorsAndKeys> ek(new ExecutorsAndKeys);
   std::unique_ptr<FunctionInfo> func_info(new FunctionInfo);
+  std::shared_ptr<ExecutorsAndKeys> ek(new ExecutorsAndKeys);
 
   // The executor_lock_ is intentionally released while executor is
   // being created.
diff --git a/tensorflow/core/common_runtime/direct_session_test.cc b/tensorflow/core/common_runtime/direct_session_test.cc
index 99b33e2ef0d532aca08dfb538857d347d22a7351..b75a4f76d94f704cf38a6c4657b6089a863c085f 100644
--- a/tensorflow/core/common_runtime/direct_session_test.cc
+++ b/tensorflow/core/common_runtime/direct_session_test.cc
@@ -436,10 +436,7 @@ TEST(DirectSessionTest, FetchMultipleTimes) {
   }
 }
 
-REGISTER_OP("Darth")
-    .Input("x: float")
-    .Output("y: float")
-    .Doc(R"doc(
+REGISTER_OP("Darth").Input("x: float").Output("y: float").Doc(R"doc(
 Darth promises one return value.
 
 x: float
@@ -972,39 +969,38 @@ static void TestSessionInterOpThreadsImpl(bool use_function_lib,
 
   std::atomic<int32> num_done(0);
   // Runs session to compute <node>:0 using inter_op thread pool <pool>.
-  auto add_session_run_call = [use_global_pools, &def, &options, &sessions,
-                               &sessions_mu,
-                               &num_done](thread::ThreadPool* tp, Node* node,
-                                          int inter_op_pool) {
-    auto fn = [use_global_pools, &def, &options, &sessions, &sessions_mu,
-               inter_op_pool, node, &num_done]() {
-      RunOptions run_options;
-      run_options.set_inter_op_thread_pool(inter_op_pool);
-      std::vector<Tensor> outputs;
-
-      Session* session;
-      if (use_global_pools) {
-        std::unique_ptr<Session> s(NewSession(options));
-        TF_ASSERT_OK(s->Create(def));
-        session = s.get();
-
-        mutex_lock l(sessions_mu);
-        sessions.emplace_back(std::move(s));
-      } else {
-        session = sessions[0].get();
-      }
+  auto add_session_run_call =
+      [use_global_pools, &def, &options, &sessions, &sessions_mu, &num_done](
+          thread::ThreadPool* tp, Node* node, int inter_op_pool) {
+        auto fn = [use_global_pools, &def, &options, &sessions, &sessions_mu,
+                   inter_op_pool, node, &num_done]() {
+          RunOptions run_options;
+          run_options.set_inter_op_thread_pool(inter_op_pool);
+          std::vector<Tensor> outputs;
+
+          Session* session;
+          if (use_global_pools) {
+            std::unique_ptr<Session> s(NewSession(options));
+            TF_ASSERT_OK(s->Create(def));
+            session = s.get();
+
+            mutex_lock l(sessions_mu);
+            sessions.emplace_back(std::move(s));
+          } else {
+            session = sessions[0].get();
+          }
 
-      Status s = session->Run(run_options, {} /* inputs */,
-                              {node->name() + ":0"} /* output_names */, {},
-                              &outputs, nullptr /* run_metadata */);
-      TF_CHECK_OK(s);
-      ASSERT_EQ(1, outputs.size());
-      auto flat = outputs[0].flat<float>();
-      EXPECT_FLOAT_EQ(1.2, flat(0));
-      num_done.fetch_add(1);
-    };
-    tp->Schedule(fn);
-  };
+          Status s = session->Run(run_options, {} /* inputs */,
+                                  {node->name() + ":0"} /* output_names */, {},
+                                  &outputs, nullptr /* run_metadata */);
+          TF_CHECK_OK(s);
+          ASSERT_EQ(1, outputs.size());
+          auto flat = outputs[0].flat<float>();
+          EXPECT_FLOAT_EQ(1.2, flat(0));
+          num_done.fetch_add(1);
+        };
+        tp->Schedule(fn);
+      };
 
   // For blocking states:
   // - Starts at 0, BlockingOp::Compute will move to 1.
diff --git a/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc b/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc
index df9cf0c91f1b7e5521061b6915fc1b7ed609e003..31fb128f937ae46eefb309fc9bab8167e54846a7 100644
--- a/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc
+++ b/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc
@@ -161,14 +161,14 @@ static void TestHWAccelerator(bool enableHWTrace) {
   x->set_assigned_device_name("/job:localhost/replica:0/task:0/device:GPU:0");
 #ifdef TENSORFLOW_USE_SYCL
   x->set_assigned_device_name("/job:localhost/replica:0/task:0/device:SYCL:0");
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
   // y = A * x
   Node* y = test::graph::Matmul(&graph, a, x, false, false);
   y->set_assigned_device_name("/job:localhost/replica:0/task:0/device:GPU:0");
 #ifdef TENSORFLOW_USE_SYCL
-y->set_assigned_device_name("/job:localhost/replica:0/task:0/device:SYCL:0");
-#endif // TENSORFLOW_USE_SYCL
+  y->set_assigned_device_name("/job:localhost/replica:0/task:0/device:SYCL:0");
+#endif  // TENSORFLOW_USE_SYCL
 
   Node* y_neg = test::graph::Unary(&graph, "Neg", y);
   y_neg->set_assigned_device_name("/job:localhost/replica:0/task:0/cpu:0");
@@ -181,7 +181,7 @@ y->set_assigned_device_name("/job:localhost/replica:0/task:0/device:SYCL:0");
   (*options.config.mutable_device_count())["GPU"] = 1;
 #ifdef TENSORFLOW_USE_SYCL
   (*options.config.mutable_device_count())["SYCL"] = 1;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
   options.config.set_allow_soft_placement(true);
   options.config.mutable_graph_options()->set_build_cost_model(1);
   std::unique_ptr<Session> session(NewSession(options));
diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc
index 9d03caff1e1e89c4c667f94853352580545e70e5..f515590b283c049d4b463feff837d39f1433037c 100644
--- a/tensorflow/core/common_runtime/executor.cc
+++ b/tensorflow/core/common_runtime/executor.cc
@@ -1609,7 +1609,7 @@ void ExecutorState::Process(TaggedNode tagged_node, int64 scheduled_usec) {
         auto done = [this, state]() {
           Device* device = impl_->params_.device;
           NodeExecStatsWrapper* stats = state->stats;  // Shorthand
-          Entry* first_input = state->first_input;  // Shorthand
+          Entry* first_input = state->first_input;     // Shorthand
 
           nodestats::SetOpEnd(stats);
           EntryVector outputs;
diff --git a/tensorflow/core/common_runtime/function.cc b/tensorflow/core/common_runtime/function.cc
index e9c4328f29e2c941afd8e14142beb0db224110d8..150fb85c7078c3cd243f5ab91752e9760cc2ccd5 100644
--- a/tensorflow/core/common_runtime/function.cc
+++ b/tensorflow/core/common_runtime/function.cc
@@ -205,7 +205,7 @@ class FunctionLibraryRuntimeImpl : public FunctionLibraryRuntime {
   // The instantiated and transformed function is encoded as a Graph
   // object, and an executor is created for the graph.
   struct Item : public core::RefCounted {
-    const Graph* graph = nullptr;  // Owned by exec.
+    const Graph* graph = nullptr;                            // Owned by exec.
     const FunctionLibraryDefinition* overlay_lib = nullptr;  // Not owned.
     FunctionBody* func_graph = nullptr;
     Executor* exec = nullptr;
diff --git a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc
index 9e4b617d2bd5b070f5b8bdeedabb15b94d212743..67caeb3495c6b0600f12c9b20ef73ee90f8b3e0d 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc
@@ -154,8 +154,9 @@ TEST(GPUBFCAllocatorTest, ExerciseCoalescing) {
     a.DeallocateRaw(t3);
     a.DeallocateRaw(t4);
   }
-  CheckStats(&a, 4097, 0, 1024 * sizeof(float) + 1048576 * sizeof(int64) +
-                              2048 * sizeof(double) + 10485760 * sizeof(float),
+  CheckStats(&a, 4097, 0,
+             1024 * sizeof(float) + 1048576 * sizeof(int64) +
+                 2048 * sizeof(double) + 10485760 * sizeof(float),
              10485760 * sizeof(float));
 
   // At the end, we should have coalesced all memory into one region
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.cc b/tensorflow/core/common_runtime/gpu/gpu_device.cc
index 0e5b6b7ef87f67bcb0b46d6e0acec82f8612b80f..80a5bdbfff4ddfc40eb6beba619cd97c308b04c9 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.cc
@@ -762,9 +762,11 @@ int64 MinSystemMemory(int64 available_memory) {
   // is necessary.
   min_system_memory *= 2;
 #endif
-#if defined(NVIDIA_TEGRA)
-  // 1GB system mem for NVIDIA Tegra devices since they use the same mem for RAM and Video RAM
-  min_system_memory = 1<<30;
+
+#if defined(ANDROID_TEGRA)
+  // 1GB system mem for NVIDIA Tegra devices since they use the same mem for RAM
+  // and Video RAM
+  min_system_memory = 1 << 30;
 #endif
   return min_system_memory;
 }
diff --git a/tensorflow/core/common_runtime/gpu/gpu_stream_util_test.cc b/tensorflow/core/common_runtime/gpu/gpu_stream_util_test.cc
index 7763a4f2e6f50292e78b4d16d8d4a3ee84d4163b..2500425359c424fa479af6dd34d6a0312c404577 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_stream_util_test.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_stream_util_test.cc
@@ -108,7 +108,8 @@ TEST_F(GpuStreamUtilTest, StreamOverrides) {
   ops::_Recv(root.WithOpName("input"), DT_FLOAT, "input", "/cpu:0", 0,
              "/device:GPU:0");
   Output n = ops::MatMul(root, {}, {});
-  ops::_Send(root.WithOpName("output"), n, "output", "/device:GPU:0", 0, "/cpu:0");
+  ops::_Send(root.WithOpName("output"), n, "output", "/device:GPU:0", 0,
+             "/cpu:0");
   Graph g(OpRegistry::Global());
   TF_ASSERT_OK(root.ToGraph(&g));
 
diff --git a/tensorflow/core/common_runtime/gpu/process_state.cc b/tensorflow/core/common_runtime/gpu/process_state.cc
index 995fd1253fb9f352742410199174b8567e92351b..b195de7cbace095cfb29fa2adf1ee5f44853cab5 100644
--- a/tensorflow/core/common_runtime/gpu/process_state.cc
+++ b/tensorflow/core/common_runtime/gpu/process_state.cc
@@ -88,8 +88,8 @@ ProcessState::~ProcessState() {
 }
 
 string ProcessState::MemDesc::DebugString() {
-  return strings::StrCat((loc == CPU ? "CPU " : "GPU "), dev_index, ", dma: ",
-                         gpu_registered, ", nic: ", nic_registered);
+  return strings::StrCat((loc == CPU ? "CPU " : "GPU "), dev_index,
+                         ", dma: ", gpu_registered, ", nic: ", nic_registered);
 }
 
 ProcessState::MemDesc ProcessState::PtrType(const void* ptr) {
@@ -230,8 +230,24 @@ Allocator* ProcessState::GetCUDAHostAllocator(int numa_node) {
   // TODO(tucker): actually maintain separate CPUAllocators for
   // different numa_nodes.  For now, just one.
   numa_node = 0;
-  mutex_lock lock(mu_);
 
+  {
+    // Here we optimize the most common use case where cuda_host_allocators_
+    // and cuda_al_ have already been populated and since we're only reading
+    // these vectors, we can get by with a shared lock. In the slower case,
+    // we take a unique lock and populate these vectors.
+    tf_shared_lock lock(mu_);
+
+    if (FLAGS_brain_gpu_record_mem_types &&
+        static_cast<int>(cuda_al_.size()) > 0) {
+      return cuda_al_[0];
+    }
+    if (static_cast<int>(cuda_host_allocators_.size()) > numa_node) {
+      return cuda_host_allocators_[0];
+    }
+  }
+
+  mutex_lock lock(mu_);
   // Find the first valid StreamExecutor to request CUDA host memory
   // through, since any will work.
   //
diff --git a/tensorflow/core/common_runtime/graph_execution_state.cc b/tensorflow/core/common_runtime/graph_execution_state.cc
index 3b309e915cdd2c6d5eead9ed0312f3873bcf7335..33a5d60eb7ec4de829d3c0784f909ef42cf994d1 100644
--- a/tensorflow/core/common_runtime/graph_execution_state.cc
+++ b/tensorflow/core/common_runtime/graph_execution_state.cc
@@ -340,8 +340,11 @@ Status GraphExecutionState::OptimizeGraph(
     std::unordered_map<string, DeviceProperties> device_map;
     Device* cpu_device = nullptr;
     for (const auto& device : device_set_->devices()) {
-      device_map[device->name()] =
-          grappler::GetDeviceInfo(device->parsed_name());
+      DeviceProperties props = grappler::GetDeviceInfo(device->parsed_name());
+      if (props.type() == "UNKNOWN") {
+        continue;
+      }
+      device_map[device->name()] = props;
       if (device->parsed_name().id == 0 &&
           StringPiece(device->parsed_name().type) == "CPU" &&
           device->GetAllocator(AllocatorAttributes()) != nullptr) {
diff --git a/tensorflow/core/common_runtime/graph_execution_state.h b/tensorflow/core/common_runtime/graph_execution_state.h
index db2686ce2c45aa4c9997a624bb12720d63710b65..2312e1a89fd1fd5734fab4316c25ca2e39f16ae5 100644
--- a/tensorflow/core/common_runtime/graph_execution_state.h
+++ b/tensorflow/core/common_runtime/graph_execution_state.h
@@ -139,9 +139,7 @@ class GraphExecutionState {
 
   // The graph returned by BuildGraph may contain only the pruned
   // graph, whereas some clients may want access to the full graph.
-  const Graph* full_graph() {
-    return graph_;
-  }
+  const Graph* full_graph() { return graph_; }
 
   // Returns the node with the given name, or null if it does not exist.
   const Node* get_node_by_name(const string& name) const {
diff --git a/tensorflow/core/common_runtime/memory_types.cc b/tensorflow/core/common_runtime/memory_types.cc
index 76b926ba40053288360f0e4e6fe2a37bd44ff0b4..090a16ebeb10007261666aeb6491a1785dd2e5c4 100644
--- a/tensorflow/core/common_runtime/memory_types.cc
+++ b/tensorflow/core/common_runtime/memory_types.cc
@@ -47,7 +47,7 @@ struct EndpointEq {
 static Status ProcessMemoryTypes(
     const DeviceType& device_type, const Graph* g,
     const std::function<Status(const Edge*, MemoryType, MemoryType)>& fn) {
-  if (device_type != DEVICE_GPU && device_type != DEVICE_SYCL ) {
+  if (device_type != DEVICE_GPU && device_type != DEVICE_SYCL) {
     // On non-GPU and non-SYCL devices, HOST_MEMORY and DEVICE_MEMORY are always
     // compatible.
     return Status::OK();
diff --git a/tensorflow/core/common_runtime/memory_types_test.cc b/tensorflow/core/common_runtime/memory_types_test.cc
index 2a834ddca4236c626c6252f63c97118e8e1f0bd0..a093585571994e8b161b46a7fc397cdc3cd4254c 100644
--- a/tensorflow/core/common_runtime/memory_types_test.cc
+++ b/tensorflow/core/common_runtime/memory_types_test.cc
@@ -36,7 +36,7 @@ TEST(MemoryTypeChecker, Int32OK) {
 #endif  // GOOGLE_CUDA
 #ifdef TENSORFLOW_USE_SYCL
   TF_EXPECT_OK(ValidateMemoryTypes(DEVICE_SYCL, g));
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
   delete g;
 }
 
@@ -64,7 +64,7 @@ TEST(MemoryTypeChecker, Int32NotOk) {
   // But we can insert _HostSend/_HostRecv to ensure the invariant.
   TF_EXPECT_OK(EnsureMemoryTypes(DEVICE_SYCL, "/device:SYCL:0", g));
   TF_EXPECT_OK(ValidateMemoryTypes(DEVICE_SYCL, g));
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
   delete g;
 }
 
@@ -91,7 +91,7 @@ TEST(MemoryTypeChecker, MemoryTypeForOutput) {
   TF_EXPECT_OK(MemoryTypeForOutput(DEVICE_SYCL, g, si, 0, &memory_type));
   // int Switch's output on GPU has HOST_MEMORY constraint.
   EXPECT_EQ(memory_type, HOST_MEMORY);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
   delete g;
 }
 
diff --git a/tensorflow/core/common_runtime/placer.h b/tensorflow/core/common_runtime/placer.h
index c5b76592e1b4b86863009ef42b7bb7106377d054..75dce7c7feb2269fc994cbb8c5efd4b3799e75dd 100644
--- a/tensorflow/core/common_runtime/placer.h
+++ b/tensorflow/core/common_runtime/placer.h
@@ -88,9 +88,9 @@ class Placer {
   void AssignAndLog(int assigned_device, Node* node) const;
   void LogDeviceAssignment(const Node* node) const;
 
-  Graph* const graph_;                           // Not owned.
-  const DeviceSet* const devices_;               // Not owned.
-  const SessionOptions* options_;                // Not owned.
+  Graph* const graph_;              // Not owned.
+  const DeviceSet* const devices_;  // Not owned.
+  const SessionOptions* options_;   // Not owned.
   const bool log_device_placement_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(Placer);
diff --git a/tensorflow/core/common_runtime/placer_test.cc b/tensorflow/core/common_runtime/placer_test.cc
index 5d87b1e279ab0390a642df8f285fd451803ba29a..02c9cd5313ee24c83243d27baf688341451996c5 100644
--- a/tensorflow/core/common_runtime/placer_test.cc
+++ b/tensorflow/core/common_runtime/placer_test.cc
@@ -619,9 +619,9 @@ TEST_F(PlacerTest, TestReferenceConnectionIgnoreInfeasible) {
     Node* input = ops::SourceOp(
         "TestDevice",
         b.opts().WithName("in").WithDevice("/job:a/task:0/device:fakegpu:0"));
-    Node* var = ops::SourceOp("TestVariable",
-                              b.opts().WithName("var_0").WithDevice(
-                                  "/job:a/task:0/device:fakegpu:0"));
+    Node* var =
+        ops::SourceOp("TestVariable", b.opts().WithName("var_0").WithDevice(
+                                          "/job:a/task:0/device:fakegpu:0"));
 
     // This op is specified on CPU, but in practice will be ignored,
     // because the reference edges forces it on GPU.
diff --git a/tensorflow/core/common_runtime/process_function_library_runtime.cc b/tensorflow/core/common_runtime/process_function_library_runtime.cc
index 12947e284a36fef171caf6af0c46d59ca89efb61..dd4bf6a34574fabf81d0c3c1dac2519d759599f5 100644
--- a/tensorflow/core/common_runtime/process_function_library_runtime.cc
+++ b/tensorflow/core/common_runtime/process_function_library_runtime.cc
@@ -158,7 +158,7 @@ Status ProcessFunctionLibraryRuntime::GetDeviceContext(
 }
 
 FunctionLibraryRuntime* ProcessFunctionLibraryRuntime::GetFLR(
-    const string& device_name) {
+    const string& device_name) const {
   Device* device = nullptr;
   if (device_name != kDefaultFLRDevice) {
     if (!device_mgr_->LookupDevice(device_name, &device).ok()) {
diff --git a/tensorflow/core/common_runtime/process_function_library_runtime.h b/tensorflow/core/common_runtime/process_function_library_runtime.h
index a1adc4b6b35950339b727774c45014ef71839554..9c9c92f1ea026023b01fe4b719943245d3060b37 100644
--- a/tensorflow/core/common_runtime/process_function_library_runtime.h
+++ b/tensorflow/core/common_runtime/process_function_library_runtime.h
@@ -85,7 +85,7 @@ class ProcessFunctionLibraryRuntime {
 
   static const char kDefaultFLRDevice[];
   // Returns the FunctionLibraryRuntime for the corresponding device_name.
-  FunctionLibraryRuntime* GetFLR(const string& device_name);
+  FunctionLibraryRuntime* GetFLR(const string& device_name) const;
 
   // Returns the device incarnation for the given device_name.
   Status GetDeviceIncarnation(const string& device_name, int64* incarnation);
diff --git a/tensorflow/core/common_runtime/session_factory.cc b/tensorflow/core/common_runtime/session_factory.cc
index 0234d4c37250d8ed3c645759dd17f94093e57df0..4dbe113e44ee0b7a6eba44ace3c1ff8daa17059f 100644
--- a/tensorflow/core/common_runtime/session_factory.cc
+++ b/tensorflow/core/common_runtime/session_factory.cc
@@ -60,8 +60,8 @@ const string RegisteredFactoriesErrorMessageLocked() {
                          str_util::Join(factory_types, ", "), "}.");
 }
 string SessionOptionsToString(const SessionOptions& options) {
-  return strings::StrCat("target: \"", options.target, "\" config: ",
-                         ProtoShortDebugString(options.config));
+  return strings::StrCat("target: \"", options.target,
+                         "\" config: ", ProtoShortDebugString(options.config));
 }
 }  // namespace
 
diff --git a/tensorflow/core/common_runtime/step_stats_collector.cc b/tensorflow/core/common_runtime/step_stats_collector.cc
index d7e01144c9ef3aa09ddd212947eafe48ccff555b..cb900db10af98496cfdfafa5a38296bfdc4e996b 100644
--- a/tensorflow/core/common_runtime/step_stats_collector.cc
+++ b/tensorflow/core/common_runtime/step_stats_collector.cc
@@ -226,22 +226,23 @@ void StepStatsCollector::BuildCostModel(
       if (node) {
         for (int i = 0; i < stats.output_size(); ++i) {
           const auto& output = stats.output(i);
-          cm->RecordMaxMemorySize(node, i, Bytes(output.tensor_description()
-                                                     .allocation_description()
-                                                     .allocated_bytes()),
+          cm->RecordMaxMemorySize(node, i,
+                                  Bytes(output.tensor_description()
+                                            .allocation_description()
+                                            .allocated_bytes()),
                                   stats.output(i).tensor_description().shape(),
                                   node->output_types()[i]);
-          cm->RecordAllocationId(node, i, output.tensor_description()
-                                              .allocation_description()
-                                              .allocation_id());
+          cm->RecordAllocationId(node, i,
+                                 output.tensor_description()
+                                     .allocation_description()
+                                     .allocation_id());
         }
         cm->RecordMemoryStats(node, stats.memory_stats());
         // Use hardware stats to record the execution time if they're available,
         // otherwise use the regular (less accurate) stats
         string node_name = dev_stats.regular_stats->node_stats(i).node_name();
-        if (dev_stats.hardware_stats &&
-            name_to_hw_node_stats.find(node_name) !=
-                name_to_hw_node_stats.end()) {
+        if (dev_stats.hardware_stats && name_to_hw_node_stats.find(node_name) !=
+                                            name_to_hw_node_stats.end()) {
           const NodeExecStats& hw_stats = name_to_hw_node_stats[node_name];
           cm->RecordMaxExecutionTime(
               node, Microseconds(hw_stats.op_end_rel_micros()));
diff --git a/tensorflow/core/common_runtime/sycl/sycl_allocator.cc b/tensorflow/core/common_runtime/sycl/sycl_allocator.cc
index 9094824ee734a9398db5aca2a507af4acd07c26b..02bd8b8f3bc692728ce73176f6268d95f860dc9b 100644
--- a/tensorflow/core/common_runtime/sycl/sycl_allocator.cc
+++ b/tensorflow/core/common_runtime/sycl/sycl_allocator.cc
@@ -80,7 +80,7 @@ void SYCLAllocator::ClearStats() override {
 
 size_t SYCLAllocator::RequestedSize(void* ptr) {
   mutex_lock lock(mu_);
-  if(!sycl_device_) {
+  if (!sycl_device_) {
     return 0;
   }
   const auto& buffer = sycl_device_->get_sycl_buffer(ptr);
diff --git a/tensorflow/core/common_runtime/sycl/sycl_allocator.h b/tensorflow/core/common_runtime/sycl/sycl_allocator.h
index cca9f92c62e2a4f4d57c8a6111b53dccee505f93..550f1933322420fc97da2bb588c719c73ea5ae4d 100644
--- a/tensorflow/core/common_runtime/sycl/sycl_allocator.h
+++ b/tensorflow/core/common_runtime/sycl/sycl_allocator.h
@@ -20,10 +20,10 @@ limitations under the License.
 #ifndef TENSORFLOW_COMMON_RUNTIME_SYCL_SYCL_ALLOCATOR_H_
 #define TENSORFLOW_COMMON_RUNTIME_SYCL_SYCL_ALLOCATOR_H_
 
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/types.h"
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
 namespace tensorflow {
 
@@ -56,14 +56,13 @@ class SYCLAllocator : public Allocator {
   // Clear the SYCL device used by the Allocator
   void ClearSYCLDevice() {
     mutex_lock lock(mu_);
-    if(sycl_device_) {
+    if (sycl_device_) {
       delete sycl_device_;
       sycl_device_ = nullptr;
     }
   }
 
  private:
-
   mutable mutex mu_;
   Eigen::SyclDevice* sycl_device_ GUARDED_BY(mu_);  // owned
   AllocatorStats stats_ GUARDED_BY(mu_);
diff --git a/tensorflow/core/common_runtime/sycl/sycl_device.h b/tensorflow/core/common_runtime/sycl/sycl_device.h
index cc272d156ef67a4f4f93f35603ffe301d154932a..7c09e0b8f194c7dc8a594aa487ec62e00d5b5e39 100644
--- a/tensorflow/core/common_runtime/sycl/sycl_device.h
+++ b/tensorflow/core/common_runtime/sycl/sycl_device.h
@@ -187,9 +187,9 @@ class GSYCLInterface {
       type = "Unknown";
     }
 
-    return strings::StrCat("id: ", device_id, ", type: ", type, ", name: ",
-                           name.c_str(), ", vendor: ", vendor.c_str(),
-                           ", profile: ", profile.c_str());
+    return strings::StrCat(
+        "id: ", device_id, ", type: ", type, ", name: ", name.c_str(),
+        ", vendor: ", vendor.c_str(), ", profile: ", profile.c_str());
   }
 };
 
diff --git a/tensorflow/core/common_runtime/sycl/sycl_device_factory.cc b/tensorflow/core/common_runtime/sycl/sycl_device_factory.cc
index 19c14770dcad7a3ca045ccb4ff68189c943d8cff..14f7727659d91db2373a1ac8ad0e46258cc32fbe 100644
--- a/tensorflow/core/common_runtime/sycl/sycl_device_factory.cc
+++ b/tensorflow/core/common_runtime/sycl/sycl_device_factory.cc
@@ -26,7 +26,6 @@ class SYCLDeviceFactory : public DeviceFactory {
  public:
   Status CreateDevices(const SessionOptions &options, const string &name_prefix,
                        std::vector<Device *> *devices) override {
-
     auto syclInterface = GSYCLInterface::instance();
 
     size_t n = 1;
@@ -37,13 +36,11 @@ class SYCLDeviceFactory : public DeviceFactory {
 
     for (int i = 0; i < n; i++) {
       string name = strings::StrCat(name_prefix, "/device:SYCL:", i);
-      devices->push_back(
-          new SYCLDevice(options, name, Bytes(256 << 20), DeviceLocality()
-                         , syclInterface->GetShortDeviceDescription(i)
-                         , syclInterface->GetSYCLAllocator(i)
-                         , syclInterface->GetCPUAllocator(i)
-                         , syclInterface->GetSYCLContext(i))
-                       );
+      devices->push_back(new SYCLDevice(
+          options, name, Bytes(256 << 20), DeviceLocality(),
+          syclInterface->GetShortDeviceDescription(i),
+          syclInterface->GetSYCLAllocator(i), syclInterface->GetCPUAllocator(i),
+          syclInterface->GetSYCLContext(i)));
     }
 
     return Status::OK();
@@ -51,6 +48,6 @@ class SYCLDeviceFactory : public DeviceFactory {
 };
 
 REGISTER_LOCAL_DEVICE_FACTORY("SYCL", SYCLDeviceFactory, 200);
-}
+}  // namespace tensorflow
 
 #endif  // TENSORFLOW_USE_SYCL
diff --git a/tensorflow/core/common_runtime/sycl/sycl_util.h b/tensorflow/core/common_runtime/sycl/sycl_util.h
index 83016b706a57033bfdaec932f763bc118434db90..3124ed23c92eb542e90e6c077fc703fb84b38a18 100644
--- a/tensorflow/core/common_runtime/sycl/sycl_util.h
+++ b/tensorflow/core/common_runtime/sycl/sycl_util.h
@@ -20,8 +20,8 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_COMMON_RUNTIME_SYCL_SYCL_UTIL_H_
 #define TENSORFLOW_CORE_COMMON_RUNTIME_SYCL_SYCL_UTIL_H_
 
-#include "tensorflow/core/common_runtime/device.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/common_runtime/device.h"
 // For DMA helper
 #include "tensorflow/core/common_runtime/dma_helper.h"
 #include "tensorflow/core/framework/tensor.h"
diff --git a/tensorflow/core/debug/debug_gateway.cc b/tensorflow/core/debug/debug_gateway.cc
index 616ced3d0f3d9cfed683120e792b40eb9010fe06..2e1aabd1cc8066df6a5f7e6dd0aa27c6a16ef614 100644
--- a/tensorflow/core/debug/debug_gateway.cc
+++ b/tensorflow/core/debug/debug_gateway.cc
@@ -24,31 +24,31 @@ limitations under the License.
 namespace tensorflow {
 
 DebugGateway::DebugGateway(DirectSession* session) : session_(session) {
-  session_->node_outputs_callback_ = [this](
-      const string& node_name, const int output_slot, const Tensor* tensor,
-      const bool is_ref, OpKernelContext* ctx) {
-    if (comp_cb_ != nullptr && output_slot <= 0) {
-      // The node completion callback is invoked once for a node regardless
-      // of whether the node has zero, one or more outputs.
-      // The output_slot can be negative (-1, or kControlSlot) if
-      // node_outputs_callback_ is invoked for a node with no output. If that
-      // is the case, notify the callback that the node in question has no
-      // output.
-      comp_cb_(node_name, output_slot == 0);
-    }
-
-    // Copy tensor values (e.g., from GPU to host) only if the
-    // value callback is not nullptr.
-    if (val_cb_ != nullptr && output_slot >= 0) {
-      CopyTensor(
-          node_name, output_slot, tensor, ctx,
-          [this, node_name, output_slot, is_ref](const Tensor* copied_tensor) {
-            val_cb_(node_name, output_slot, *copied_tensor, is_ref);
-          });
-    }
-
-    return Status::OK();
-  };
+  session_->node_outputs_callback_ =
+      [this](const string& node_name, const int output_slot,
+             const Tensor* tensor, const bool is_ref, OpKernelContext* ctx) {
+        if (comp_cb_ != nullptr && output_slot <= 0) {
+          // The node completion callback is invoked once for a node regardless
+          // of whether the node has zero, one or more outputs.
+          // The output_slot can be negative (-1, or kControlSlot) if
+          // node_outputs_callback_ is invoked for a node with no output. If
+          // that is the case, notify the callback that the node in question has
+          // no output.
+          comp_cb_(node_name, output_slot == 0);
+        }
+
+        // Copy tensor values (e.g., from GPU to host) only if the
+        // value callback is not nullptr.
+        if (val_cb_ != nullptr && output_slot >= 0) {
+          CopyTensor(node_name, output_slot, tensor, ctx,
+                     [this, node_name, output_slot,
+                      is_ref](const Tensor* copied_tensor) {
+                       val_cb_(node_name, output_slot, *copied_tensor, is_ref);
+                     });
+        }
+
+        return Status::OK();
+      };
 }
 
 DebugGateway::~DebugGateway() {
@@ -86,7 +86,8 @@ void DebugGateway::CopyTensor(const string& node_name, const int output_slot,
     // Determine if the tensor is on device (GPU) or host (CPU).
     // The second part of the check is necessary because even an OpKernel on
     // may have output tensors allocated on CPU.
-    if ((device->name().find("GPU:") != string::npos || device->name().find("SYCL:") != string::npos) &&
+    if ((device->name().find("GPU:") != string::npos ||
+         device->name().find("SYCL:") != string::npos) &&
         !ctx->output_alloc_attr(output_slot).on_host()) {
       // GPU tensors: Copy it to host (CPU).
       DeviceContext* device_ctxt = ctx->op_device_context();
diff --git a/tensorflow/core/debug/debug_gateway_test.cc b/tensorflow/core/debug/debug_gateway_test.cc
index 57583349069a0b4deb137cb09564cdbb3909a4b0..b1bbd3f6980b16c13a1e5c9cd3a0f6c4bb8c1217 100644
--- a/tensorflow/core/debug/debug_gateway_test.cc
+++ b/tensorflow/core/debug/debug_gateway_test.cc
@@ -390,9 +390,9 @@ TEST_F(SessionDebugMinusAXTest,
   debug_gateway.SetNodeValueCallback(
       [this, &mu, &val_callback_count, &a_debug_identity_node_name,
        &x_debug_identity_node_name, &y_debug_identity_node_name,
-       &debug_identity_tensor_vals, &callbacks_done, &kConcurrentRuns](
-           const string& node_name, const int output_slot,
-           const Tensor& tensor_value, const bool is_ref) {
+       &debug_identity_tensor_vals, &callbacks_done,
+       &kConcurrentRuns](const string& node_name, const int output_slot,
+                         const Tensor& tensor_value, const bool is_ref) {
         mutex_lock l(mu);
 
         if (node_name == a_debug_identity_node_name && output_slot == 0) {
@@ -560,21 +560,21 @@ TEST_F(SessionDebugOutputSlotWithoutOutgoingEdgeTest,
   Notification callbacks_done;
 
   std::vector<Tensor> debug_identity_tensor_vals;
-  debug_gateway.SetNodeValueCallback([this, &mu, &callbacks_done,
-                                      &debug_identity_node_name,
-                                      &debug_identity_tensor_vals](
-      const string& node_name, const int output_slot,
-      const Tensor& tensor_value, const bool is_ref) {
-    mutex_lock l(mu);
+  debug_gateway.SetNodeValueCallback(
+      [this, &mu, &callbacks_done, &debug_identity_node_name,
+       &debug_identity_tensor_vals](
+          const string& node_name, const int output_slot,
+          const Tensor& tensor_value, const bool is_ref) {
+        mutex_lock l(mu);
 
-    if (node_name == debug_identity_node_name && output_slot == 0) {
-      debug_identity_tensor_vals.push_back(tensor_value);
+        if (node_name == debug_identity_node_name && output_slot == 0) {
+          debug_identity_tensor_vals.push_back(tensor_value);
 
-      if (!callbacks_done.HasBeenNotified()) {
-        callbacks_done.Notify();
-      }
-    }
-  });
+          if (!callbacks_done.HasBeenNotified()) {
+            callbacks_done.Notify();
+          }
+        }
+      });
 
   // Add DebugIdentity watch on c:0, which does not have an outgoing edge.
   RunOptions run_opts;
diff --git a/tensorflow/core/debug/debug_grpc_testlib.cc b/tensorflow/core/debug/debug_grpc_testlib.cc
index a312f789d8444360a0892faa4b3a0f9a0bdf7a32..f70931e926507c72287588da278a3b8d6bb19122 100644
--- a/tensorflow/core/debug/debug_grpc_testlib.cc
+++ b/tensorflow/core/debug/debug_grpc_testlib.cc
@@ -30,7 +30,7 @@ namespace test {
 
 ::grpc::Status TestEventListenerImpl::SendEvents(
     ::grpc::ServerContext* context,
-    ::grpc::ServerReaderWriter< ::tensorflow::EventReply, ::tensorflow::Event>*
+    ::grpc::ServerReaderWriter<::tensorflow::EventReply, ::tensorflow::Event>*
         stream) {
   Event event;
 
diff --git a/tensorflow/core/debug/debug_io_utils.cc b/tensorflow/core/debug/debug_io_utils.cc
index f81445c20bd2ba56a6d7d3bb4ddefc71f5199784..baa8c08fdf1508cd599d4c9523b06954280a609d 100644
--- a/tensorflow/core/debug/debug_io_utils.cc
+++ b/tensorflow/core/debug/debug_io_utils.cc
@@ -574,8 +574,6 @@ Status DebugIO::CloseDebugURL(const string& debug_url) {
   }
 }
 
-static Status CloseDebugURL(const string& debug_url) { return Status::OK(); }
-
 Status DebugFileIO::DumpTensorToDir(const DebugNodeKey& debug_node_key,
                                     const Tensor& tensor,
                                     const uint64 wall_time_us,
diff --git a/tensorflow/core/debug/debug_io_utils_test.cc b/tensorflow/core/debug/debug_io_utils_test.cc
index 2f83c2415b831cc1a2b90d4e6a2046218e6fe5f6..0807a85b8b39cf8bf479227bd6b6bd581e2ba9b0 100644
--- a/tensorflow/core/debug/debug_io_utils_test.cc
+++ b/tensorflow/core/debug/debug_io_utils_test.cc
@@ -57,7 +57,8 @@ class DebugIOUtilsTest : public ::testing::Test {
 TEST_F(DebugIOUtilsTest, ConstructDebugNodeKey) {
   DebugNodeKey debug_node_key("/job:worker/replica:1/task:0/device:GPU:2",
                               "hidden_1/MatMul", 0, "DebugIdentity");
-  EXPECT_EQ("/job:worker/replica:1/task:0/device:GPU:2", debug_node_key.device_name);
+  EXPECT_EQ("/job:worker/replica:1/task:0/device:GPU:2",
+            debug_node_key.device_name);
   EXPECT_EQ("hidden_1/MatMul", debug_node_key.node_name);
   EXPECT_EQ(0, debug_node_key.output_slot);
   EXPECT_EQ("DebugIdentity", debug_node_key.debug_op);
diff --git a/tensorflow/core/distributed_runtime/BUILD b/tensorflow/core/distributed_runtime/BUILD
index f4ee841032bf2b78b70fd446a6e4679bd9c943f1..9e152aa0823b67fceb7f103cc6e090f00870f88a 100644
--- a/tensorflow/core/distributed_runtime/BUILD
+++ b/tensorflow/core/distributed_runtime/BUILD
@@ -145,6 +145,7 @@ cc_library(
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:worker_proto_cc",
     ],
 )
 
diff --git a/tensorflow/core/distributed_runtime/graph_mgr.h b/tensorflow/core/distributed_runtime/graph_mgr.h
index d0ca2a625778ff73c6d40492cc5d02ec81ef3cc6..cc35264b8fe0b6decc325dab793c6a5fe6ad097f 100644
--- a/tensorflow/core/distributed_runtime/graph_mgr.h
+++ b/tensorflow/core/distributed_runtime/graph_mgr.h
@@ -140,7 +140,7 @@ class GraphMgr {
     GraphMgr* graph_mgr;
   };
 
-  const WorkerEnv* worker_env_;             // Not owned.
+  const WorkerEnv* worker_env_;  // Not owned.
   DeviceMgr* device_mgr_;
 
   CostModelManager cost_model_manager_;
diff --git a/tensorflow/core/distributed_runtime/master.cc b/tensorflow/core/distributed_runtime/master.cc
index d1dc622ce79df1a98c3712e447a66bad3baecba1..1a488303ac73b8628b9d3fe4050ad9144724348e 100644
--- a/tensorflow/core/distributed_runtime/master.cc
+++ b/tensorflow/core/distributed_runtime/master.cc
@@ -528,8 +528,8 @@ void Master::ListDevices(const ListDevicesRequest* req,
       auto session = FindMasterSession(req->session_handle());
       if (session == nullptr) {
         done(errors::InvalidArgument(
-             "Session ", req->session_handle(),
-             " is not found. Possibly, this master has restarted."));
+            "Session ", req->session_handle(),
+            " is not found. Possibly, this master has restarted."));
         return;
       }
       core::ScopedUnref ref(session);
diff --git a/tensorflow/core/distributed_runtime/master_session.cc b/tensorflow/core/distributed_runtime/master_session.cc
index dcc25e4426df386da2543f76239a1468af4bc3d2..9d4a1eb8a1b4a74ee06057c621bfe5334d1609dd 100644
--- a/tensorflow/core/distributed_runtime/master_session.cc
+++ b/tensorflow/core/distributed_runtime/master_session.cc
@@ -1448,6 +1448,8 @@ Status MasterSession::DoPartialRun(CallOptions* opts,
     const auto count = run_state->count;
     pss.collect_timeline =
         req.options().trace_level() == RunOptions::FULL_TRACE;
+    pss.collect_rpcs =
+            req.options().trace_level() == RunOptions::FULL_TRACE;
     pss.report_tensor_allocations_upon_oom =
         req.options().report_tensor_allocations_upon_oom();
 
@@ -1610,6 +1612,8 @@ Status MasterSession::DoRunWithLocalExecution(
   TRACEPRINTF("stepid %llu", step_id);
 
   pss.collect_timeline = req.options().trace_level() == RunOptions::FULL_TRACE;
+  pss.collect_rpcs =
+          req.options().trace_level() == RunOptions::FULL_TRACE;
   pss.report_tensor_allocations_upon_oom =
       req.options().report_tensor_allocations_upon_oom();
   // Build the cost model every 'build_cost_model_every' steps after skipping an
diff --git a/tensorflow/core/distributed_runtime/master_test.cc b/tensorflow/core/distributed_runtime/master_test.cc
index 121c58762f10a87fea059ce43b190f70e49e1f64..f2c1f3489c388d6a5fff729b1c8f98136532105c 100644
--- a/tensorflow/core/distributed_runtime/master_test.cc
+++ b/tensorflow/core/distributed_runtime/master_test.cc
@@ -61,7 +61,7 @@ class MasterTest : public ::testing::Test {
   // rpc calls.
 
   Status CreateSession(const GraphDef& def, string* handle,
-                            int64* initial_version) {
+                       int64* initial_version) {
     ::grpc::ClientContext ctx;
     CreateSessionRequest req;
     *(req.mutable_graph_def()) = def;
@@ -77,7 +77,7 @@ class MasterTest : public ::testing::Test {
   }
 
   Status ExtendSession(const string& handle, const GraphDef& def,
-                            int64 current_version, int64* new_version) {
+                       int64 current_version, int64* new_version) {
     ::grpc::ClientContext ctx;
     ExtendSessionRequest req;
     req.set_session_handle(handle);
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_master_service.cc b/tensorflow/core/distributed_runtime/rpc/grpc_master_service.cc
index ac279937730466514451d7e81257d2110e128eff..b4d18d8607eaddd75f4e395e71fbd75554645a61 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_master_service.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_master_service.cc
@@ -185,23 +185,22 @@ class GrpcMasterService : public AsyncServiceInterface {
     MutableRunStepResponseWrapper* wrapped_response =
         new NonOwnedProtoRunStepResponse(&call->response);
     call->SetCancelCallback([call_opts]() { call_opts->StartCancel(); });
-    master_impl_->RunStep(call_opts, wrapped_request, wrapped_response,
-                          [call, call_opts, wrapped_request, wrapped_response,
-                           trace](const Status& status) {
-                            call->ClearCancelCallback();
-                            delete call_opts;
-                            delete wrapped_request;
-                            delete trace;
-                            if (call->request.store_errors_in_response_body() &&
-                                !status.ok()) {
-                              call->response.set_status_code(status.code());
-                              call->response.set_status_error_message(
-                                  status.error_message());
-                              call->SendResponse(ToGrpcStatus(Status::OK()));
-                            } else {
-                              call->SendResponse(ToGrpcStatus(status));
-                            }
-                          });
+    master_impl_->RunStep(
+        call_opts, wrapped_request, wrapped_response,
+        [call, call_opts, wrapped_request, wrapped_response,
+         trace](const Status& status) {
+          call->ClearCancelCallback();
+          delete call_opts;
+          delete wrapped_request;
+          delete trace;
+          if (call->request.store_errors_in_response_body() && !status.ok()) {
+            call->response.set_status_code(status.code());
+            call->response.set_status_error_message(status.error_message());
+            call->SendResponse(ToGrpcStatus(Status::OK()));
+          } else {
+            call->SendResponse(ToGrpcStatus(status));
+          }
+        });
     ENQUEUE_REQUEST(RunStep, true);
   }
 
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.h b/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.h
index 4e203e260a1a370cc2bc7e40c3ce9e84da4d3ad4..6ae94b74417c3fb6c4da1589bb9f532cb6d79930 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.h
@@ -89,9 +89,9 @@ class MasterService final {
     ::grpc::Status ExtendSession(::grpc::ClientContext* context,
                                  const ExtendSessionRequest& request,
                                  ExtendSessionResponse* response) override;
-    ::grpc::Status PartialRunSetup(
-        ::grpc::ClientContext* context, const PartialRunSetupRequest& request,
-        PartialRunSetupResponse* response) override;
+    ::grpc::Status PartialRunSetup(::grpc::ClientContext* context,
+                                   const PartialRunSetupRequest& request,
+                                   PartialRunSetupResponse* response) override;
     ::grpc::Status RunStep(::grpc::ClientContext* context,
                            const RunStepRequest& request,
                            RunStepResponse* response) override;
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_remote_master.cc b/tensorflow/core/distributed_runtime/rpc/grpc_remote_master.cc
index 70418f63686843414dca6c5ae4907ee263dc2904..1088e9be66ceb7fbddfaed0691423745f362343f 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_remote_master.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_remote_master.cc
@@ -69,8 +69,7 @@ class GrpcRemoteMaster : public MasterInterface {
     ::grpc::ClientContext ctx;
     auto trace = TraceRpc("RunStep/Client", &ctx);
     return Call(&ctx, call_options, &request->ToProto(),
-                get_proto_from_wrapper(response),
-                &MasterServiceStub::RunStep);
+                get_proto_from_wrapper(response), &MasterServiceStub::RunStep);
   }
 
   Status CloseSession(CallOptions* call_options,
@@ -114,8 +113,9 @@ class GrpcRemoteMaster : public MasterInterface {
   template <typename Request, typename Response>
   Status Call(::grpc::ClientContext* ctx, CallOptions* call_options,
               const Request* request, Response* response,
-              ::grpc::Status (MasterServiceStub::*pfunc)(
-                  ::grpc::ClientContext*, const Request&, Response*)) {
+              ::grpc::Status (MasterServiceStub::*pfunc)(::grpc::ClientContext*,
+                                                         const Request&,
+                                                         Response*)) {
     ctx->set_fail_fast(false);
     SetDeadline(ctx, call_options->GetTimeout());
     return FromGrpcStatus((stub_.get()->*pfunc)(ctx, *request, response));
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_testlib_ops.cc b/tensorflow/core/distributed_runtime/rpc/grpc_testlib_ops.cc
index 373eecffcab1dded60de7ffea96ba58208bb692c..5597ee7a76a55f125dd0db82eceb58f5e922ab13 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_testlib_ops.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_testlib_ops.cc
@@ -21,11 +21,8 @@ namespace tensorflow {
 namespace test {
 
 // ErrorOp::Compute returns an error.
-REGISTER_OP("Error")
-    .Input("in: T")
-    .Output("out: T")
-    .Attr("T: type")
-    .Attr("message: string");
+REGISTER_OP("Error").Input("in: T").Output("out: T").Attr("T: type").Attr(
+    "message: string");
 class ErrorOp : public OpKernel {
  public:
   explicit ErrorOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
@@ -66,11 +63,8 @@ REGISTER_KERNEL_BUILDER(Name("InvalidRefType").Device(DEVICE_CPU),
 
 // DelayOp::AsyncCompute sleeps for "micros"-econd and then returns
 // its input.
-REGISTER_OP("Delay")
-    .Input("in: T")
-    .Output("out: T")
-    .Attr("T: type")
-    .Attr("micros: int");
+REGISTER_OP("Delay").Input("in: T").Output("out: T").Attr("T: type").Attr(
+    "micros: int");
 class DelayOp : public AsyncOpKernel {
  public:
   explicit DelayOp(OpKernelConstruction* ctx) : AsyncOpKernel(ctx) {
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc
index 95811476f789be0225231f86aa0242db71b81199..b20e744a97160a17cd1621b38475a7c9c4f81d8f 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc
@@ -444,6 +444,24 @@ void GrpcWorker::GrpcRecvTensorAsync(CallOptions* opts,
       });
 }
 
+void GrpcWorker::LoggingAsync(const LoggingRequest* request,
+                              LoggingResponse* response, StatusCallback done) {
+  auto env = this->env();
+  if (env) {
+    auto session_mgr = (SessionMgr*)env->session_mgr;
+    if (session_mgr) {
+      session_mgr->SetLogging(request->rpc_logging());
+      for (const auto& step_id : request->fetch_step_id()) {
+        session_mgr->RetrieveLogs(step_id, response);
+      }
+      if (request->clear()) {
+        session_mgr->ClearLogs();
+      }
+    }
+  }
+  done(Status::OK());
+}
+
 WorkerEnv* GrpcWorker::env() { return env_; }
 
 std::unique_ptr<GrpcWorker> NewGrpcWorker(WorkerEnv* env) {
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h
index 78a21fd9f6ecb6deac171bb5c4a16fa074988fa2..3954af8ad8649471277bb089d4e00b7c7bfaa662 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h
@@ -40,6 +40,9 @@ class GrpcWorker : public Worker {
                                    ::grpc::ByteBuffer* response,
                                    StatusCallback done);
 
+  virtual void LoggingAsync(const LoggingRequest* request,
+                    LoggingResponse* response, StatusCallback done);
+
   WorkerEnv* env();
 
  private:
diff --git a/tensorflow/core/distributed_runtime/rpcbench_test.cc b/tensorflow/core/distributed_runtime/rpcbench_test.cc
index b2668fae25a8a6bc60b37ddfaa83b8b523c3a6f5..d3af7417e61105c788b8029c84c222e49a0d2830 100644
--- a/tensorflow/core/distributed_runtime/rpcbench_test.cc
+++ b/tensorflow/core/distributed_runtime/rpcbench_test.cc
@@ -184,8 +184,8 @@ static void BM_Helper(int iters, int width, int num_stages, int tensor_size,
 
   testing::SetLabel(
       strings::StrCat(def.node_size(), " nodes; ",
-             use_multiple_devices ? "Multi device" : "Single device",
-             "; tensor bytes/send: ", tensor_size * sizeof(float)));
+                      use_multiple_devices ? "Multi device" : "Single device",
+                      "; tensor bytes/send: ", tensor_size * sizeof(float)));
 
   std::vector<Tensor> outputs;
 
diff --git a/tensorflow/core/distributed_runtime/scheduler.cc b/tensorflow/core/distributed_runtime/scheduler.cc
index 4766f4c33b654481f7d99ab82939e33e77564771..9dae5b3b926fab14c2b36955436d3956baa29fdd 100644
--- a/tensorflow/core/distributed_runtime/scheduler.cc
+++ b/tensorflow/core/distributed_runtime/scheduler.cc
@@ -17,9 +17,9 @@ limitations under the License.
 
 #include <queue>
 
-#include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/device_set.h"
+#include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/util/util.h"
 
 namespace tensorflow {
diff --git a/tensorflow/core/distributed_runtime/scheduler.h b/tensorflow/core/distributed_runtime/scheduler.h
index eabcaccdd1e6c1a732f8871bc9da6265bd9a8dd8..ef87b9834dba50cf628a8c29c70b0266661d6227 100644
--- a/tensorflow/core/distributed_runtime/scheduler.h
+++ b/tensorflow/core/distributed_runtime/scheduler.h
@@ -16,15 +16,15 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_SCHEDULER_H_
 #define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_SCHEDULER_H_
 
-#include <functional>
 #include <deque>
+#include <functional>
 #include <map>
 #include <unordered_map>
 #include <vector>
 
-#include "tensorflow/core/graph/costmodel.h"
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/device_set.h"
+#include "tensorflow/core/graph/costmodel.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/distributed_runtime/session_mgr.cc b/tensorflow/core/distributed_runtime/session_mgr.cc
index 8db49e7f151517a51de1f64242031a8bd9bd96e6..51b9547f53ba687c863b0fd11647e7bb82d80e03 100644
--- a/tensorflow/core/distributed_runtime/session_mgr.cc
+++ b/tensorflow/core/distributed_runtime/session_mgr.cc
@@ -43,8 +43,8 @@ SessionMgr::SessionMgr(
       worker_cache_factory_(std::move(worker_cache_factory)) {}
 
 string SessionMgr::WorkerNameFromServerDef(const ServerDef& server_def) {
-  return strings::StrCat("/job:", server_def.job_name(),
-                         "/replica:0/task:", server_def.task_index());
+  return strings::StrCat("/job:", server_def.job_name(), "/replica:0/task:",
+                         server_def.task_index());
 }
 
 Status SessionMgr::CreateSession(const string& session,
@@ -64,8 +64,13 @@ Status SessionMgr::CreateSession(const string& session,
     TF_RETURN_IF_ERROR(worker_cache_factory_(server_def, &worker_cache));
   }
 
+  if (worker_cache != nullptr & default_worker_cache_.get() != nullptr) {
+    worker_cache->SetLogging(this->is_logging_active_);
+  }
+
   CHECK(!worker_env_->local_devices.empty())
       << "The WorkerEnv must have at least one device in `local_devices`.";
+
   std::vector<Device*> renamed_devices;
   for (Device* d : worker_env_->local_devices) {
     renamed_devices.push_back(RenamedDevice::NewRenamedDevice(
@@ -113,4 +118,77 @@ std::shared_ptr<WorkerSession> SessionMgr::LegacySession() {
   return legacy_session_;
 }
 
+void SessionMgr::SetLogging(bool active) {
+  mutex_lock l(mu_);
+  this->is_logging_active_ = active;
+  // Legacy Session
+  if (legacy_session_) {
+    auto* worker_cache = legacy_session_->worker_cache.get();
+    if (worker_cache) {
+      worker_cache->SetLogging(active);
+    }
+  }
+
+  for (const auto& session_kv : sessions_) {
+    auto session = session_kv.second.get();
+    if (session) {
+      auto* worker_cache = session->worker_cache.get();
+      if (worker_cache) {
+        worker_cache->SetLogging(active);
+      }
+    }
+  }
+}
+
+void SessionMgr::RetrieveLogs(tensorflow::int64 step_id,
+                              LoggingResponse* response) {
+  mutex_lock l(mu_);
+  // Legacy Session
+  if (legacy_session_) {
+    auto* worker_cache = legacy_session_->worker_cache.get();
+    if (worker_cache) {
+      auto step_stats = StepStats();
+      if (worker_cache->RetrieveLogs(step_id, &step_stats)) {
+        auto* labeled_step_stats = response->add_step();
+        labeled_step_stats->set_step_id(step_id);
+        labeled_step_stats->mutable_step_stats()->Swap(&step_stats);
+      }
+    }
+  }
+  for (const auto& session_kv : sessions_) {
+    auto session = session_kv.second.get();
+    if (session) {
+      auto* worker_cache = session->worker_cache.get();
+      if (worker_cache) {
+        auto step_stats = StepStats();
+        if (worker_cache->RetrieveLogs(step_id, &step_stats)) {
+          auto* labeled_step_stats = response->add_step();
+          labeled_step_stats->set_step_id(step_id);
+          labeled_step_stats->mutable_step_stats()->Swap(&step_stats);
+        }
+      }
+    }
+  }
+}
+
+void SessionMgr::ClearLogs() {
+  mutex_lock l(mu_);
+  // Legacy Session
+  if (legacy_session_) {
+    auto* worker_cache = legacy_session_->worker_cache.get();
+    if (worker_cache) {
+      worker_cache->ClearLogs();
+    }
+  }
+
+  for (const auto& session_kv : sessions_) {
+    auto session = session_kv.second.get();
+    if (session) {
+      auto* worker_cache = session->worker_cache.get();
+      if (worker_cache) {
+        worker_cache->ClearLogs();
+      }
+    }
+  }
+}
 }  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/session_mgr.h b/tensorflow/core/distributed_runtime/session_mgr.h
index 3ce260d12e92e3458fe12f3f5b5723f9c39b5f4b..4c9702d522cede454d5efd15669eaec2b0c1c1b1 100644
--- a/tensorflow/core/distributed_runtime/session_mgr.h
+++ b/tensorflow/core/distributed_runtime/session_mgr.h
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/protobuf/tensorflow_server.pb.h"
+#include "tensorflow/core/protobuf/worker.pb.h"
 
 namespace tensorflow {
 
@@ -56,6 +57,12 @@ class SessionMgr {
 
   static string WorkerNameFromServerDef(const ServerDef& server_def);
 
+  void SetLogging(bool active);
+
+  void RetrieveLogs(tensorflow::int64 step_id, LoggingResponse* response);
+
+  void ClearLogs();
+
  private:
   const WorkerEnv* const worker_env_;  // Not owned.
 
@@ -75,6 +82,8 @@ class SessionMgr {
   std::unique_ptr<WorkerCacheInterface> default_worker_cache_;
   std::shared_ptr<WorkerSession> legacy_session_;
 
+  bool is_logging_active_ = false;
+
   const WorkerCacheFactory worker_cache_factory_;
 
   std::shared_ptr<WorkerSession> WorkerSessionForSessionUnlocked(
diff --git a/tensorflow/core/distributed_runtime/tensor_coding.cc b/tensorflow/core/distributed_runtime/tensor_coding.cc
index fe2d1a12934dde814344b70f52fbc972f74347e0..34a4013547b5feef12b49198bff4e733f1b9e932 100644
--- a/tensorflow/core/distributed_runtime/tensor_coding.cc
+++ b/tensorflow/core/distributed_runtime/tensor_coding.cc
@@ -81,7 +81,7 @@ void TensorResponse::InitPartial(const RecvTensorResponse& response) {
 Status TensorResponse::ParseFrom(Source* source) {
   if (!on_host_) {
     protobuf::io::CodedInputStream input(source->contents());
-    input.SetTotalBytesLimit(INT_MAX, INT_MAX);  // Unlimited
+    input.SetTotalBytesLimit(INT_MAX);  // Unlimited
 
     // Pre-parse into local storage, then delegate to device.
     if (!meta_.ParseFromCodedStream(&input) || !input.ConsumedEntireMessage()) {
@@ -217,7 +217,7 @@ bool TensorResponse::ParseTensorSubmessage(
 
 bool TensorResponse::ParseFast(Source* source) {
   protobuf::io::CodedInputStream input(source->contents());
-  input.SetTotalBytesLimit(INT_MAX, INT_MAX);  // Unlimited
+  input.SetTotalBytesLimit(INT_MAX);  // Unlimited
   while (true) {
     auto p = input.ReadTagWithCutoff(127);
     int tag = GetTagFieldNumber(p.first);
diff --git a/tensorflow/core/distributed_runtime/worker_cache_logger.cc b/tensorflow/core/distributed_runtime/worker_cache_logger.cc
index 702af78c88014d54fe2f72a8266e5e7e43b3cfb9..95ca3c3b4d11fac0d103eb52f19d5b0b2f4ad3ea 100644
--- a/tensorflow/core/distributed_runtime/worker_cache_logger.cc
+++ b/tensorflow/core/distributed_runtime/worker_cache_logger.cc
@@ -97,9 +97,8 @@ void WorkerCacheLogger::RecordDataTransfer(int64 step_id, int64 start_usecs,
                                            const string& tensor_name,
                                            const string& src_device,
                                            const string& dst_device,
-                                           int64 bytes,
-                                           const string& details,
-                                           const string& transfer_method_name){
+                                           int64 bytes, const string& details,
+                                           const string& transfer_method_name) {
   NodeExecStats* ns = new NodeExecStats;
   ns->set_node_name(transfer_method_name);
   if (details.empty()) {
diff --git a/tensorflow/core/framework/bfloat16.cc b/tensorflow/core/framework/bfloat16.cc
index 0efe43fde2dadd42aa03d3bf2968d2cbfb113e8d..6025be517048d33b20f7af15ef7ad1339adebdf9 100644
--- a/tensorflow/core/framework/bfloat16.cc
+++ b/tensorflow/core/framework/bfloat16.cc
@@ -21,13 +21,13 @@ void FloatToBFloat16(const float* src, bfloat16* dst, int64 size) {
   const uint16_t* p = reinterpret_cast<const uint16_t*>(src);
   uint16_t* q = reinterpret_cast<uint16_t*>(dst);
 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
-    for (; size != 0; p += 2, q++, size--) {
-      *q = p[0];
-    }
+  for (; size != 0; p += 2, q++, size--) {
+    *q = p[0];
+  }
 #else
-    for (; size != 0; p += 2, q++, size--) {
-     *q = p[1];
-    }
+  for (; size != 0; p += 2, q++, size--) {
+    *q = p[1];
+  }
 #endif
 }
 
@@ -35,15 +35,15 @@ void BFloat16ToFloat(const bfloat16* src, float* dst, int64 size) {
   const uint16_t* p = reinterpret_cast<const uint16_t*>(src);
   uint16_t* q = reinterpret_cast<uint16_t*>(dst);
 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
-    for (; size != 0; p++, q += 2, size--) {
-      q[0] = *p;
-      q[1] = 0;
-    }
+  for (; size != 0; p++, q += 2, size--) {
+    q[0] = *p;
+    q[1] = 0;
+  }
 #else
-    for (; size != 0; p++, q += 2, size--) {
-      q[0] = 0;
-      q[1] = *p;
-    }
+  for (; size != 0; p++, q += 2, size--) {
+    q[0] = 0;
+    q[1] = *p;
+  }
 #endif
 }
 
diff --git a/tensorflow/core/framework/common_shape_fns.cc b/tensorflow/core/framework/common_shape_fns.cc
index 7ab8e3ec188a223e35b47b6f9517abd9327b23f8..8bb87483e1cfea6a411b6678f065f0fb59b4d91c 100644
--- a/tensorflow/core/framework/common_shape_fns.cc
+++ b/tensorflow/core/framework/common_shape_fns.cc
@@ -1356,10 +1356,11 @@ Status ScatterNdUpdateShape(InferenceContext* c) {
       Status s = c->Merge(prefix_indices, prefix_updates, &unused);
       if (!s.ok()) {
         return errors::InvalidArgument(
-            "The outer ", num_outer_dims, " dimensions of indices.shape=",
-            c->DebugString(indices_shape), " must match the outer ",
-            num_outer_dims, " dimensions of updates.shape=",
-            c->DebugString(updates_shape), ": ", s.error_message());
+            "The outer ", num_outer_dims,
+            " dimensions of indices.shape=", c->DebugString(indices_shape),
+            " must match the outer ", num_outer_dims,
+            " dimensions of updates.shape=", c->DebugString(updates_shape),
+            ": ", s.error_message());
       }
 
       ShapeHandle input_suffix;
diff --git a/tensorflow/core/framework/dataset.h b/tensorflow/core/framework/dataset.h
index 2c2c7e7c585c9364e1d08280d5fe76f1bf1eff23..f866183f61b1d5445fd7229ba1bc294d495a4f67 100644
--- a/tensorflow/core/framework/dataset.h
+++ b/tensorflow/core/framework/dataset.h
@@ -16,6 +16,8 @@ limitations under the License.
 #ifndef TENSORFLOW_FRAMEWORK_DATASET_H_
 #define TENSORFLOW_FRAMEWORK_DATASET_H_
 
+#include "tensorflow/core/lib/core/status.h"
+
 namespace tensorflow {
 namespace dataset {
 // Registry for stateful ops that need to be used in dataset functions.
diff --git a/tensorflow/core/framework/fake_input.cc b/tensorflow/core/framework/fake_input.cc
index ad301a8aa4ba4be5b7031d00984d8e6febf1583e..70d1e20a17c6cbf75a15d32a97216f6a1354ccf4 100644
--- a/tensorflow/core/framework/fake_input.cc
+++ b/tensorflow/core/framework/fake_input.cc
@@ -104,8 +104,8 @@ Status FakeInputImpl::AddInputToBuilder() {
       Status status = GetNodeAttr(*node_def_, arg_->type_list_attr(), &dts);
       if (!status.ok()) {
         return errors::InvalidArgument(
-            "Could not infer list of types for input '", arg_->name(), "': ",
-            status.error_message());
+            "Could not infer list of types for input '", arg_->name(),
+            "': ", status.error_message());
       }
       SourceList(dts);
       return Status::OK();
@@ -131,8 +131,8 @@ Status FakeInputImpl::GetN(int* n) const {
     Status status = GetNodeAttr(*node_def_, arg_->number_attr(), n);
     if (!status.ok()) {
       return errors::InvalidArgument("Could not infer length of input '",
-                                     arg_->name(), "': ",
-                                     status.error_message());
+                                     arg_->name(),
+                                     "': ", status.error_message());
     }
   }
   return Status::OK();
@@ -153,8 +153,8 @@ Status FakeInputImpl::GetDataType(DataType* dt) const {
         *dt = attr->default_value().type();
       } else {
         return errors::InvalidArgument("Could not infer type for input '",
-                                       arg_->name(), "': ",
-                                       status.error_message());
+                                       arg_->name(),
+                                       "': ", status.error_message());
       }
     }
   } else {
diff --git a/tensorflow/core/framework/function.cc b/tensorflow/core/framework/function.cc
index 0224f252270cdfb856957be33b3dd857ecb07ec9..d6b576166cafb4a70a24a2a96db12679f5ea644d 100644
--- a/tensorflow/core/framework/function.cc
+++ b/tensorflow/core/framework/function.cc
@@ -1264,8 +1264,8 @@ FunctionDef FunctionDefHelper::Define(const string& name,
     }
     for (const string& a : src.arg) {
       const auto iter = ret_index.find(a);
-      CHECK(iter != ret_index.end()) << "Node input '" << a << "' in '"
-                                     << src.ret[0] << "' of " << name;
+      CHECK(iter != ret_index.end())
+          << "Node input '" << a << "' in '" << src.ret[0] << "' of " << name;
       n->add_input(iter->second);
     }
     for (const string& d : src.dep) {
diff --git a/tensorflow/core/framework/function.h b/tensorflow/core/framework/function.h
index 3bb5638cdf232c144157b587a7431f435e2fa6ea..b933ee0b0e4009b1568d7465ca28d4b4f9a018e6 100644
--- a/tensorflow/core/framework/function.h
+++ b/tensorflow/core/framework/function.h
@@ -656,7 +656,7 @@ bool RegisterOp(const string& op, Creator func);
 // Returns OK the gradient creator for the "op" is found (may be
 // nullptr if REGISTER_OP_NO_GRADIENT is used.
 Status GetOpGradientCreator(const string& op, Creator* creator);
-};
+};  // namespace gradient
 
 // Declare explicit instantiations of GetAttr
 #define GET_ATTR(T)                                          \
diff --git a/tensorflow/core/framework/graph_def_util.cc b/tensorflow/core/framework/graph_def_util.cc
index bd018b7243897a5b45aa35d7fb94ca1ee1b12e75..1f670535d575e9bbc4196fb1f1e1c381d33ae204 100644
--- a/tensorflow/core/framework/graph_def_util.cc
+++ b/tensorflow/core/framework/graph_def_util.cc
@@ -35,8 +35,8 @@ namespace tensorflow {
 
 string SummarizeGraphDef(const GraphDef& graph_def) {
   string ret;
-  strings::StrAppend(&ret, "versions = ",
-                     ProtoShortDebugString(graph_def.versions()), ";\n");
+  strings::StrAppend(
+      &ret, "versions = ", ProtoShortDebugString(graph_def.versions()), ";\n");
   for (const NodeDef& node : graph_def.node()) {
     strings::StrAppend(&ret, SummarizeNodeDef(node), ";\n");
   }
@@ -90,9 +90,9 @@ static Status RemoveNewDefaultAttrsFromNodeDef(
           FindAttr(attr.first, *producer_op_def);
       if (producer_attr_def == nullptr) {
         return errors::InvalidArgument(
-            "Attr '", attr.first, "' missing in producer's OpDef: ",
-            SummarizeOpDef(*producer_op_def), " but found in node: ",
-            SummarizeNodeDef(*node_def));
+            "Attr '", attr.first,
+            "' missing in producer's OpDef: ", SummarizeOpDef(*producer_op_def),
+            " but found in node: ", SummarizeNodeDef(*node_def));
       }
       // ...and it has the same value as the default in producer,
       if (producer_attr_def->has_default_value() &&
diff --git a/tensorflow/core/framework/op_def_util.cc b/tensorflow/core/framework/op_def_util.cc
index a4e8add6c49b823948eb5978f99239bb4d9b52ef..2d035ab90d0f4493f6b6f572d0dd8550f5098e7e 100644
--- a/tensorflow/core/framework/op_def_util.cc
+++ b/tensorflow/core/framework/op_def_util.cc
@@ -170,20 +170,20 @@ const OpDef::ArgDef* FindInputArg(StringPiece name, const OpDef& op_def) {
   return nullptr;
 }
 
-#define VALIDATE(EXPR, ...)                                          \
-  do {                                                               \
-    if (!(EXPR)) {                                                   \
-      return errors::InvalidArgument(__VA_ARGS__, "; in OpDef: ",    \
-                                     ProtoShortDebugString(op_def)); \
-    }                                                                \
+#define VALIDATE(EXPR, ...)                                            \
+  do {                                                                 \
+    if (!(EXPR)) {                                                     \
+      return errors::InvalidArgument(                                  \
+          __VA_ARGS__, "; in OpDef: ", ProtoShortDebugString(op_def)); \
+    }                                                                  \
   } while (false)
 
 static Status ValidateArg(const OpDef::ArgDef& arg, const OpDef& op_def,
                           bool output, std::set<string>* names) {
   const string suffix = strings::StrCat(
       output ? " for output '" : " for input '", arg.name(), "'");
-  VALIDATE(gtl::InsertIfNotPresent(names, arg.name()), "Duplicate name: ",
-           arg.name());
+  VALIDATE(gtl::InsertIfNotPresent(names, arg.name()),
+           "Duplicate name: ", arg.name());
   VALIDATE(HasAttrStyleType(arg), "Missing type", suffix);
 
   if (!arg.number_attr().empty()) {
@@ -250,8 +250,8 @@ Status ValidateOpDef(const OpDef& op_def) {
   std::set<string> names;  // for detecting duplicate names
   for (const auto& attr : op_def.attr()) {
     // Validate name
-    VALIDATE(gtl::InsertIfNotPresent(&names, attr.name()), "Duplicate name: ",
-             attr.name());
+    VALIDATE(gtl::InsertIfNotPresent(&names, attr.name()),
+             "Duplicate name: ", attr.name());
     DataType dt;
     VALIDATE(!DataTypeFromString(attr.name(), &dt), "Attr can't have name ",
              attr.name(), " that matches a data type");
@@ -680,8 +680,8 @@ Status OpDefAddedDefaultsUnchanged(const OpDef& old_op,
     if (!penultimate_attr.has_default_value() ||
         !new_attr->has_default_value()) {
       return errors::InvalidArgument("Missing default for attr '",
-                                     penultimate_attr.name(), "' in op: ",
-                                     SummarizeOpDef(new_op));
+                                     penultimate_attr.name(),
+                                     "' in op: ", SummarizeOpDef(new_op));
     }
 
     // Actually test that the attr's default value hasn't changed.
diff --git a/tensorflow/core/framework/op_def_util_test.cc b/tensorflow/core/framework/op_def_util_test.cc
index 28809c11c58704479c9c45b1de96dffef3d575bd..2b9812d4fcbc145540155959b19dd37cf902c1a2 100644
--- a/tensorflow/core/framework/op_def_util_test.cc
+++ b/tensorflow/core/framework/op_def_util_test.cc
@@ -200,10 +200,11 @@ TEST_F(ValidateOpDefTest, BadAttrDefault) {
                           "default_value { list { s: ['foo'] } } }"),
                 "Length for attr 'a' of 1 must be at least minimum 2\n\t in Op "
                 "'BadAttrDef'");
-  ExpectFailure(TestBuilder(OpDefBuilder("GoodAttrDef")
-                                .Attr("a: list(type) >=2 = [DT_STRING]")),
-                "Length for attr 'a' of 1 must be at least minimum 2\n\t in Op "
-                "'GoodAttrDef'");
+  ExpectFailure(
+      TestBuilder(
+          OpDefBuilder("GoodAttrDef").Attr("a: list(type) >=2 = [DT_STRING]")),
+      "Length for attr 'a' of 1 must be at least minimum 2\n\t in Op "
+      "'GoodAttrDef'");
 }
 
 TEST_F(ValidateOpDefTest, NoRefTypes) {
@@ -213,9 +214,10 @@ TEST_F(ValidateOpDefTest, NoRefTypes) {
   ExpectFailure(
       TestBuilder(OpDefBuilder("BadAttrDef").Attr("T: type = DT_INT32_REF")),
       "AttrValue must not have reference type value of int32_ref");
-  ExpectFailure(TestBuilder(OpDefBuilder("BadAttrDef")
-                                .Attr("T: list(type) = [DT_STRING_REF]")),
-                "AttrValue must not have reference type value of string_ref");
+  ExpectFailure(
+      TestBuilder(
+          OpDefBuilder("BadAttrDef").Attr("T: list(type) = [DT_STRING_REF]")),
+      "AttrValue must not have reference type value of string_ref");
 }
 
 TEST_F(ValidateOpDefTest, BadAttrMin) {
@@ -245,9 +247,10 @@ TEST_F(ValidateOpDefTest, BadAttrAllowed) {
   TF_EXPECT_OK(TestBuilder(
       OpDefBuilder("GoodAttrtude").Attr("x: numbertype = DT_INT32")));
   // Not in list of allowed types.
-  ExpectFailure(TestBuilder(OpDefBuilder("BadAttrtude")
-                                .Attr("x: numbertype = DT_STRING")),
-                "attr 'x' of string is not in the list of allowed values");
+  ExpectFailure(
+      TestBuilder(
+          OpDefBuilder("BadAttrtude").Attr("x: numbertype = DT_STRING")),
+      "attr 'x' of string is not in the list of allowed values");
   ExpectFailure(
       TestBuilder(OpDefBuilder("BadAttrtude")
                       .Attr("x: list(realnumbertype) = [DT_COMPLEX64]")),
@@ -260,9 +263,10 @@ TEST_F(ValidateOpDefTest, BadAttrAllowed) {
   TF_EXPECT_OK(TestBuilder(
       OpDefBuilder("GoodAttrtude").Attr("x: {'foo', 'bar'} = 'bar'")));
   // Not in list of allowed strings.
-  ExpectFailure(TestBuilder(OpDefBuilder("BadAttrtude")
-                                .Attr("x: {'foo', 'bar'} = 'baz'")),
-                "attr 'x' of \"baz\" is not in the list of allowed values");
+  ExpectFailure(
+      TestBuilder(
+          OpDefBuilder("BadAttrtude").Attr("x: {'foo', 'bar'} = 'baz'")),
+      "attr 'x' of \"baz\" is not in the list of allowed values");
   ExpectFailure(TestBuilder(OpDefBuilder("BadAttrtude")
                                 .Attr("x: list({'foo', 'bar'}) = ['baz']")),
                 "attr 'x' of \"baz\" is not in the list of allowed values");
diff --git a/tensorflow/core/framework/op_gen_lib.cc b/tensorflow/core/framework/op_gen_lib.cc
index e78b6ab5d977c6ea2f0dec66988432a617154916..5f2eb9d99ab11f9862bd277d93af61c05e2517f4 100644
--- a/tensorflow/core/framework/op_gen_lib.cc
+++ b/tensorflow/core/framework/op_gen_lib.cc
@@ -266,35 +266,6 @@ static void StringReplace(const string& from, const string& to, string* s) {
   *s = str_util::Join(split, to.c_str());
 }
 
-static void RenameInDocs(const string& from, const string& to, OpDef* op_def) {
-  const string from_quoted = strings::StrCat("`", from, "`");
-  const string to_quoted = strings::StrCat("`", to, "`");
-  for (int i = 0; i < op_def->input_arg_size(); ++i) {
-    if (!op_def->input_arg(i).description().empty()) {
-      StringReplace(from_quoted, to_quoted,
-                    op_def->mutable_input_arg(i)->mutable_description());
-    }
-  }
-  for (int i = 0; i < op_def->output_arg_size(); ++i) {
-    if (!op_def->output_arg(i).description().empty()) {
-      StringReplace(from_quoted, to_quoted,
-                    op_def->mutable_output_arg(i)->mutable_description());
-    }
-  }
-  for (int i = 0; i < op_def->attr_size(); ++i) {
-    if (!op_def->attr(i).description().empty()) {
-      StringReplace(from_quoted, to_quoted,
-                    op_def->mutable_attr(i)->mutable_description());
-    }
-  }
-  if (!op_def->summary().empty()) {
-    StringReplace(from_quoted, to_quoted, op_def->mutable_summary());
-  }
-  if (!op_def->description().empty()) {
-    StringReplace(from_quoted, to_quoted, op_def->mutable_description());
-  }
-}
-
 static void RenameInDocs(const string& from, const string& to,
                          ApiDef* api_def) {
   const string from_quoted = strings::StrCat("`", from, "`");
@@ -325,7 +296,6 @@ static void RenameInDocs(const string& from, const string& to,
   }
 }
 
-
 namespace {
 
 // Initializes given ApiDef with data in OpDef.
diff --git a/tensorflow/core/framework/op_gen_lib.h b/tensorflow/core/framework/op_gen_lib.h
index 94fe194a1a5032b472259d26145ba7cd4460191c..ff38e4b22141a7f1b7212a516ec5adbd5c7aad79 100644
--- a/tensorflow/core/framework/op_gen_lib.h
+++ b/tensorflow/core/framework/op_gen_lib.h
@@ -47,7 +47,6 @@ string PBTxtToMultiline(StringPiece pbtxt,
                         const std::vector<string>& multi_line_fields);
 string PBTxtFromMultiline(StringPiece multiline_pbtxt);
 
-
 // Takes a list of files with ApiDefs text protos, and allows you to
 // look up the specific ApiDef for any given op.
 class ApiDefMap {
diff --git a/tensorflow/core/framework/op_kernel.cc b/tensorflow/core/framework/op_kernel.cc
index aee3a0afbca23a180d5415fef2b1b405f23b3f53..fd2d06be9899852fa8ed61b2fdc4373ca4c0310e 100644
--- a/tensorflow/core/framework/op_kernel.cc
+++ b/tensorflow/core/framework/op_kernel.cc
@@ -101,7 +101,8 @@ OpKernel::OpKernel(OpKernelConstruction* context)
 
   // Kernels executing on GPU/SYCL tie very few resources on the CPU where the
   // scheduler runs: we consider them as inexpensive.
-  expensive_ = context->device_type() != DeviceType(DEVICE_GPU) && context->device_type() != DeviceType(DEVICE_SYCL);
+  expensive_ = context->device_type() != DeviceType(DEVICE_GPU) &&
+               context->device_type() != DeviceType(DEVICE_SYCL);
 }
 
 OpKernel::~OpKernel() {}
@@ -943,13 +944,6 @@ Status FindKernelRegistration(const DeviceType& device_type,
   return Status::OK();
 }
 
-Status FindKernelRegistration(const DeviceType& device_type, const Node& node,
-                              const KernelRegistration** reg,
-                              bool* was_attr_mismatch) {
-  return FindKernelRegistration(device_type, node.def(), reg,
-                                was_attr_mismatch);
-}
-
 }  // namespace
 
 // TODO(irving): Change const NodeDef& to const Node&
diff --git a/tensorflow/core/framework/op_kernel_test.cc b/tensorflow/core/framework/op_kernel_test.cc
index 94a9d1335a7c46372e05633431427d44fc46e027..b53b877f28d2c80e969fb418aa316ad96c6e2eaa 100644
--- a/tensorflow/core/framework/op_kernel_test.cc
+++ b/tensorflow/core/framework/op_kernel_test.cc
@@ -510,10 +510,9 @@ TEST_F(OpKernelBuilderTest, BuilderBoth) {
 }
 
 REGISTER_OP("BuildTypeAttr").Attr("T: type");
-REGISTER_KERNEL_BUILDER(Name("BuildTypeAttr")
-                            .Device(DEVICE_CPU)
-                            .TypeConstraint<float>("T"),
-                        DummyKernel);
+REGISTER_KERNEL_BUILDER(
+    Name("BuildTypeAttr").Device(DEVICE_CPU).TypeConstraint<float>("T"),
+    DummyKernel);
 
 TEST_F(OpKernelBuilderTest, BuilderTypeAttr) {
   ExpectSuccess("BuildTypeAttr", DEVICE_CPU, {"T|type|DT_FLOAT"});
@@ -525,10 +524,9 @@ TEST_F(OpKernelBuilderTest, BuilderTypeAttr) {
 }
 
 REGISTER_OP("BuildTypeListAttr").Attr("T: list(type)");
-REGISTER_KERNEL_BUILDER(Name("BuildTypeListAttr")
-                            .Device(DEVICE_CPU)
-                            .TypeConstraint<bool>("T"),
-                        DummyKernel);
+REGISTER_KERNEL_BUILDER(
+    Name("BuildTypeListAttr").Device(DEVICE_CPU).TypeConstraint<bool>("T"),
+    DummyKernel);
 
 TEST_F(OpKernelBuilderTest, BuilderTypeListAttr) {
   ExpectSuccess("BuildTypeListAttr", DEVICE_CPU, {"T|list(type)|[]"});
@@ -574,14 +572,12 @@ TEST_F(OpKernelBuilderTest, DuplicateKernel) {
 }
 
 REGISTER_OP("DuplicateKernelForT").Attr("T: type");
-REGISTER_KERNEL_BUILDER(Name("DuplicateKernelForT")
-                            .Device(DEVICE_CPU)
-                            .TypeConstraint<float>("T"),
-                        DummyKernel);
-REGISTER_KERNEL_BUILDER(Name("DuplicateKernelForT")
-                            .Device(DEVICE_CPU)
-                            .TypeConstraint<float>("T"),
-                        DummyKernel);
+REGISTER_KERNEL_BUILDER(
+    Name("DuplicateKernelForT").Device(DEVICE_CPU).TypeConstraint<float>("T"),
+    DummyKernel);
+REGISTER_KERNEL_BUILDER(
+    Name("DuplicateKernelForT").Device(DEVICE_CPU).TypeConstraint<float>("T"),
+    DummyKernel);
 
 TEST_F(OpKernelBuilderTest, DuplicateKernelForT) {
   const NodeDef ndef =
diff --git a/tensorflow/core/framework/reader_base.cc b/tensorflow/core/framework/reader_base.cc
index b8c771a0a1955b29f78478f60972b22d804351b2..f84ef0f953cf23e3fb2af210706586f95cfbb8ad 100644
--- a/tensorflow/core/framework/reader_base.cc
+++ b/tensorflow/core/framework/reader_base.cc
@@ -178,9 +178,9 @@ void ReaderBase::Read(QueueInterface* queue, string* key, string* value,
           " must set *at_end=true, *produced=true, or return an error.");
     }
     if (!status.ok() && produced) {
-      status = errors::Internal("ReadLocked() for ", name(),
-                                " set *produced=true *and* returned an error: ",
-                                status.ToString());
+      status = errors::Internal(
+          "ReadLocked() for ", name(),
+          " set *produced=true *and* returned an error: ", status.ToString());
     }
     if (status.ok() && at_end) {
       status = OnWorkFinishedLocked();
diff --git a/tensorflow/core/framework/register_types.h b/tensorflow/core/framework/register_types.h
index edc93aec7f801b77a5c7867589f9d89ff7b6ea8f..17d16c9b8d6871794dc0d048e0fe230b4e6ad1e6 100644
--- a/tensorflow/core/framework/register_types.h
+++ b/tensorflow/core/framework/register_types.h
@@ -53,7 +53,7 @@ limitations under the License.
 */
 
 #if !defined(IS_MOBILE_PLATFORM) || defined(SUPPORT_SELECTIVE_REGISTRATION) || \
-    defined(NVIDIA_TEGRA)
+    defined(ANDROID_TEGRA)
 
 // All types are supported, so all macros are invoked.
 //
@@ -211,14 +211,12 @@ limitations under the License.
 #define TF_CALL_SYCL_double(m)
 #else  // TENSORFLOW_SYCL_NO_DOUBLE
 #define TF_CALL_SYCL_double(m) TF_CALL_double(m)
-#endif // TENSORFLOW_SYCL_NO_DOUBLE
+#endif  // TENSORFLOW_SYCL_NO_DOUBLE
 
 #ifdef __ANDROID_TYPES_SLIM__
-#define TF_CALL_SYCL_NUMBER_TYPES(m)  TF_CALL_float(m)
+#define TF_CALL_SYCL_NUMBER_TYPES(m) TF_CALL_float(m)
 #else  // __ANDROID_TYPES_SLIM__
-#define TF_CALL_SYCL_NUMBER_TYPES(m)    \
-    TF_CALL_float(m)                    \
-    TF_CALL_SYCL_double(m)
-#endif // __ANDROID_TYPES_SLIM__
+#define TF_CALL_SYCL_NUMBER_TYPES(m) TF_CALL_float(m) TF_CALL_SYCL_double(m)
+#endif  // __ANDROID_TYPES_SLIM__
 
 #endif  // TENSORFLOW_FRAMEWORK_REGISTER_TYPES_H_
diff --git a/tensorflow/core/framework/register_types_traits.h b/tensorflow/core/framework/register_types_traits.h
index c1fe5517c6986838a07f67c0f2fa5474f89ffa33..ab35c2f0951d21e63fe06e378461c019e45495f1 100644
--- a/tensorflow/core/framework/register_types_traits.h
+++ b/tensorflow/core/framework/register_types_traits.h
@@ -23,7 +23,7 @@ typedef Eigen::GpuDevice GPUDevice;
 
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 #include "tensorflow/core/framework/numeric_types.h"
 #include "tensorflow/core/platform/types.h"
@@ -79,7 +79,7 @@ template <>
 struct proxy_type_pod<SYCLDevice, 4> {
   typedef float type;
 };
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 /// If POD we use proxy_type_pod, otherwise this maps to identiy.
 template <typename Device, typename T>
@@ -99,7 +99,7 @@ struct proxy_type {
 #ifdef TENSORFLOW_USE_SYCL
 #define TF_CALL_SYCL_PROXY_TYPES(m) \
   TF_CALL_double(m) TF_CALL_float(m) TF_CALL_int32(m)
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_FRAMEWORK_REGISTER_TYPES_TRAITS_H_
diff --git a/tensorflow/core/framework/rendezvous_test.cc b/tensorflow/core/framework/rendezvous_test.cc
index 32b8ad784d5228a40a073d166f33972def380280..de148f0bd3474421c1361cf7ae4aa681107aa883 100644
--- a/tensorflow/core/framework/rendezvous_test.cc
+++ b/tensorflow/core/framework/rendezvous_test.cc
@@ -69,9 +69,7 @@ class LocalRendezvousTest : public ::testing::Test {
     rendez_ = NewLocalRendezvous();
   }
 
-  ~LocalRendezvousTest() override {
-    rendez_->Unref();
-  }
+  ~LocalRendezvousTest() override { rendez_->Unref(); }
 
   void SchedClosure(std::function<void()> fn) {
     threads_.Schedule(std::move(fn));
@@ -99,8 +97,8 @@ string V(const Tensor& tensor) {
 
 Rendezvous::ParsedKey MakeKey(const string& name) {
   string s = Rendezvous::CreateKey("/job:mnist/replica:1/task:2/CPU:0", 7890,
-                                   "/job:mnist/replica:1/task:2/device:GPU:0", name,
-                                   FrameAndIter(0, 0));
+                                   "/job:mnist/replica:1/task:2/device:GPU:0",
+                                   name, FrameAndIter(0, 0));
   Rendezvous::ParsedKey k;
   TF_EXPECT_OK(Rendezvous::ParseKey(s, &k));
   return k;
diff --git a/tensorflow/core/framework/shape_inference.h b/tensorflow/core/framework/shape_inference.h
index d552ec1693f89a6695609681f2e8bffa9d78f93c..e3cc848a169bd848b8f3617d552938ba1ced3663 100644
--- a/tensorflow/core/framework/shape_inference.h
+++ b/tensorflow/core/framework/shape_inference.h
@@ -32,7 +32,7 @@ class ShapeRefinerTest;
 namespace grappler {
 class GraphProperties;
 class SymbolicShapeManager;
-}
+}  // namespace grappler
 
 namespace shape_inference {
 
diff --git a/tensorflow/core/framework/shape_inference_test.cc b/tensorflow/core/framework/shape_inference_test.cc
index a9b63ca60e4574bb0d59c4b939ac157e62f317e8..f48a7b9c47df3cfa93434ccf585dda8c5a29a2ba 100644
--- a/tensorflow/core/framework/shape_inference_test.cc
+++ b/tensorflow/core/framework/shape_inference_test.cc
@@ -760,7 +760,10 @@ TEST_F(ShapeInferenceTest, MergePrefix) {
   NodeDef def;
   InferenceContext c(kVersion, &def, MakeOpDef(4, 2),
                      {
-                         Unknown(), S({-1, 2}), S({1, -1, 3}), S({2, 4}),
+                         Unknown(),
+                         S({-1, 2}),
+                         S({1, -1, 3}),
+                         S({2, 4}),
                      },
                      {}, {}, {});
 
diff --git a/tensorflow/core/framework/tensor_shape_test.cc b/tensorflow/core/framework/tensor_shape_test.cc
index d8a9c0bac5b950157044dae07771b6733481ac9e..d7517bb311d517351f4dd2a59438780482485dff 100644
--- a/tensorflow/core/framework/tensor_shape_test.cc
+++ b/tensorflow/core/framework/tensor_shape_test.cc
@@ -582,7 +582,8 @@ TEST(TensorShapeTest, Large) {
 TEST(TensorShapeTest, Overflow) {
   int64 one = 1;
   std::vector<std::vector<int64>> overflows = {
-      {1 << 30, 1 << 30, 1 << 30}, {1 << 5, (one << 60) + 1},
+      {1 << 30, 1 << 30, 1 << 30},
+      {1 << 5, (one << 60) + 1},
   };
   for (const auto& overflow : overflows) {
     TensorShapeProto proto;
diff --git a/tensorflow/core/framework/tensor_testutil.cc b/tensorflow/core/framework/tensor_testutil.cc
index a8d141230093152397c792588a716c00556df77d..8f480d65f25012b858d7d375196b2693d3a533b9 100644
--- a/tensorflow/core/framework/tensor_testutil.cc
+++ b/tensorflow/core/framework/tensor_testutil.cc
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include <cmath>
 #include "tensorflow/core/framework/tensor_testutil.h"
+#include <cmath>
 
 namespace tensorflow {
 namespace test {
diff --git a/tensorflow/core/framework/tensor_types.h b/tensorflow/core/framework/tensor_types.h
index 921f88dc0ba09e7904333613b728021751d5425c..a5c1a56bfc06a9785f08c468f78bda5111e15409 100644
--- a/tensorflow/core/framework/tensor_types.h
+++ b/tensorflow/core/framework/tensor_types.h
@@ -25,7 +25,8 @@ template <typename T, int NDIMS = 1, typename IndexType = Eigen::DenseIndex>
 struct TTypes {
   // Rank-<NDIMS> tensor of scalar type T.
   typedef Eigen::TensorMap<Eigen::Tensor<T, NDIMS, Eigen::RowMajor, IndexType>,
-                           Eigen::Aligned> Tensor;
+                           Eigen::Aligned>
+      Tensor;
   typedef Eigen::TensorMap<
       Eigen::Tensor<const T, NDIMS, Eigen::RowMajor, IndexType>, Eigen::Aligned>
       ConstTensor;
@@ -33,35 +34,42 @@ struct TTypes {
   // Unaligned Rank-<NDIMS> tensor of scalar type T.
   typedef Eigen::TensorMap<Eigen::Tensor<T, NDIMS, Eigen::RowMajor, IndexType> >
       UnalignedTensor;
-  typedef Eigen::TensorMap<Eigen::Tensor<const T, NDIMS, Eigen::RowMajor,
-                                         IndexType> > UnalignedConstTensor;
+  typedef Eigen::TensorMap<
+      Eigen::Tensor<const T, NDIMS, Eigen::RowMajor, IndexType> >
+      UnalignedConstTensor;
 
   typedef Eigen::TensorMap<Eigen::Tensor<T, NDIMS, Eigen::RowMajor, int>,
-                           Eigen::Aligned> Tensor32Bit;
+                           Eigen::Aligned>
+      Tensor32Bit;
 
   // Scalar tensor (implemented as a rank-0 tensor) of scalar type T.
   typedef Eigen::TensorMap<
       Eigen::TensorFixedSize<T, Eigen::Sizes<>, Eigen::RowMajor, IndexType>,
-      Eigen::Aligned> Scalar;
+      Eigen::Aligned>
+      Scalar;
   typedef Eigen::TensorMap<Eigen::TensorFixedSize<const T, Eigen::Sizes<>,
                                                   Eigen::RowMajor, IndexType>,
-                           Eigen::Aligned> ConstScalar;
+                           Eigen::Aligned>
+      ConstScalar;
 
   // Unaligned Scalar tensor of scalar type T.
-  typedef Eigen::TensorMap<Eigen::TensorFixedSize<
-      T, Eigen::Sizes<>, Eigen::RowMajor, IndexType> > UnalignedScalar;
+  typedef Eigen::TensorMap<
+      Eigen::TensorFixedSize<T, Eigen::Sizes<>, Eigen::RowMajor, IndexType> >
+      UnalignedScalar;
   typedef Eigen::TensorMap<Eigen::TensorFixedSize<const T, Eigen::Sizes<>,
                                                   Eigen::RowMajor, IndexType> >
       UnalignedConstScalar;
 
   // Rank-1 tensor (vector) of scalar type T.
   typedef Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, IndexType>,
-                           Eigen::Aligned> Flat;
+                           Eigen::Aligned>
+      Flat;
   typedef Eigen::TensorMap<
       Eigen::Tensor<const T, 1, Eigen::RowMajor, IndexType>, Eigen::Aligned>
       ConstFlat;
   typedef Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, IndexType>,
-                           Eigen::Aligned> Vec;
+                           Eigen::Aligned>
+      Vec;
   typedef Eigen::TensorMap<
       Eigen::Tensor<const T, 1, Eigen::RowMajor, IndexType>, Eigen::Aligned>
       ConstVec;
@@ -69,16 +77,19 @@ struct TTypes {
   // Unaligned Rank-1 tensor (vector) of scalar type T.
   typedef Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, IndexType> >
       UnalignedFlat;
-  typedef Eigen::TensorMap<Eigen::Tensor<const T, 1, Eigen::RowMajor,
-                                         IndexType> > UnalignedConstFlat;
+  typedef Eigen::TensorMap<
+      Eigen::Tensor<const T, 1, Eigen::RowMajor, IndexType> >
+      UnalignedConstFlat;
   typedef Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, IndexType> >
       UnalignedVec;
   typedef Eigen::TensorMap<
-      Eigen::Tensor<const T, 1, Eigen::RowMajor, IndexType> > UnalignedConstVec;
+      Eigen::Tensor<const T, 1, Eigen::RowMajor, IndexType> >
+      UnalignedConstVec;
 
   // Rank-2 tensor (matrix) of scalar type T.
   typedef Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor, IndexType>,
-                           Eigen::Aligned> Matrix;
+                           Eigen::Aligned>
+      Matrix;
   typedef Eigen::TensorMap<
       Eigen::Tensor<const T, 2, Eigen::RowMajor, IndexType>, Eigen::Aligned>
       ConstMatrix;
@@ -86,8 +97,9 @@ struct TTypes {
   // Unaligned Rank-2 tensor (matrix) of scalar type T.
   typedef Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor, IndexType> >
       UnalignedMatrix;
-  typedef Eigen::TensorMap<Eigen::Tensor<const T, 2, Eigen::RowMajor,
-                                         IndexType> > UnalignedConstMatrix;
+  typedef Eigen::TensorMap<
+      Eigen::Tensor<const T, 2, Eigen::RowMajor, IndexType> >
+      UnalignedConstMatrix;
 };
 
 typedef typename TTypes<float, 1>::Tensor32Bit::Index Index32;
diff --git a/tensorflow/core/framework/types_test.cc b/tensorflow/core/framework/types_test.cc
index 5ddc9865633623561760bbcb06d1edf4eecec7a6..60f2b4135a68c4eed618e3efb07758fbab85fa07 100644
--- a/tensorflow/core/framework/types_test.cc
+++ b/tensorflow/core/framework/types_test.cc
@@ -70,8 +70,8 @@ TEST(TypesTest, kDataTypeRefOffset) {
       << "Extra reference enum "
       << enum_descriptor->FindValueByNumber(e_ref)->name()
       << " without corresponding base enum with value " << e;
-  ASSERT_LT(DataType_MAX, e_ref) << "Gap in reference types, missing value for "
-                                 << e_ref;
+  ASSERT_LT(DataType_MAX, e_ref)
+      << "Gap in reference types, missing value for " << e_ref;
 
   // Make sure there are no enums defined after the last regular type before
   // the first reference type.
diff --git a/tensorflow/core/graph/costmodel.cc b/tensorflow/core/graph/costmodel.cc
index b1e6cf64e837a04b0121a5e2c5c5a905cf1821f6..f47c9830862fbe30a9f3d753ec2919367345eaac 100644
--- a/tensorflow/core/graph/costmodel.cc
+++ b/tensorflow/core/graph/costmodel.cc
@@ -57,10 +57,10 @@ void CostModel::MergeFromLocal(const Graph& g, const CostModel& cm) {
     const int local_id = cm.Id(n);
     const int global_id = Id(n);
     if (local_id < 0 || global_id < 0) continue;
-    Ensure(global_id);
+    int num_slots = cm.slot_bytes_[local_id].size();
+    Ensure(global_id, num_slots);
     count_[global_id] += cm.count_[local_id];
     time_[global_id] += cm.time_[local_id];
-    int num_slots = cm.slot_bytes_[local_id].size();
     if (num_slots > 0) {
       if (slot_bytes_[global_id].empty()) {
         slot_bytes_[global_id].resize(num_slots);
@@ -78,11 +78,11 @@ void CostModel::MergeFromGlobal(const CostModel& cm) {
   CHECK(is_global_);
   CHECK_EQ(true, cm.is_global());
   const int num_nodes = cm.count_.size();
-  Ensure(num_nodes);
-  for (int i = 0; i < num_nodes; ++i) {
+  for (int i = num_nodes - 1; i >= 0; --i) {
     count_[i] += cm.count_[i];
     time_[i] += cm.time_[i];
     int num_slots = cm.slot_bytes_[i].size();
+    Ensure(i, num_slots);
     if (num_slots > 0) {
       if (slot_bytes_[i].empty()) {
         slot_bytes_[i].resize(num_slots);
@@ -106,7 +106,7 @@ void CostModel::MergeFromStats(const NodeNameToCostIdMap& map,
       // copy/send/recv nodes, feed/fetch, etc.
       if (iter == map.end()) continue;
       int32 global_id = iter->second;
-      Ensure(global_id);
+      Ensure(global_id, ns.output_size());
       int64 elapsed_micros = ns.op_end_rel_micros() - ns.op_start_rel_micros();
       count_[global_id]++;
       time_[global_id] += elapsed_micros;
@@ -122,7 +122,7 @@ void CostModel::MergeFromStats(const NodeNameToCostIdMap& map,
   }
 }
 
-void CostModel::Ensure(int id) {
+void CostModel::Ensure(int id, int num_outputs) {
   if (slot_bytes_.size() <= static_cast<size_t>(id)) {
     slot_bytes_.resize(id + 1);
     count_.resize(id + 1);
@@ -131,25 +131,37 @@ void CostModel::Ensure(int id) {
     max_exec_time_.resize(id + 1);
     output_port_alloc_ids_.resize(id + 1);
   }
+  if (num_outputs > 0) {
+    auto perslot = &slot_bytes_[id];
+    auto output_port_alloc_ids = &output_port_alloc_ids_[id];
+    auto max_mem_usage = &max_mem_usage_[id];
+
+    CHECK_LE(perslot->size(), num_outputs);
+    DCHECK_EQ(output_port_alloc_ids->size(), perslot->size());
+    DCHECK_EQ(max_mem_usage->output_port_mem.size(), perslot->size());
+    DCHECK_EQ(max_mem_usage->output_port_shape.size(), perslot->size());
+    DCHECK_EQ(max_mem_usage->output_port_type.size(), perslot->size());
+
+    perslot->resize(num_outputs, Bytes(-1));
+    output_port_alloc_ids->resize(num_outputs, -1);
+    max_mem_usage->output_port_mem.resize(num_outputs, Bytes(-1));
+    max_mem_usage->output_port_shape.resize(num_outputs, unknown_shape_);
+    max_mem_usage->output_port_type.resize(num_outputs, DT_INVALID);
+  }
 }
 
 void CostModel::SetNumOutputs(const Node* node, int num_outputs) {
   const int id = Id(node);
   if (id < 0) return;
-  Ensure(id);
+  // Do not resize the number of slots before checking its existing number of
+  // slots.
+  Ensure(id, 0);
   auto perslot = &slot_bytes_[id];
-  auto max_mem_usage = &max_mem_usage_[id];
-  auto output_port_alloc_ids = &output_port_alloc_ids_[id];
   if (!perslot->empty()) {
-    CHECK_EQ(num_outputs, perslot->size()) << "Cannot resize slot_bytes, node="
-                                           << node->name();
-  } else {
-    perslot->resize(num_outputs, Bytes(-1));
-    output_port_alloc_ids->resize(num_outputs, -1);
-    max_mem_usage->output_port_mem.resize(num_outputs, Bytes(-1));
-    max_mem_usage->output_port_shape.resize(num_outputs, unknown_shape_);
-    max_mem_usage->output_port_type.resize(num_outputs, DT_INVALID);
+    CHECK_EQ(num_outputs, perslot->size())
+        << "Cannot resize slot_bytes, node=" << node->name();
   }
+  Ensure(id, num_outputs);
 }
 
 void CostModel::RecordCount(const Node* node, int count) {
@@ -198,7 +210,7 @@ void CostModel::RecordTime(const Node* node, Microseconds time) {
   const int id = Id(node);
   if (id < 0) return;
   DCHECK(node->IsOp()) << node->DebugString();
-  Ensure(id);
+  Ensure(id, node->num_outputs());
   time_[id] += time;
 }
 
@@ -240,7 +252,10 @@ void CostModel::RecordMaxMemorySize(const Node* node, int output_slot,
                                     const DataType& dtype) {
   const int id = Id(node);
   if (id < 0) return;
-  Ensure(id);
+  CHECK_LT(output_slot, node->num_outputs())
+      << "Unexpected output slot for node " << node->DebugString() << ". Got "
+      << output_slot << " but its num_outputs is " << node->num_outputs();
+  Ensure(id, node->num_outputs());
   auto& current_max = max_mem_usage_[id].output_port_mem[output_slot];
   // If the memory allocator doesn't track memory usage, let's infer a lower
   // bound from the tensor shape and its data type.
@@ -316,7 +331,7 @@ void CostModel::RecordMemoryStats(const Node* node,
 void CostModel::RecordMaxExecutionTime(const Node* node, Microseconds time) {
   const int id = Id(node);
   if (id < 0) return;
-  Ensure(id);
+  Ensure(id, node->num_outputs());
   max_exec_time_[id] = std::max(max_exec_time_[id], time);
 }
 
@@ -332,7 +347,7 @@ void CostModel::RecordAllocationId(const Node* node, int output_slot,
                                    int64 alloc_id) {
   const int id = Id(node);
   if (id < 0) return;
-  Ensure(id);
+  Ensure(id, node->num_outputs());
   output_port_alloc_ids_[id][output_slot] = alloc_id;
 }
 
diff --git a/tensorflow/core/graph/costmodel.h b/tensorflow/core/graph/costmodel.h
index 081eb2ff4c226c4dd5079f16cc6c2a102d0d2d63..9b703e46938b3355ed769045cdb3f298b48bb922 100644
--- a/tensorflow/core/graph/costmodel.h
+++ b/tensorflow/core/graph/costmodel.h
@@ -183,8 +183,8 @@ class CostModel {
 
   const bool is_global_;
 
-  // Resizes vectors so that they are large enough for "id".
-  void Ensure(int id);
+  // Resizes vectors so that they are large enough for "id" and id's outputs.
+  void Ensure(int id, int num_outputs);
 
   // Nodes and Edges whose count is < this value
   // get type/byte estimates of 0.
@@ -198,7 +198,7 @@ class CostModel {
   // Cumulative execution time.
   std::vector<Microseconds> time_;
   // Cumulative Bytes output on each channel.
-  std::vector<gtl::InlinedVector<Bytes, 2> > slot_bytes_;
+  std::vector<gtl::InlinedVector<Bytes, 2>> slot_bytes_;
 
   // Maximum execution time
   std::vector<Microseconds> max_exec_time_;
@@ -217,7 +217,7 @@ class CostModel {
   };
   std::vector<MemUsage> max_mem_usage_;
 
-  std::vector<gtl::InlinedVector<int64, 2> > output_port_alloc_ids_;
+  std::vector<gtl::InlinedVector<int64, 2>> output_port_alloc_ids_;
 
   std::set<int64> persistent_alloc_ids_;
   std::map<string, std::set<int64>> persistent_alloc_ids_by_devices_;
diff --git a/tensorflow/core/graph/graph.h b/tensorflow/core/graph/graph.h
index b620127d9072a845721f97112f4bad107412b06f..93d8dd6f1100e9474b6e1c7afc56699163fc713f 100644
--- a/tensorflow/core/graph/graph.h
+++ b/tensorflow/core/graph/graph.h
@@ -62,8 +62,8 @@ class Node;
 class VersionDef;
 class WhileContext;
 
-class NeighborIter;  // Declared below
-class NodeIter;      // Declared below
+class NeighborIter;    // Declared below
+class NodeIter;        // Declared below
 class NodeProperties;  // Defined in .cc
 
 class Node {
diff --git a/tensorflow/core/graph/graph_def_builder_test.cc b/tensorflow/core/graph/graph_def_builder_test.cc
index e85de71ef79988199cd194274f2ef9986e86d350..e928c81b45385ca0e10c2e6e1521d6d5d5a5eaf9 100644
--- a/tensorflow/core/graph/graph_def_builder_test.cc
+++ b/tensorflow/core/graph/graph_def_builder_test.cc
@@ -26,7 +26,6 @@ namespace tensorflow {
 namespace {
 
 TEST(GraphDefBuilderTest, Version) {
-
   // Verify that our assertions will be nontrivial
   ASSERT_LT(0, TF_GRAPH_DEF_VERSION);
 
diff --git a/tensorflow/core/graph/mkl_graph_util.h b/tensorflow/core/graph/mkl_graph_util.h
index 3df981437afed760744ef870fd542d7abdd6e25d..1b99d54e8e33fd5155913a78ee833343bf92b905 100644
--- a/tensorflow/core/graph/mkl_graph_util.h
+++ b/tensorflow/core/graph/mkl_graph_util.h
@@ -21,102 +21,101 @@ limitations under the License.
 #include "tensorflow/core/framework/op_kernel.h"
 
 namespace tensorflow {
-  // Since our ops are going to produce and also consume N addition tensors
-  // (Mkl) for N Tensorflow tensors, we can have following different
-  // orderings among these 2N tensors.
-  //
-  // E.g., for Tensorflow tensors A, B, and C, our ops will produce and
-  // consume A_m, B_m, and C_m additionally.
-  //
-  // INTERLEAVED: in this case 2N tensors are interleaved. So for above
-  //              example, the ordering looks like: A, A_m, B, B_m, C, C_m.
-  //
-  // CONTIGUOUS: in thi case N Tensorflow tensors are contiguous followed
-  //             by N Mkl tensors. So for above example, the ordering looks
-  //             like: A, B, C, A_m, B_m, C_m
-  //
-  // Following APIs map index of original Tensorflow tensors to their
-  // appropriate position based on selected ordering. For contiguous ordering,
-  // we need to know the total number of tensors (parameter total).
-  //
-  typedef enum { TENSORS_INTERLEAVED, TENSORS_CONTIGUOUS } MklTfTensorOrdering;
-  // NOTE: Currently, we use contiguous ordering. If you change this, then you
-  // would need to change Mkl op definitions in nn_ops.cc.
-  static MklTfTensorOrdering kTensorOrdering = TENSORS_CONTIGUOUS;
+// Since our ops are going to produce and also consume N addition tensors
+// (Mkl) for N Tensorflow tensors, we can have following different
+// orderings among these 2N tensors.
+//
+// E.g., for Tensorflow tensors A, B, and C, our ops will produce and
+// consume A_m, B_m, and C_m additionally.
+//
+// INTERLEAVED: in this case 2N tensors are interleaved. So for above
+//              example, the ordering looks like: A, A_m, B, B_m, C, C_m.
+//
+// CONTIGUOUS: in thi case N Tensorflow tensors are contiguous followed
+//             by N Mkl tensors. So for above example, the ordering looks
+//             like: A, B, C, A_m, B_m, C_m
+//
+// Following APIs map index of original Tensorflow tensors to their
+// appropriate position based on selected ordering. For contiguous ordering,
+// we need to know the total number of tensors (parameter total).
+//
+typedef enum { TENSORS_INTERLEAVED, TENSORS_CONTIGUOUS } MklTfTensorOrdering;
+// NOTE: Currently, we use contiguous ordering. If you change this, then you
+// would need to change Mkl op definitions in nn_ops.cc.
+static MklTfTensorOrdering kTensorOrdering = TENSORS_CONTIGUOUS;
 
-  // Get index of MetaData tensor from index 'n' of Data tensor.
-  inline int DataIndexToMetaDataIndex(int n, int total_tensors) {
-    if (kTensorOrdering == MklTfTensorOrdering::TENSORS_INTERLEAVED) {
-      // For interleaved ordering, Mkl tensor follows immediately after
-      // Tensorflow tensor.
-      return n + 1;
-    } else {
-      CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
-      // For contiguous ordering, Mkl tensor is n+total_tensors / 2 away.
-      return n + total_tensors / 2;
-    }
+// Get index of MetaData tensor from index 'n' of Data tensor.
+inline int DataIndexToMetaDataIndex(int n, int total_tensors) {
+  if (kTensorOrdering == MklTfTensorOrdering::TENSORS_INTERLEAVED) {
+    // For interleaved ordering, Mkl tensor follows immediately after
+    // Tensorflow tensor.
+    return n + 1;
+  } else {
+    CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
+    // For contiguous ordering, Mkl tensor is n+total_tensors / 2 away.
+    return n + total_tensors / 2;
   }
+}
 
-  int inline GetTensorDataIndex(int n, int total_tensors) {
-      if (kTensorOrdering == MklTfTensorOrdering::TENSORS_INTERLEAVED) {
-        return 2 * n;  // index corresponding to nth input/output tensor
-      } else {
-        CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
-        return n;
-      }
-    }
+int inline GetTensorDataIndex(int n, int total_tensors) {
+  if (kTensorOrdering == MklTfTensorOrdering::TENSORS_INTERLEAVED) {
+    return 2 * n;  // index corresponding to nth input/output tensor
+  } else {
+    CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
+    return n;
+  }
+}
 
-  int inline GetTensorMetaDataIndex(int n, int total_tensors) {
-      // Get index for TensorData first and then use mapping function
-      // to get TensorMetaData index from TensorData index.
-      int tidx = GetTensorDataIndex(n, total_tensors);
-      return DataIndexToMetaDataIndex(tidx, total_tensors);
-    }
+int inline GetTensorMetaDataIndex(int n, int total_tensors) {
+  // Get index for TensorData first and then use mapping function
+  // to get TensorMetaData index from TensorData index.
+  int tidx = GetTensorDataIndex(n, total_tensors);
+  return DataIndexToMetaDataIndex(tidx, total_tensors);
+}
 
 namespace mkl_op_registry {
-  static const char* kMklOpLabel = "MklOp";
-  static const char* kMklOpLabelPattern = "label='MklOp'";
-  // Prefix that we add to Tensorflow op name to construct Mkl op name.
-  static const char* const kMklOpPrefix = "_Mkl";
+static const char* kMklOpLabel = "MklOp";
+static const char* kMklOpLabelPattern = "label='MklOp'";
+// Prefix that we add to Tensorflow op name to construct Mkl op name.
+static const char* const kMklOpPrefix = "_Mkl";
 
-  // Get the name of Mkl op from original TensorFlow op
-  // We prefix 'Mkl' to the original op to get Mkl op.
-  inline string GetMklOpName(const string& name) {
-    return string(kMklOpPrefix) + name;
-  }
+// Get the name of Mkl op from original TensorFlow op
+// We prefix 'Mkl' to the original op to get Mkl op.
+inline string GetMklOpName(const string& name) {
+  return string(kMklOpPrefix) + name;
+}
 
-  // Check whether opname with type T is registered as MKL-compliant.
-  //
-  // @input: name of the op
-  // @input: T datatype to be used for checking op
-  // @return: true if opname is registered as Mkl op; false otherwise
-  static inline bool IsMklOp(const std::string& op_name, DataType T) {
-    string kernel = KernelsRegisteredForOp(op_name);
-    bool result =
-        kernel.find(kMklOpLabelPattern) != string::npos && (T == DT_FLOAT);
-    return result;
-  }
+// Check whether opname with type T is registered as MKL-compliant.
+//
+// @input: name of the op
+// @input: T datatype to be used for checking op
+// @return: true if opname is registered as Mkl op; false otherwise
+static inline bool IsMklOp(const std::string& op_name, DataType T) {
+  string kernel = KernelsRegisteredForOp(op_name);
+  bool result =
+      kernel.find(kMklOpLabelPattern) != string::npos && (T == DT_FLOAT);
+  return result;
+}
 
-  // Check whether opname with type T is registered as MKL-compliant and
-  // is element-wise.
-  //
-  // @input: name of the op
-  // @input: T datatype to be used for checking op
-  // @return: true if opname is registered as element-wise Mkl op;
-  // false otherwise
-  static inline bool IsMklElementWiseOp(const std::string& op_name,
-    DataType T) {
-    if (!IsMklOp(op_name, T)) {
-      return false;
-    }
-    bool result = (0 == op_name.compare(GetMklOpName("Add")) ||
-                    0 == op_name.compare(GetMklOpName("Sub")) ||
-                    0 == op_name.compare(GetMklOpName("Mul")) ||
-                    0 == op_name.compare(GetMklOpName("Maximum")) ||
-                    0 == op_name.compare(GetMklOpName("SquaredDifference")));
-
-    return result;
+// Check whether opname with type T is registered as MKL-compliant and
+// is element-wise.
+//
+// @input: name of the op
+// @input: T datatype to be used for checking op
+// @return: true if opname is registered as element-wise Mkl op;
+// false otherwise
+static inline bool IsMklElementWiseOp(const std::string& op_name, DataType T) {
+  if (!IsMklOp(op_name, T)) {
+    return false;
   }
+  bool result = (0 == op_name.compare(GetMklOpName("Add")) ||
+                 0 == op_name.compare(GetMklOpName("Sub")) ||
+                 0 == op_name.compare(GetMklOpName("Mul")) ||
+                 0 == op_name.compare(GetMklOpName("Maximum")) ||
+                 0 == op_name.compare(GetMklOpName("SquaredDifference")));
+
+  return result;
+}
 }  // namespace mkl_op_registry
 }  // namespace tensorflow
 #endif  // INTEL_MKL
diff --git a/tensorflow/core/graph/mkl_layout_pass.cc b/tensorflow/core/graph/mkl_layout_pass.cc
index 55bc401b9d61d43e1908faf0ac7e24639ec04c44..0e8a1cb26ce76855f334e16c8fa46c677735a34f 100644
--- a/tensorflow/core/graph/mkl_layout_pass.cc
+++ b/tensorflow/core/graph/mkl_layout_pass.cc
@@ -37,12 +37,12 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/util/tensor_format.h"
 
-#include "tensorflow/core/graph/mkl_layout_pass.h"
 #include "tensorflow/core/graph/mkl_graph_util.h"
+#include "tensorflow/core/graph/mkl_layout_pass.h"
 
 namespace tensorflow {
 
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
 
 // This pass implements rewriting of graph to support following scenarios:
 // (A) Merging nodes in the graph
@@ -281,7 +281,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     csinfo_.mkl_conv2d_grad_filter = "_MklConv2DBackpropFilter";
     csinfo_.mkl_conv2d_with_bias = "_MklConv2DWithBias";
     csinfo_.mkl_conv2d_with_bias_backprop_bias =
-                                   "_MklConv2DWithBiasBackpropBias";
+        "_MklConv2DWithBiasBackpropBias";
     csinfo_.relu = "Relu";
     csinfo_.relu_grad = "ReluGrad";
     csinfo_.reshape = "Reshape";
@@ -297,10 +297,9 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     // End - element-wise ops. See note above.
 
     // NOTE: names are alphabetically sorted.
-    rinfo_.push_back({csinfo_.addn, mkl_op_registry::GetMklOpName(csinfo_.addn), CopyAttrsAddN,
-                      AddNRewrite, nullptr});
-    rinfo_.push_back({csinfo_.add,
-                      mkl_op_registry::GetMklOpName(csinfo_.add),
+    rinfo_.push_back({csinfo_.addn, mkl_op_registry::GetMklOpName(csinfo_.addn),
+                      CopyAttrsAddN, AddNRewrite, nullptr});
+    rinfo_.push_back({csinfo_.add, mkl_op_registry::GetMklOpName(csinfo_.add),
                       CopyAttrsDataType, AlwaysRewrite, nullptr});
     rinfo_.push_back({csinfo_.avg_pool,
                       mkl_op_registry::GetMklOpName(csinfo_.avg_pool),
@@ -337,14 +336,14 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     rinfo_.push_back({csinfo_.fused_batch_norm,
                       mkl_op_registry::GetMklOpName(csinfo_.fused_batch_norm),
                       CopyAttrsFusedBatchNorm, AlwaysRewrite, nullptr});
-    rinfo_.push_back({csinfo_.fused_batch_norm_grad,
-                      mkl_op_registry::GetMklOpName(csinfo_.fused_batch_norm_grad),
-                      CopyAttrsFusedBatchNorm, AlwaysRewrite, nullptr});
+    rinfo_.push_back(
+        {csinfo_.fused_batch_norm_grad,
+         mkl_op_registry::GetMklOpName(csinfo_.fused_batch_norm_grad),
+         CopyAttrsFusedBatchNorm, AlwaysRewrite, nullptr});
     rinfo_.push_back({csinfo_.identity,
                       mkl_op_registry::GetMklOpName(csinfo_.identity),
                       CopyAttrsIdentity, AlwaysRewrite, nullptr});
-    rinfo_.push_back({csinfo_.lrn,
-                      mkl_op_registry::GetMklOpName(csinfo_.lrn),
+    rinfo_.push_back({csinfo_.lrn, mkl_op_registry::GetMklOpName(csinfo_.lrn),
                       CopyAttrsLRN, AlwaysRewrite, nullptr});
     rinfo_.push_back({csinfo_.lrn_grad,
                       mkl_op_registry::GetMklOpName(csinfo_.lrn_grad),
@@ -358,11 +357,9 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     rinfo_.push_back({csinfo_.maximum,
                       mkl_op_registry::GetMklOpName(csinfo_.maximum),
                       CopyAttrsDataType, AlwaysRewrite, nullptr});
-    rinfo_.push_back({csinfo_.mul,
-                      mkl_op_registry::GetMklOpName(csinfo_.mul),
+    rinfo_.push_back({csinfo_.mul, mkl_op_registry::GetMklOpName(csinfo_.mul),
                       CopyAttrsDataType, AlwaysRewrite, nullptr});
-    rinfo_.push_back({csinfo_.relu,
-                      mkl_op_registry::GetMklOpName(csinfo_.relu),
+    rinfo_.push_back({csinfo_.relu, mkl_op_registry::GetMklOpName(csinfo_.relu),
                       CopyAttrsDataType, AlwaysRewrite, nullptr});
     rinfo_.push_back({csinfo_.relu_grad,
                       mkl_op_registry::GetMklOpName(csinfo_.relu_grad),
@@ -373,8 +370,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     rinfo_.push_back({csinfo_.squared_difference,
                       mkl_op_registry::GetMklOpName(csinfo_.squared_difference),
                       CopyAttrsDataType, AlwaysRewrite, nullptr});
-    rinfo_.push_back({csinfo_.sub,
-                      mkl_op_registry::GetMklOpName(csinfo_.sub),
+    rinfo_.push_back({csinfo_.sub, mkl_op_registry::GetMklOpName(csinfo_.sub),
                       CopyAttrsDataType, AlwaysRewrite, nullptr});
 
     // Add info about which ops to add workspace edge to and the slots.
@@ -388,9 +384,9 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     biasaddgrad_matmul_context_ = {csinfo_.bias_add_grad, csinfo_.matmul,
                                    IsBiasAddGradInMatMulContext};
 
-    biasaddgrad_conv2dwithbias_context_ = {csinfo_.bias_add_grad,
-                                   csinfo_.mkl_conv2d_with_bias,
-                                   IsBiasAddGradInConv2DWithBiasContext};
+    biasaddgrad_conv2dwithbias_context_ = {
+        csinfo_.bias_add_grad, csinfo_.mkl_conv2d_with_bias,
+        IsBiasAddGradInConv2DWithBiasContext};
 
     cinfo_.push_back(&biasaddgrad_matmul_context_);
     cinfo_.push_back(&biasaddgrad_conv2dwithbias_context_);
@@ -410,9 +406,9 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
 
   /// Structure to specify the context information used in a node rewrite rule
   typedef struct {
-    string node;     // Name of the node to be rewritten
-    string fwd;      // Name of the node in the forward pass that this node
-                     // corresponds to
+    string node;  // Name of the node to be rewritten
+    string fwd;   // Name of the node in the forward pass that this node
+                  // corresponds to
     std::function<bool(const Node*, const Node**, void* c)> context_match_fn;
   } ContextInfo;
 
@@ -615,14 +611,13 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     std::vector<int32> ksize, strides;
     CHECK_EQ(GetNodeAttr(n->def(), "ksize", &ksize).ok(), true);
     CHECK_EQ(GetNodeAttr(n->def(), "strides", &strides).ok(), true);
-    CHECK_EQ(GetNodeAttr(n->def(), "data_format", &data_format_str).ok(),
-             true);
+    CHECK_EQ(GetNodeAttr(n->def(), "data_format", &data_format_str).ok(), true);
     CHECK_EQ(FormatFromString(data_format_str, &data_format), true);
 
     // Condition that specifies non-batch-wise and non-depth-wise pooling.
-    if (GetTensorDim(ksize,   data_format, 'N') == 1 &&
+    if (GetTensorDim(ksize, data_format, 'N') == 1 &&
         GetTensorDim(strides, data_format, 'N') == 1 &&
-        GetTensorDim(ksize,   data_format, 'C') == 1 &&
+        GetTensorDim(ksize, data_format, 'C') == 1 &&
         GetTensorDim(strides, data_format, 'C') == 1) {
       return true;
     }
@@ -785,8 +780,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
       for (const Edge* fe : first_inp_of_filter->out_edges()) {
         if (fe->dst()->type_string() == csinfo_.mkl_conv2d_with_bias &&
             fe->dst_input() == 0) {
-          VLOG(1) << "MklLayoutRewritePass: found "
-                  << fe->dst()->DebugString()
+          VLOG(1) << "MklLayoutRewritePass: found " << fe->dst()->DebugString()
                   << " as the forward node for matching context, backward"
                   << " node is: " << n->DebugString();
           *fwd_node = fe->dst();
@@ -803,13 +797,11 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
   //
   // @return - true (if BiasAddGrad is associated with MatMul);
   //           false otherwise.
-  static bool IsBiasAddGradInMatMulContext(const Node* n,
-                                           const Node** fwd_node,
+  static bool IsBiasAddGradInMatMulContext(const Node* n, const Node** fwd_node,
                                            void* ci) {
     return (!IsBiasAddGradInConv2DWithBiasContext(n, fwd_node, ci));
   }
 
-
   // Rewrite rule that uses context-information for matching,
   // used in scenario 2.
   //
@@ -880,10 +872,11 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
   // @output output_nodes - the list of new nodes creating Mkl tensors
   //
   // @return None
-  void GetNodesProducingMklTensorList(std::unique_ptr<Graph>* g,
-    Node* orig_node, const gtl::InlinedVector<std::pair<Node*, int>, 4>& inputs,
-    int* input_idx, int list_length,
-    std::vector<NodeBuilder::NodeOut>* output_nodes);
+  void GetNodesProducingMklTensorList(
+      std::unique_ptr<Graph>* g, Node* orig_node,
+      const gtl::InlinedVector<std::pair<Node*, int>, 4>& inputs,
+      int* input_idx, int list_length,
+      std::vector<NodeBuilder::NodeOut>* output_nodes);
 
   // Get a node that will feed an Mkl tensor to the new
   // node that we are constructing. The output node could be (1) 'n'
@@ -900,7 +893,8 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
   //                                will feed the tensor
   // @return None
   void GetNodeProducingMklTensor(std::unique_ptr<Graph>* g, Node* orig_node,
-    Node* n, int n_output_slot, Node** mkl_node, int* mkl_node_output_slot);
+                                 Node* n, int n_output_slot, Node** mkl_node,
+                                 int* mkl_node_output_slot);
 
   // Setup new inputs using old inputs 'inputs' for the rewritten node in 'nb'
   // in graph 'g'. Original node is input in 'old_node'. Inputs to 'nb' are
@@ -970,9 +964,9 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
 
 MklLayoutRewritePass::ConstStringsInfo MklLayoutRewritePass::csinfo_;
 MklLayoutRewritePass::ContextInfo
-  MklLayoutRewritePass::biasaddgrad_conv2dwithbias_context_;
+    MklLayoutRewritePass::biasaddgrad_conv2dwithbias_context_;
 MklLayoutRewritePass::ContextInfo
-  MklLayoutRewritePass::biasaddgrad_matmul_context_;
+    MklLayoutRewritePass::biasaddgrad_matmul_context_;
 std::vector<MklLayoutRewritePass::ContextInfo*> MklLayoutRewritePass::cinfo_;
 
 // We register Mkl rewrite pass for phase 1 in post partitioning group.
@@ -1041,13 +1035,13 @@ void MklLayoutRewritePass::GetDummyMklTensorNode(std::unique_ptr<Graph>* g,
   TensorShape dummy_shape({8});
   dummy_shape.AsProto(proto.mutable_tensor_shape());
   TF_CHECK_OK(NodeBuilder((*g)->NewName("DMT"), "Const")
-               .Attr("value", proto)
-               .Attr("dtype", dt)
-               .Device(orig_node->def().device())  // We place this node on
-                                                   // the same device as the
-                                                   // device of the original
-                                                   // node.
-               .Finalize(&**g, out));
+                  .Attr("value", proto)
+                  .Attr("dtype", dt)
+                  .Device(orig_node->def().device())  // We place this node on
+                                                      // the same device as the
+                                                      // device of the original
+                                                      // node.
+                  .Finalize(&**g, out));
 
   // If number of inputs to the original node is > 0, then we add
   // control dependency between 1st input (index 0) of the original node and
@@ -1060,8 +1054,8 @@ void MklLayoutRewritePass::GetDummyMklTensorNode(std::unique_ptr<Graph>* g,
   // the same frame.
   if (orig_node->num_inputs() > 0) {
     Node* orig_input0 = nullptr;
-    TF_CHECK_OK(orig_node->input_node(0,
-                                      const_cast<const Node**>(&orig_input0)));
+    TF_CHECK_OK(
+        orig_node->input_node(0, const_cast<const Node**>(&orig_input0)));
     CHECK_NOTNULL((*g)->AddControlEdge(orig_input0, *out));
   }
 
@@ -1069,11 +1063,9 @@ void MklLayoutRewritePass::GetDummyMklTensorNode(std::unique_ptr<Graph>* g,
 }
 
 void MklLayoutRewritePass::GetNodesProducingMklTensorList(
-    std::unique_ptr<Graph>* g,
-    Node* orig_node,
-    const gtl::InlinedVector<std::pair<Node*, int>, 4>& inputs,
-    int* input_idx, int list_length,
-    std::vector<NodeBuilder::NodeOut>* output_nodes) {
+    std::unique_ptr<Graph>* g, Node* orig_node,
+    const gtl::InlinedVector<std::pair<Node*, int>, 4>& inputs, int* input_idx,
+    int list_length, std::vector<NodeBuilder::NodeOut>* output_nodes) {
   CHECK_LT(*input_idx, inputs.size());
   CHECK_GT(list_length, 0);
   CHECK_NOTNULL(output_nodes);
@@ -1090,8 +1082,8 @@ void MklLayoutRewritePass::GetNodesProducingMklTensorList(
     int mkl_node_output_slot = 0;
     GetNodeProducingMklTensor(g, orig_node, n, slot, &mkl_node,
                               &mkl_node_output_slot);
-    output_nodes->push_back(NodeBuilder::NodeOut(mkl_node,
-                                                mkl_node_output_slot));
+    output_nodes->push_back(
+        NodeBuilder::NodeOut(mkl_node, mkl_node_output_slot));
     (*input_idx)++;
     list_length--;
   }
@@ -1101,9 +1093,9 @@ void MklLayoutRewritePass::GetNodesProducingMklTensorList(
 // node that we are constructing. An input node could be (1) 'n'
 // if it is Mkl layer, or (2) a dummy node producing dummy Mkl tensor
 // if 'n' is not an Mkl layer.
-void MklLayoutRewritePass::GetNodeProducingMklTensor(std::unique_ptr<Graph>* g,
-    Node* orig_node, Node* n,
-    int n_output_slot, Node** mkl_node, int* mkl_node_output_slot) {
+void MklLayoutRewritePass::GetNodeProducingMklTensor(
+    std::unique_ptr<Graph>* g, Node* orig_node, Node* n, int n_output_slot,
+    Node** mkl_node, int* mkl_node_output_slot) {
   CHECK_NOTNULL(n);
   CHECK_NOTNULL(mkl_node);
   CHECK_NOTNULL(mkl_node_output_slot);
@@ -1234,8 +1226,8 @@ int MklLayoutRewritePass::SetUpContiguousInputs(
     if (ArgIsList(arg)) {
       std::vector<NodeBuilder::NodeOut> new_node_inputs;
       int N = GetTensorListLength(arg, old_node);
-      GetNodesProducingMklTensorList(g, old_node, old_node_inputs, &iidx,
-                                     N, &new_node_inputs);
+      GetNodesProducingMklTensorList(g, old_node, old_node_inputs, &iidx, N,
+                                     &new_node_inputs);
       nb->Input(new_node_inputs);
       nn_slot_idx++;
     } else {
@@ -1336,13 +1328,13 @@ void MklLayoutRewritePass::GetDummyWorkspaceTensorNode(
   TensorShape dummy_shape({1});
   dummy_shape.AsProto(proto.mutable_tensor_shape());
   TF_CHECK_OK(NodeBuilder((*g)->NewName("DMT"), "Const")
-                .Attr("value", proto)
-                .Attr("dtype", dt)
-                .Device(orig_node->def().device())  // We place this node on
-                                                    // same the device as the
-                                                    // device of the original
-                                                    // node.
-                .Finalize(&**g, out));
+                  .Attr("value", proto)
+                  .Attr("dtype", dt)
+                  .Device(orig_node->def().device())  // We place this node on
+                                                      // same the device as the
+                                                      // device of the original
+                                                      // node.
+                  .Finalize(&**g, out));
 
   // If number of inputs to the original node is > 0, then we add
   // control dependency between 1st input (index 0) of the original node and
@@ -1355,8 +1347,8 @@ void MklLayoutRewritePass::GetDummyWorkspaceTensorNode(
   // the same frame.
   if (orig_node->num_inputs() > 0) {
     Node* orig_input0 = nullptr;
-    TF_CHECK_OK(orig_node->input_node(0,
-                                      const_cast<const Node**>(&orig_input0)));
+    TF_CHECK_OK(
+        orig_node->input_node(0, const_cast<const Node**>(&orig_input0)));
     CHECK_NOTNULL((*g)->AddControlEdge(orig_input0, *out));
   }
 
@@ -1374,7 +1366,8 @@ void MklLayoutRewritePass::AddWorkSpaceEdgeIfNeeded(
   TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
   for (auto ws : wsinfo_) {
     if (orig_node->type_string() == ws.fwd_op &&
-        mkl_op_registry::IsMklOp(mkl_op_registry::GetMklOpName(orig_node->type_string()), T)) {
+        mkl_op_registry::IsMklOp(
+            mkl_op_registry::GetMklOpName(orig_node->type_string()), T)) {
       // If this op is a fwd op, then we need to check if there is an
       // edge from this node's fwd_slot to bwdop's bwd_slot. If there is
       // an edge, then we just add an attribute on this node for setting
@@ -1400,8 +1393,9 @@ void MklLayoutRewritePass::AddWorkSpaceEdgeIfNeeded(
         nb->Attr("workspace_enabled", false);
       }
     } else if (orig_node->type_string() == ws.bwd_op &&
-               mkl_op_registry::IsMklOp(mkl_op_registry::GetMklOpName(orig_node->type_string()),
-                                        T)) {
+               mkl_op_registry::IsMklOp(
+                   mkl_op_registry::GetMklOpName(orig_node->type_string()),
+                   T)) {
       // If this op is a bwd op, then we need to add workspace edge and
       // it's Mkl tensor edge between its corresponding fwd op and this
       // op. Corresponding fwd op is specified in 'fwd_op' field of
@@ -1416,7 +1410,8 @@ void MklLayoutRewritePass::AddWorkSpaceEdgeIfNeeded(
         if (e->src_output() == ws.fwd_slot &&
             // We would have rewritten the forward op, so we need to use
             // GetMklOpName call to get its Mkl name.
-            e->src()->type_string() == mkl_op_registry::GetMklOpName(ws.fwd_op) &&
+            e->src()->type_string() ==
+                mkl_op_registry::GetMklOpName(ws.fwd_op) &&
             e->dst_input() == ws.bwd_slot) {
           nb->Attr("workspace_enabled", true);
           CHECK_NOTNULL(ws_tensors);
@@ -1593,7 +1588,7 @@ void MklLayoutRewritePass::CopyAttrsDataType(const Node* orig_node,
 }
 
 void MklLayoutRewritePass::CopyAttrsReshape(const Node* orig_node,
-                                           NodeBuilder* nb) {
+                                            NodeBuilder* nb) {
   DataType T;
   DataType Tshape;
 
@@ -1869,8 +1864,8 @@ Status MklLayoutRewritePass::MergeNode(std::unique_ptr<Graph>* g, Node* succ,
       if (e->IsControlEdge()) {
         CHECK_NOTNULL((*g)->AddControlEdge(new_node, e->dst()));
       } else {
-        CHECK_NOTNULL((*g)->AddEdge(new_node, e->src_output(), e->dst(),
-                                  e->dst_input()));
+        CHECK_NOTNULL(
+            (*g)->AddEdge(new_node, e->src_output(), e->dst(), e->dst_input()));
       }
     }
 
@@ -1941,9 +1936,9 @@ Status MklLayoutRewritePass::RewriteNode(std::unique_ptr<Graph>* g,
       // and leave BiasAddGrad as it is. But we check for this condition
       // when we check for node rewrite rule. So we should not even come
       // here for MatMul. So we will fail now.
-        return Status(
-            error::Code::INVALID_ARGUMENT,
-            "No rewrite is required for BiasAddGrad for MatMul context.");
+      return Status(
+          error::Code::INVALID_ARGUMENT,
+          "No rewrite is required for BiasAddGrad for MatMul context.");
     }
   }
 
@@ -2012,9 +2007,10 @@ Status MklLayoutRewritePass::RewriteNode(std::unique_ptr<Graph>* g,
     if (e->IsControlEdge()) {
       CHECK_NOTNULL((*g)->AddControlEdge(new_node, e->dst()));
     } else {
-      CHECK_NOTNULL((*g)->AddEdge(new_node, GetTensorDataIndex(e->src_output(),
-                            e->src()->num_outputs()),
-                    e->dst(), e->dst_input()));
+      CHECK_NOTNULL((*g)->AddEdge(
+          new_node,
+          GetTensorDataIndex(e->src_output(), e->src()->num_outputs()),
+          e->dst(), e->dst_input()));
     }
   }
 
@@ -2070,7 +2066,8 @@ MklLayoutRewritePass::CheckForNodeRewrite(const Node* n) const {
 
   // BiasAddGrad is not an Mkl layer, so we make an exception for it.
   if (n->type_string() != csinfo_.bias_add_grad) {
-    if (!mkl_op_registry::IsMklOp(mkl_op_registry::GetMklOpName(n->type_string()), T)) {
+    if (!mkl_op_registry::IsMklOp(
+            mkl_op_registry::GetMklOpName(n->type_string()), T)) {
       return nullptr;
     }
   }
@@ -2186,8 +2183,7 @@ bool RunMklLayoutRewritePass(std::unique_ptr<Graph>* g) {
   return MklLayoutRewritePass().RunPass(g);
 }
 
-Status MklLayoutRewritePass::Run(
-  const GraphOptimizationPassOptions& options) {
+Status MklLayoutRewritePass::Run(const GraphOptimizationPassOptions& options) {
   if (options.graph == nullptr && options.partition_graphs == nullptr) {
     return Status::OK();
   }
@@ -2215,7 +2211,7 @@ Status MklLayoutRewritePass::Run(
   return Status::OK();
 }
 
-#else  // INTEL_MKL_DNN
+#else  // INTEL_MKL_ML
 
 // This pass implements rewriting of graph to support following scenarios:
 // (A) Merging nodes in the graph
@@ -2421,7 +2417,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     csinfo_.conv2d_grad_input = "Conv2DBackpropInput";
     csinfo_.conv2d_grad_filter = "Conv2DBackpropFilter";
     csinfo_.conv2d_grad_filter_with_bias =
-                              "__MklDummyConv2DBackpropFilterWithBias";
+        "__MklDummyConv2DBackpropFilterWithBias";
     csinfo_.fused_batch_norm = "FusedBatchNorm";
     csinfo_.fused_batch_norm_grad = "FusedBatchNormGrad";
     csinfo_.identity = "Identity";
@@ -2435,11 +2431,11 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     csinfo_.mkl_conv2d_grad_filter = "_MklConv2DBackpropFilter";
     csinfo_.mkl_conv2d_with_bias = "_MklConv2DWithBias";
     csinfo_.mkl_conv2d_grad_filter_with_bias =
-                                   "_MklConv2DBackpropFilterWithBias";
+        "_MklConv2DBackpropFilterWithBias";
     csinfo_.relu = "Relu";
     csinfo_.relu_grad = "ReluGrad";
-    csinfo_.tanh       = "Tanh";
-    csinfo_.tanh_grad  = "TanhGrad";
+    csinfo_.tanh = "Tanh";
+    csinfo_.tanh_grad = "TanhGrad";
     csinfo_.reshape = "Reshape";
     csinfo_.softmax = "Softmax";
     csinfo_.split = "Split";
@@ -2456,9 +2452,9 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     // NOTE: names are alphabetically sorted.
     rinfo_.push_back({csinfo_.addn, mkl_op_registry::GetMklOpName(csinfo_.addn),
                       CopyAttrsAddN, AddNRewrite});
-    /* rinfo_.push_back({csinfo_.add,
+    rinfo_.push_back({csinfo_.add,
                       mkl_op_registry::GetMklOpName(csinfo_.add),
-                      CopyAttrsDataType, AlwaysRewrite}); */
+                      CopyAttrsDataType, AlwaysRewrite});
     rinfo_.push_back({csinfo_.avg_pool,
                       mkl_op_registry::GetMklOpName(csinfo_.avg_pool),
                       CopyAttrsPooling, AlwaysRewrite});
@@ -2474,29 +2470,28 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     rinfo_.push_back({csinfo_.conv2d,
                       mkl_op_registry::GetMklOpName(csinfo_.conv2d),
                       CopyAttrsConv2D, AlwaysRewrite});
-    rinfo_.push_back({csinfo_.conv2d_with_bias,
-                      csinfo_.mkl_conv2d_with_bias,
+    rinfo_.push_back({csinfo_.conv2d_with_bias, csinfo_.mkl_conv2d_with_bias,
                       CopyAttrsConv2D, AlwaysRewrite});
     rinfo_.push_back({csinfo_.conv2d_grad_filter,
                       mkl_op_registry::GetMklOpName(csinfo_.conv2d_grad_filter),
                       CopyAttrsConv2D, AlwaysRewrite});
     rinfo_.push_back({csinfo_.conv2d_grad_filter_with_bias,
-                      csinfo_.mkl_conv2d_grad_filter_with_bias,
-                      CopyAttrsConv2D, AlwaysRewrite});
+                      csinfo_.mkl_conv2d_grad_filter_with_bias, CopyAttrsConv2D,
+                      AlwaysRewrite});
     rinfo_.push_back({csinfo_.conv2d_grad_input,
                       mkl_op_registry::GetMklOpName(csinfo_.conv2d_grad_input),
                       CopyAttrsConv2D, AlwaysRewrite});
     rinfo_.push_back({csinfo_.fused_batch_norm,
                       mkl_op_registry::GetMklOpName(csinfo_.fused_batch_norm),
                       CopyAttrsFusedBatchNorm, AlwaysRewrite});
-    rinfo_.push_back({csinfo_.fused_batch_norm_grad,
-                      mkl_op_registry::GetMklOpName(csinfo_.fused_batch_norm_grad),
-                      CopyAttrsFusedBatchNorm, AlwaysRewrite});
+    rinfo_.push_back(
+        {csinfo_.fused_batch_norm_grad,
+         mkl_op_registry::GetMklOpName(csinfo_.fused_batch_norm_grad),
+         CopyAttrsFusedBatchNorm, AlwaysRewrite});
     rinfo_.push_back({csinfo_.identity,
                       mkl_op_registry::GetMklOpName(csinfo_.identity),
                       CopyAttrsDataType, AlwaysRewrite});
-    rinfo_.push_back({csinfo_.lrn,
-                      mkl_op_registry::GetMklOpName(csinfo_.lrn),
+    rinfo_.push_back({csinfo_.lrn, mkl_op_registry::GetMklOpName(csinfo_.lrn),
                       CopyAttrsLRN, AlwaysRewrite});
     rinfo_.push_back({csinfo_.lrn_grad,
                       mkl_op_registry::GetMklOpName(csinfo_.lrn_grad),
@@ -2507,14 +2502,13 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     rinfo_.push_back({csinfo_.max_pool_grad,
                       mkl_op_registry::GetMklOpName(csinfo_.max_pool_grad),
                       CopyAttrsPooling, AlwaysRewrite});
-    /*
+
     rinfo_.push_back({csinfo_.maximum,
                       mkl_op_registry::GetMklOpName(csinfo_.maximum),
                       CopyAttrsDataType, AlwaysRewrite});
     rinfo_.push_back({csinfo_.mul,
                       mkl_op_registry::GetMklOpName(csinfo_.mul),
                       CopyAttrsDataType, AlwaysRewrite});
-    */
     rinfo_.push_back({csinfo_.relu,
                       mkl_op_registry::GetMklOpName(csinfo_.relu),
                       CopyAttrsDataType, AlwaysRewrite});
@@ -2535,14 +2529,14 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     rinfo_.push_back({csinfo_.softmax,
                       mkl_op_registry::GetMklOpName(csinfo_.softmax),
                       CopyAttrsDataType, AlwaysRewrite});
-    /*
+
     rinfo_.push_back({csinfo_.squared_difference,
                       mkl_op_registry::GetMklOpName(csinfo_.squared_difference),
                       CopyAttrsDataType, AlwaysRewrite});
     rinfo_.push_back({csinfo_.sub,
                       mkl_op_registry::GetMklOpName(csinfo_.sub),
                       CopyAttrsDataType, AlwaysRewrite});
-    */
+
 
     // Add info about which ops to add workspace edge to and the slots.
     wsinfo_.push_back({csinfo_.lrn, csinfo_.lrn_grad, 0, 2, 1, 3});
@@ -2550,8 +2544,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
 
     // Add a rule for merging nodes
     minfo_.push_back({csinfo_.conv2d, csinfo_.bias_add,
-                      csinfo_.conv2d_with_bias,
-                      GetConv2DOrBiasAdd});
+                      csinfo_.conv2d_with_bias, GetConv2DOrBiasAdd});
 
     minfo_.push_back({csinfo_.conv2d_grad_filter, csinfo_.bias_add_grad,
                       csinfo_.conv2d_grad_filter_with_bias,
@@ -2846,9 +2839,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
 
   // Default rewrite rule to be used in scenario 1 for rewrite.
   // @return - true (since we want to always rewrite)
-  static bool AlwaysRewrite(const Node* n) {
-    return true;
-  }
+  static bool AlwaysRewrite(const Node* n) { return true; }
 
   // Check if we are performing pooling on depth or batch. If it is, then we
   // do not rewrite MaxPool node to Mkl version.
@@ -2862,14 +2853,13 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     std::vector<int32> ksize, strides;
     CHECK_EQ(GetNodeAttr(n->def(), "ksize", &ksize).ok(), true);
     CHECK_EQ(GetNodeAttr(n->def(), "strides", &strides).ok(), true);
-    CHECK_EQ(GetNodeAttr(n->def(), "data_format", &data_format_str).ok(),
-             true);
+    CHECK_EQ(GetNodeAttr(n->def(), "data_format", &data_format_str).ok(), true);
     CHECK_EQ(FormatFromString(data_format_str, &data_format), true);
 
     // Condition that specifies non-batch-wise and non-depth-wise pooling.
-    if (GetTensorDim(ksize,   data_format, 'N') == 1 &&
+    if (GetTensorDim(ksize, data_format, 'N') == 1 &&
         GetTensorDim(strides, data_format, 'N') == 1 &&
-        GetTensorDim(ksize,   data_format, 'C') == 1 &&
+        GetTensorDim(ksize, data_format, 'C') == 1 &&
         GetTensorDim(strides, data_format, 'C') == 1) {
       return true;
     }
@@ -2941,10 +2931,11 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
   // @output output_nodes - the list of new nodes creating Mkl tensors
   //
   // @return None
-  void GetNodesProducingMklTensorList(std::unique_ptr<Graph>* g,
-    Node* orig_node, const gtl::InlinedVector<std::pair<Node*, int>, 4>& inputs,
-    int* input_idx, int list_length,
-    std::vector<NodeBuilder::NodeOut>* output_nodes);
+  void GetNodesProducingMklTensorList(
+      std::unique_ptr<Graph>* g, Node* orig_node,
+      const gtl::InlinedVector<std::pair<Node*, int>, 4>& inputs,
+      int* input_idx, int list_length,
+      std::vector<NodeBuilder::NodeOut>* output_nodes);
 
   // Get a node that will feed an Mkl tensor to the new
   // node that we are constructing. The output node could be (1) 'n'
@@ -2961,7 +2952,8 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
   //                                will feed the tensor
   // @return None
   void GetNodeProducingMklTensor(std::unique_ptr<Graph>* g, Node* orig_node,
-    Node* n, int n_output_slot, Node** mkl_node, int* mkl_node_output_slot);
+                                 Node* n, int n_output_slot, Node** mkl_node,
+                                 int* mkl_node_output_slot);
 
   // Setup new inputs using old inputs 'inputs' for the rewritten node in 'nb'
   // in graph 'g'. Original node is input in 'old_node'. Inputs to 'nb' are
@@ -3096,13 +3088,13 @@ void MklLayoutRewritePass::GetDummyMklTensorNode(std::unique_ptr<Graph>* g,
   TensorShape dummy_shape({8});
   dummy_shape.AsProto(proto.mutable_tensor_shape());
   TF_CHECK_OK(NodeBuilder((*g)->NewName("DMT"), "Const")
-               .Attr("value", proto)
-               .Attr("dtype", dt)
-               .Device(orig_node->def().device())  // We place this node on
-                                                   // the same device as the
-                                                   // device of the original
-                                                   // node.
-               .Finalize(&**g, out));
+                  .Attr("value", proto)
+                  .Attr("dtype", dt)
+                  .Device(orig_node->def().device())  // We place this node on
+                                                      // the same device as the
+                                                      // device of the original
+                                                      // node.
+                  .Finalize(&**g, out));
 
   // If number of inputs to the original node is > 0, then we add
   // control dependency between 1st input (index 0) of the original node and
@@ -3115,8 +3107,8 @@ void MklLayoutRewritePass::GetDummyMklTensorNode(std::unique_ptr<Graph>* g,
   // the same frame.
   if (orig_node->num_inputs() > 0) {
     Node* orig_input0 = nullptr;
-    TF_CHECK_OK(orig_node->input_node(0,
-                                      const_cast<const Node**>(&orig_input0)));
+    TF_CHECK_OK(
+        orig_node->input_node(0, const_cast<const Node**>(&orig_input0)));
     // Allow duplicate while adding control edge as it would fail (return
     // NULL) if we try to add duplicate edge.
     CHECK_NOTNULL((*g)->AddControlEdge(orig_input0, *out, true));
@@ -3126,11 +3118,9 @@ void MklLayoutRewritePass::GetDummyMklTensorNode(std::unique_ptr<Graph>* g,
 }
 
 void MklLayoutRewritePass::GetNodesProducingMklTensorList(
-    std::unique_ptr<Graph>* g,
-    Node* orig_node,
-    const gtl::InlinedVector<std::pair<Node*, int>, 4>& inputs,
-    int* input_idx, int list_length,
-    std::vector<NodeBuilder::NodeOut>* output_nodes) {
+    std::unique_ptr<Graph>* g, Node* orig_node,
+    const gtl::InlinedVector<std::pair<Node*, int>, 4>& inputs, int* input_idx,
+    int list_length, std::vector<NodeBuilder::NodeOut>* output_nodes) {
   CHECK_LT(*input_idx, inputs.size());
   CHECK_GT(list_length, 0);
   CHECK_NOTNULL(output_nodes);
@@ -3147,8 +3137,8 @@ void MklLayoutRewritePass::GetNodesProducingMklTensorList(
     int mkl_node_output_slot = 0;
     GetNodeProducingMklTensor(g, orig_node, n, slot, &mkl_node,
                               &mkl_node_output_slot);
-    output_nodes->push_back(NodeBuilder::NodeOut(mkl_node,
-                                                mkl_node_output_slot));
+    output_nodes->push_back(
+        NodeBuilder::NodeOut(mkl_node, mkl_node_output_slot));
     (*input_idx)++;
     list_length--;
   }
@@ -3158,9 +3148,9 @@ void MklLayoutRewritePass::GetNodesProducingMklTensorList(
 // node that we are constructing. An input node could be (1) 'n'
 // if it is Mkl layer, or (2) a dummy node producing dummy Mkl tensor
 // if 'n' is not an Mkl layer.
-void MklLayoutRewritePass::GetNodeProducingMklTensor(std::unique_ptr<Graph>* g,
-    Node* orig_node, Node* n,
-    int n_output_slot, Node** mkl_node, int* mkl_node_output_slot) {
+void MklLayoutRewritePass::GetNodeProducingMklTensor(
+    std::unique_ptr<Graph>* g, Node* orig_node, Node* n, int n_output_slot,
+    Node** mkl_node, int* mkl_node_output_slot) {
   CHECK_NOTNULL(n);
   CHECK_NOTNULL(mkl_node);
   CHECK_NOTNULL(mkl_node_output_slot);
@@ -3292,8 +3282,8 @@ int MklLayoutRewritePass::SetUpContiguousInputs(
     if (ArgIsList(arg)) {
       std::vector<NodeBuilder::NodeOut> new_node_inputs;
       int N = GetTensorListLength(arg, old_node);
-      GetNodesProducingMklTensorList(g, old_node, old_node_inputs, &iidx,
-                                     N, &new_node_inputs);
+      GetNodesProducingMklTensorList(g, old_node, old_node_inputs, &iidx, N,
+                                     &new_node_inputs);
       nb->Input(new_node_inputs);
       nn_slot_idx++;
     } else {
@@ -3394,13 +3384,13 @@ void MklLayoutRewritePass::GetDummyWorkspaceTensorNode(
   TensorShape dummy_shape({1});
   dummy_shape.AsProto(proto.mutable_tensor_shape());
   TF_CHECK_OK(NodeBuilder((*g)->NewName("DMT"), "Const")
-                .Attr("value", proto)
-                .Attr("dtype", dt)
-                .Device(orig_node->def().device())  // We place this node on
-                                                    // same the device as the
-                                                    // device of the original
-                                                    // node.
-                .Finalize(&**g, out));
+                  .Attr("value", proto)
+                  .Attr("dtype", dt)
+                  .Device(orig_node->def().device())  // We place this node on
+                                                      // same the device as the
+                                                      // device of the original
+                                                      // node.
+                  .Finalize(&**g, out));
 
   // If number of inputs to the original node is > 0, then we add
   // control dependency between 1st input (index 0) of the original node and
@@ -3413,8 +3403,8 @@ void MklLayoutRewritePass::GetDummyWorkspaceTensorNode(
   // the same frame.
   if (orig_node->num_inputs() > 0) {
     Node* orig_input0 = nullptr;
-    TF_CHECK_OK(orig_node->input_node(0,
-                                      const_cast<const Node**>(&orig_input0)));
+    TF_CHECK_OK(
+        orig_node->input_node(0, const_cast<const Node**>(&orig_input0)));
     // Allow duplicate while adding control edge as it would fail (return
     // NULL) if we try to add duplicate edge.
     CHECK_NOTNULL((*g)->AddControlEdge(orig_input0, *out, true));
@@ -3434,8 +3424,8 @@ void MklLayoutRewritePass::AddWorkSpaceEdgeIfNeeded(
   TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
   for (auto ws : wsinfo_) {
     if (orig_node->type_string() == ws.fwd_op &&
-        mkl_op_registry::IsMklOp(mkl_op_registry::GetMklOpName(
-          orig_node->type_string()), T)) {
+        mkl_op_registry::IsMklOp(
+            mkl_op_registry::GetMklOpName(orig_node->type_string()), T)) {
       // If this op is a fwd op, then we need to check if there is an
       // edge from this node's fwd_slot to bwdop's bwd_slot. If there is
       // an edge, then we just add an attribute on this node for setting
@@ -3461,8 +3451,9 @@ void MklLayoutRewritePass::AddWorkSpaceEdgeIfNeeded(
         nb->Attr("workspace_enabled", false);
       }
     } else if (orig_node->type_string() == ws.bwd_op &&
-               mkl_op_registry::IsMklOp(mkl_op_registry::GetMklOpName(
-                                          orig_node->type_string()), T)) {
+               mkl_op_registry::IsMklOp(
+                   mkl_op_registry::GetMklOpName(orig_node->type_string()),
+                   T)) {
       // If this op is a bwd op, then we need to add workspace edge and
       // it's Mkl tensor edge between its corresponding fwd op and this
       // op. Corresponding fwd op is specified in 'fwd_op' field of
@@ -3477,8 +3468,8 @@ void MklLayoutRewritePass::AddWorkSpaceEdgeIfNeeded(
         if (e->src_output() == ws.fwd_slot &&
             // We would have rewritten the forward op, so we need to use
             // GetMklOpName call to get its Mkl name.
-            e->src()->type_string() == mkl_op_registry::GetMklOpName(
-                                                          ws.fwd_op) &&
+            e->src()->type_string() ==
+                mkl_op_registry::GetMklOpName(ws.fwd_op) &&
             e->dst_input() == ws.bwd_slot) {
           nb->Attr("workspace_enabled", true);
           CHECK_NOTNULL(ws_tensors);
@@ -3645,7 +3636,7 @@ void MklLayoutRewritePass::CopyAttrsDataType(const Node* orig_node,
 }
 
 void MklLayoutRewritePass::CopyAttrsReshape(const Node* orig_node,
-                                           NodeBuilder* nb) {
+                                            NodeBuilder* nb) {
   DataType T;
   DataType Tshape;
 
@@ -3776,8 +3767,9 @@ Status MklLayoutRewritePass::MergeConv2DWithBiasAdd(std::unique_ptr<Graph>* g,
                                                     Node* m, Node* n) {
   CHECK_EQ(((m->type_string() == csinfo_.bias_add &&
              n->type_string() == csinfo_.conv2d)) ||
-           ((n->type_string() == csinfo_.bias_add &&
-             m->type_string() == csinfo_.conv2d)), true);
+               ((n->type_string() == csinfo_.bias_add &&
+                 m->type_string() == csinfo_.conv2d)),
+           true);
 
   // If 'm' is BiasAdd, then 'n' is Conv2D. Since Conv2D feeds BiasAdd,
   // BiasAdd is successor node, and Conv2D predecessor node.
@@ -3796,8 +3788,7 @@ Status MklLayoutRewritePass::MergeConv2DWithBiasAdd(std::unique_ptr<Graph>* g,
   TF_CHECK_OK(GetNodeAttr(pred->def(), "strides", &strides));
   TF_CHECK_OK(GetNodeAttr(pred->def(), "data_format", &data_format_pred));
   TF_CHECK_OK(GetNodeAttr(succ->def(), "data_format", &data_format_succ));
-  TF_CHECK_OK(
-      GetNodeAttr(pred->def(), "use_cudnn_on_gpu", &use_cudnn_on_gnu));
+  TF_CHECK_OK(GetNodeAttr(pred->def(), "use_cudnn_on_gpu", &use_cudnn_on_gnu));
   // We check to ensure that data formats of both succ and pred are same.
   // We expect them to be same, so we can enforce this as assert.
   // But assert can be too strict, so we enforce this as a check.
@@ -3900,8 +3891,8 @@ Status MklLayoutRewritePass::MergeConv2DWithBiasAdd(std::unique_ptr<Graph>* g,
       // BiasAdd has only 1 output (at slot 0) and merged node also has only 1
       // output (at slot 0).
       const int kConv2DWithBiasOutputSlot = 0;
-      CHECK_NOTNULL((*g)->AddEdge(new_node, kConv2DWithBiasOutputSlot,
-                                    e->dst(), e->dst_input()));
+      CHECK_NOTNULL((*g)->AddEdge(new_node, kConv2DWithBiasOutputSlot, e->dst(),
+                                  e->dst_input()));
     }
   }
 
@@ -3924,8 +3915,9 @@ Status MklLayoutRewritePass::MergeConv2DBackpropFilterWithBiasAddGrad(
     std::unique_ptr<Graph>* g, Node* m, Node* n) {
   CHECK_EQ(((m->type_string() == csinfo_.bias_add_grad &&
              n->type_string() == csinfo_.conv2d_grad_filter)) ||
-           ((n->type_string() == csinfo_.bias_add_grad &&
-             m->type_string() == csinfo_.conv2d_grad_filter)), true);
+               ((n->type_string() == csinfo_.bias_add_grad &&
+                 m->type_string() == csinfo_.conv2d_grad_filter)),
+           true);
 
   // If 'm' is BiasAddGrad, then 'n' is BackpropFilter.
   Node* badd = m->type_string() == csinfo_.bias_add_grad ? m : n;
@@ -4132,9 +4124,10 @@ Status MklLayoutRewritePass::RewriteNode(std::unique_ptr<Graph>* g,
       // NULL) if we try to add duplicate edge.
       CHECK_NOTNULL((*g)->AddControlEdge(new_node, e->dst(), true));
     } else {
-      CHECK_NOTNULL((*g)->AddEdge(new_node, GetTensorDataIndex(e->src_output(),
-                            e->src()->num_outputs()),
-                    e->dst(), e->dst_input()));
+      CHECK_NOTNULL((*g)->AddEdge(
+          new_node,
+          GetTensorDataIndex(e->src_output(), e->src()->num_outputs()),
+          e->dst(), e->dst_input()));
     }
   }
 
@@ -4166,9 +4159,9 @@ MklLayoutRewritePass::CheckForNodeRewrite(const Node* n) const {
   // names.
   if (n->type_string() != csinfo_.conv2d_with_bias &&
       n->type_string() != csinfo_.conv2d_grad_filter_with_bias &&
-      !mkl_op_registry::IsMklOp(mkl_op_registry::GetMklOpName(
-                                        n->type_string()), T)) {
-      return nullptr;
+      !mkl_op_registry::IsMklOp(mkl_op_registry::GetMklOpName(n->type_string()),
+                                T)) {
+    return nullptr;
   }
 
   // For elementwise node, we reuse the Eigen implementation and pass the MKL
@@ -4184,29 +4177,30 @@ MklLayoutRewritePass::CheckForNodeRewrite(const Node* n) const {
   // eigen code to reduce cross-library dependency.
   VLOG(1) << "ELEMENTWISE: checking op: " << n->type_string();
   if (mkl_op_registry::IsMklElementWiseOp(
-        mkl_op_registry::GetMklOpName(n->type_string()), T) ||
+          mkl_op_registry::GetMklOpName(n->type_string()), T) ||
       n->type_string().find("Identity") != string::npos) {
     VLOG(1) << "ELEMENTWISE: op is elementwise: " << n->type_string();
     bool incoming_mkl_edge = false;
     int num_parent = 0;
     for (auto parent : n->in_edges()) {
       if (mkl_op_registry::IsMklOp(parent->src()->type_string(), T)) {
-        VLOG(1) << "ELEMENTWISE: parent " << num_parent++ << " is MKL op: "
-                << parent->src()->type_string();
+        VLOG(1) << "ELEMENTWISE: parent " << num_parent++
+                << " is MKL op: " << parent->src()->type_string();
         incoming_mkl_edge = true;
         break;
       } else {
-        VLOG(1) << "ELEMENTWISE: parent " << num_parent++ << " is NON-MKL op: "
-                << parent->src()->type_string();
+        VLOG(1) << "ELEMENTWISE: parent " << num_parent++
+                << " is NON-MKL op: " << parent->src()->type_string();
       }
     }
     if (incoming_mkl_edge == false) {
-      VLOG(1) << "ELEMENTWISE: Skipping replacement of elementwise node which has no MKL "
+      VLOG(1) << "ELEMENTWISE: Skipping replacement of elementwise node which "
+                 "has no MKL "
                  "parents.";
       return nullptr;
     } else {
-      VLOG(1) << "ELEMENTWISE: Replacing elementwise node " << n->type_string() <<
-        " which has MKL parents";
+      VLOG(1) << "ELEMENTWISE: Replacing elementwise node " << n->type_string()
+              << " which has MKL parents";
     }
   }
 
@@ -4214,8 +4208,7 @@ MklLayoutRewritePass::CheckForNodeRewrite(const Node* n) const {
   // for this op, then we rewrite it to Mkl op.
   // Find matching RewriteInfo and then check that rewrite rule applies.
   for (auto ri = rinfo_.cbegin(); ri != rinfo_.cend(); ++ri) {
-    if (n->type_string().compare(ri->name) == 0 &&
-        ri->rewrite_rule(n)) {
+    if (n->type_string().compare(ri->name) == 0 && ri->rewrite_rule(n)) {
       return &*ri;
     }
   }
@@ -4297,8 +4290,7 @@ bool RunMklLayoutRewritePass(std::unique_ptr<Graph>* g) {
   return MklLayoutRewritePass().RunPass(g);
 }
 
-Status MklLayoutRewritePass::Run(
-  const GraphOptimizationPassOptions& options) {
+Status MklLayoutRewritePass::Run(const GraphOptimizationPassOptions& options) {
   if (options.graph == nullptr && options.partition_graphs == nullptr) {
     return Status::OK();
   }
@@ -4325,7 +4317,7 @@ Status MklLayoutRewritePass::Run(
 
   return Status::OK();
 }
-#endif  // INTEL_MKL_DNN
+#endif  // INTEL_MKL_ML
 }  // namespace tensorflow
 
 #endif
diff --git a/tensorflow/core/graph/mkl_layout_pass_test.cc b/tensorflow/core/graph/mkl_layout_pass_test.cc
index 75f7ca2d4d7ce7c86858a40fe34fed6aa707c9e5..5e2a465e22c7cbe45cbea40ea7a11491e2b2ad24 100644
--- a/tensorflow/core/graph/mkl_layout_pass_test.cc
+++ b/tensorflow/core/graph/mkl_layout_pass_test.cc
@@ -38,7 +38,7 @@ limitations under the License.
 
 namespace tensorflow {
 
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
 
 namespace {
 
@@ -125,8 +125,10 @@ REGISTER_OP("InputList").Output("o: N * float").Attr("N: int").SetIsStateful();
 REGISTER_OP("HalfInput").Output("o: half").SetIsStateful();
 REGISTER_OP("Int32Input").Output("o: int32").SetIsStateful();
 REGISTER_OP("_MklInput").Output("o: uint8").SetIsStateful();
-REGISTER_OP("_MklInput2").Output("o: uint8")
-                        .Output("o1: uint8").SetIsStateful();
+REGISTER_OP("_MklInput2")
+    .Output("o: uint8")
+    .Output("o1: uint8")
+    .SetIsStateful();
 
 /////////////////////////////////////////////////////////////////////
 //  Unit tests related to node merge optiimization
@@ -498,7 +500,6 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_Negative2) {
             "M->I:3;N->D:4;N->G:4;N->I:4;O->D:5;O->G:5;O->I:5");
 }
 
-
 // BiasAddGrad rewrite to BackpropBias in the presence of BackpropFilter only
 TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_BpropFilter_Positive) {
   InitGraph(
@@ -874,11 +875,12 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Concat_Basic) {
       " input: ['A', 'B:0', 'B:1']}"
       "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
       " input: ['C', 'D'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Const);B(InputList);C(Input);D(_MklConcat);DMT/_0(Const);"
-            "DMT/_1(Const);DMT/_2(Const);E(Zeta)|A->D;A:control->DMT/_0:control;"
-            "A:control->DMT/_1:control;A:control->DMT/_2:control;B->D:1;"
-            "B:1->D:2;C->E;D->E:1;DMT/_0->D:3;DMT/_1->D:4;DMT/_2->D:5");
+  EXPECT_EQ(
+      DoMklLayoutOptimizationPass(),
+      "A(Const);B(InputList);C(Input);D(_MklConcat);DMT/_0(Const);"
+      "DMT/_1(Const);DMT/_2(Const);E(Zeta)|A->D;A:control->DMT/_0:control;"
+      "A:control->DMT/_1:control;A:control->DMT/_2:control;B->D:1;"
+      "B:1->D:2;C->E;D->E:1;DMT/_0->D:3;DMT/_1->D:4;DMT/_2->D:5");
 }
 
 // Concat with 2 Mkl layers feeding it
@@ -1273,7 +1275,8 @@ TEST_F(MklLayoutPassTest, MaxPoolLRN_Positive) {
       "node { name: 'H' op: 'Input'}"
       "node { name: 'I' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
       " input: ['H', 'G'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+  EXPECT_EQ(
+      DoMklLayoutOptimizationPass(),
       "A(Input);B(_MklLRN);C(_MklMaxPool);D(Input);DMT/_0(Const);DMT/_1(Const);"
       "DMT/_2(Const);E(_MklMaxPoolGrad);F(Input);G(_MklLRNGrad);H(Input);"
       "I(Zeta)|A->B;A:control->DMT/_0:control;B->C;B->E;B->G:2;B:1->G:3;"
@@ -1640,7 +1643,8 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Conv2D_DeviceTest) {
       " attr { key: 'padding'          value { s: 'SAME' } }"
       " input: ['A', 'B']}"
       "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['B', 'C'] }", kGPUDevice);
+      " input: ['B', 'C'] }",
+      kGPUDevice);
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
             "A(Input);B(Input);C(Conv2D);D(Zeta)|A->C;B->C:1;B->D;C->D:1");
 }
@@ -1666,7 +1670,8 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_DeviceTest) {
       "node { name: 'F' op: 'BiasAddGrad'"
       " attr { key: 'T'                value { type: DT_FLOAT } }"
       " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " input: ['E'] }", kGPUDevice);
+      " input: ['E'] }",
+      kGPUDevice);
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
             "A(Input);B(Input);C(Input);D(_MklConv2DWithBias);"
             "E(Zeta);F(BiasAddGrad);M(_MklInput);N(_MklInput);"
@@ -1687,7 +1692,8 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Conv2DGradFilter_DeviceTest) {
       " attr { key: 'padding'          value { s: 'SAME' } }"
       " input: ['A', 'B', 'C']}"
       "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'D'] }", kGPUDevice);
+      " input: ['A', 'D'] }",
+      kGPUDevice);
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
             "A(Input);B(Int32Input);C(Input);D(Conv2DBackpropFilter);E(Zeta)|"
             "A->D;A->E;B->D:1;C->D:2;D->E:1");
@@ -1700,7 +1706,8 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Relu_DeviceTest) {
       " attr { key: 'T'                value { type: DT_FLOAT } }"
       " input: ['A'] }"
       "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'B'] }", kGPUDevice);
+      " input: ['A', 'B'] }",
+      kGPUDevice);
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
             "A(Input);B(Relu);C(Zeta)|A->B;A->C;B->C:1");
 }
@@ -1713,7 +1720,8 @@ TEST_F(MklLayoutPassTest, NodeRewrite_ReluGrad_DeviceTest) {
       " attr { key: 'T'                value { type: DT_FLOAT } }"
       " input: ['A', 'B'] }"
       "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'C'] }", kGPUDevice);
+      " input: ['A', 'C'] }",
+      kGPUDevice);
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
             "A(Input);B(Input);C(ReluGrad);D(Zeta)|A->C;A->D;B->C:1;C->D:1");
 }
@@ -1729,7 +1737,8 @@ TEST_F(MklLayoutPassTest, NodeRewrite_MaxPool_DeviceTest) {
       " attr { key: 'strides'      value { list: {i: 1, i:1, i:1, i:1} } }"
       " input: ['A'] }"
       "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'B'] }", kGPUDevice);
+      " input: ['A', 'B'] }",
+      kGPUDevice);
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
             "A(Input);B(MaxPool);C(Zeta)|A->B;A->C;B->C:1");
 }
@@ -1745,7 +1754,8 @@ TEST_F(MklLayoutPassTest, NodeRewrite_AvgPool_DeviceTest) {
       " attr { key: 'strides'      value { list: {i: 1, i:1, i:1, i:1} } }"
       " input: ['A'] }"
       "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'B'] }", kGPUDevice);
+      " input: ['A', 'B'] }",
+      kGPUDevice);
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
             "A(Input);B(AvgPool);C(Zeta)|A->B;A->C;B->C:1");
 }
@@ -1766,7 +1776,8 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Concat_DeviceTest) {
       " attr { key: 'N'                value { i: 2 } }"
       " input: ['A', 'B:0', 'B:1']}"
       "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['C', 'D'] }", kGPUDevice);
+      " input: ['C', 'D'] }",
+      kGPUDevice);
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
             "A(Const);B(InputList);C(Input);D(Concat);E(Zeta)|A->D;"
             "B->D:1;B:1->D:2;C->E;D->E:1");
@@ -1788,7 +1799,8 @@ TEST_F(MklLayoutPassTest, NodeRewrite_ConcatV2_DeviceTest) {
       " attr { key: 'N'                value { i: 2 } }"
       " input: ['B:0', 'B:1', 'A']}"
       "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['C', 'D'] }", kGPUDevice);
+      " input: ['C', 'D'] }",
+      kGPUDevice);
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
             "A(Const);B(InputList);C(Input);D(ConcatV2);E(Zeta)|"
             "A->D:2;B->D;B:1->D:1;C->E;D->E:1");
@@ -1808,7 +1820,8 @@ TEST_F(MklLayoutPassTest, NodeRewrite_FusedBatchNorm_DeviceTest) {
       " attr { key: 'is_training'  value { b: true } }"
       " input: ['A', 'B', 'C', 'D', 'E'] }"
       "node { name: 'G' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'F'] }", kGPUDevice);
+      " input: ['A', 'F'] }",
+      kGPUDevice);
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
             "A(Input);B(Input);C(Input);D(Input);E(Input);"
             "F(FusedBatchNorm);G(Zeta)|A->F;A->G;B->F:1;C->F:2;D->F:3;"
@@ -1837,7 +1850,8 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_DeviceTest) {
       "node { name: 'Y' op: 'Input'}"
       "node { name: 'Z' op: 'Zeta'"
       " attr {key: 'T'                 value { type: DT_FLOAT } }"
-      " input: ['E', 'Y']}", kGPUDevice);
+      " input: ['E', 'Y']}",
+      kGPUDevice);
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
             "A(Input);B(Input);C(_MklConv2D);D(Input);E(BiasAdd);"
             "M(_MklInput);N(_MklInput);Y(Input);Z(Zeta)|A->C;"
@@ -1885,7 +1899,7 @@ BENCHMARK(BM_MklLayoutRewritePass)->Arg(1000)->Arg(10000);
 
 }  // namespace
 
-#else  // INTEL_MKL_DNN
+#else  // INTEL_MKL_ML
 
 namespace {
 
@@ -1972,8 +1986,10 @@ REGISTER_OP("InputList").Output("o: N * float").Attr("N: int").SetIsStateful();
 REGISTER_OP("HalfInput").Output("o: half").SetIsStateful();
 REGISTER_OP("Int32Input").Output("o: int32").SetIsStateful();
 REGISTER_OP("_MklInput").Output("o: uint8").SetIsStateful();
-REGISTER_OP("_MklInput2").Output("o: uint8")
-                        .Output("o1: uint8").SetIsStateful();
+REGISTER_OP("_MklInput2")
+    .Output("o: uint8")
+    .Output("o1: uint8")
+    .SetIsStateful();
 
 /////////////////////////////////////////////////////////////////////
 //  Unit tests related to node merge optiimization
@@ -2492,11 +2508,12 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Concat_Basic) {
       " input: ['A', 'B:0', 'B:1']}"
       "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
       " input: ['C', 'D'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Const);B(InputList);C(Input);D(_MklConcat);DMT/_0(Const);"
-            "DMT/_1(Const);DMT/_2(Const);E(Zeta)|A->D;A:control->DMT/_0:control;"
-            "A:control->DMT/_1:control;A:control->DMT/_2:control;B->D:1;"
-            "B:1->D:2;C->E;D->E:1;DMT/_0->D:3;DMT/_1->D:4;DMT/_2->D:5");
+  EXPECT_EQ(
+      DoMklLayoutOptimizationPass(),
+      "A(Const);B(InputList);C(Input);D(_MklConcat);DMT/_0(Const);"
+      "DMT/_1(Const);DMT/_2(Const);E(Zeta)|A->D;A:control->DMT/_0:control;"
+      "A:control->DMT/_1:control;A:control->DMT/_2:control;B->D:1;"
+      "B:1->D:2;C->E;D->E:1;DMT/_0->D:3;DMT/_1->D:4;DMT/_2->D:5");
 }
 
 // Concat with 2 Mkl layers feeding it
@@ -2891,7 +2908,8 @@ TEST_F(MklLayoutPassTest, MaxPoolLRN_Positive) {
       "node { name: 'H' op: 'Input'}"
       "node { name: 'I' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
       " input: ['H', 'G'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+  EXPECT_EQ(
+      DoMklLayoutOptimizationPass(),
       "A(Input);B(_MklLRN);C(_MklMaxPool);D(Input);DMT/_0(Const);DMT/_1(Const);"
       "DMT/_2(Const);E(_MklMaxPoolGrad);F(Input);G(_MklLRNGrad);H(Input);"
       "I(Zeta)|A->B;A:control->DMT/_0:control;B->C;B->E;B->G:2;B:1->G:3;"
@@ -3258,7 +3276,8 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Conv2D_DeviceTest) {
       " attr { key: 'padding'          value { s: 'SAME' } }"
       " input: ['A', 'B']}"
       "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['B', 'C'] }", kGPUDevice);
+      " input: ['B', 'C'] }",
+      kGPUDevice);
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
             "A(Input);B(Input);C(Conv2D);D(Zeta)|A->C;B->C:1;B->D;C->D:1");
 }
@@ -3284,7 +3303,8 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_DeviceTest) {
       "node { name: 'F' op: 'BiasAddGrad'"
       " attr { key: 'T'                value { type: DT_FLOAT } }"
       " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " input: ['E'] }", kGPUDevice);
+      " input: ['E'] }",
+      kGPUDevice);
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
             "A(Input);B(Input);C(Input);D(_MklConv2DWithBias);"
             "E(Zeta);F(BiasAddGrad);M(_MklInput);N(_MklInput);"
@@ -3305,7 +3325,8 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Conv2DGradFilter_DeviceTest) {
       " attr { key: 'padding'          value { s: 'SAME' } }"
       " input: ['A', 'B', 'C']}"
       "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'D'] }", kGPUDevice);
+      " input: ['A', 'D'] }",
+      kGPUDevice);
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
             "A(Input);B(Int32Input);C(Input);D(Conv2DBackpropFilter);E(Zeta)|"
             "A->D;A->E;B->D:1;C->D:2;D->E:1");
@@ -3318,7 +3339,8 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Relu_DeviceTest) {
       " attr { key: 'T'                value { type: DT_FLOAT } }"
       " input: ['A'] }"
       "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'B'] }", kGPUDevice);
+      " input: ['A', 'B'] }",
+      kGPUDevice);
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
             "A(Input);B(Relu);C(Zeta)|A->B;A->C;B->C:1");
 }
@@ -3331,7 +3353,8 @@ TEST_F(MklLayoutPassTest, NodeRewrite_ReluGrad_DeviceTest) {
       " attr { key: 'T'                value { type: DT_FLOAT } }"
       " input: ['A', 'B'] }"
       "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'C'] }", kGPUDevice);
+      " input: ['A', 'C'] }",
+      kGPUDevice);
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
             "A(Input);B(Input);C(ReluGrad);D(Zeta)|A->C;A->D;B->C:1;C->D:1");
 }
@@ -3347,7 +3370,8 @@ TEST_F(MklLayoutPassTest, NodeRewrite_MaxPool_DeviceTest) {
       " attr { key: 'strides'      value { list: {i: 1, i:1, i:1, i:1} } }"
       " input: ['A'] }"
       "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'B'] }", kGPUDevice);
+      " input: ['A', 'B'] }",
+      kGPUDevice);
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
             "A(Input);B(MaxPool);C(Zeta)|A->B;A->C;B->C:1");
 }
@@ -3363,7 +3387,8 @@ TEST_F(MklLayoutPassTest, NodeRewrite_AvgPool_DeviceTest) {
       " attr { key: 'strides'      value { list: {i: 1, i:1, i:1, i:1} } }"
       " input: ['A'] }"
       "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'B'] }", kGPUDevice);
+      " input: ['A', 'B'] }",
+      kGPUDevice);
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
             "A(Input);B(AvgPool);C(Zeta)|A->B;A->C;B->C:1");
 }
@@ -3384,7 +3409,8 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Concat_DeviceTest) {
       " attr { key: 'N'                value { i: 2 } }"
       " input: ['A', 'B:0', 'B:1']}"
       "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['C', 'D'] }", kGPUDevice);
+      " input: ['C', 'D'] }",
+      kGPUDevice);
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
             "A(Const);B(InputList);C(Input);D(Concat);E(Zeta)|A->D;"
             "B->D:1;B:1->D:2;C->E;D->E:1");
@@ -3406,7 +3432,8 @@ TEST_F(MklLayoutPassTest, NodeRewrite_ConcatV2_DeviceTest) {
       " attr { key: 'N'                value { i: 2 } }"
       " input: ['B:0', 'B:1', 'A']}"
       "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['C', 'D'] }", kGPUDevice);
+      " input: ['C', 'D'] }",
+      kGPUDevice);
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
             "A(Const);B(InputList);C(Input);D(ConcatV2);E(Zeta)|"
             "A->D:2;B->D;B:1->D:1;C->E;D->E:1");
@@ -3426,7 +3453,8 @@ TEST_F(MklLayoutPassTest, NodeRewrite_FusedBatchNorm_DeviceTest) {
       " attr { key: 'is_training'  value { b: true } }"
       " input: ['A', 'B', 'C', 'D', 'E'] }"
       "node { name: 'G' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'F'] }", kGPUDevice);
+      " input: ['A', 'F'] }",
+      kGPUDevice);
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
             "A(Input);B(Input);C(Input);D(Input);E(Input);"
             "F(FusedBatchNorm);G(Zeta)|A->F;A->G;B->F:1;C->F:2;D->F:3;"
@@ -3455,7 +3483,8 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_DeviceTest) {
       "node { name: 'Y' op: 'Input'}"
       "node { name: 'Z' op: 'Zeta'"
       " attr {key: 'T'                 value { type: DT_FLOAT } }"
-      " input: ['E', 'Y']}", kGPUDevice);
+      " input: ['E', 'Y']}",
+      kGPUDevice);
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
             "A(Input);B(Input);C(_MklConv2D);D(Input);E(BiasAdd);"
             "M(_MklInput);N(_MklInput);Y(Input);Z(Zeta)|A->C;"
@@ -3503,7 +3532,7 @@ BENCHMARK(BM_MklLayoutRewritePass)->Arg(1000)->Arg(10000);
 
 }  // namespace
 
-#endif  // INTEL_MKL_DNN
+#endif  // INTEL_MKL_ML
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/graph/mkl_tfconversion_pass.cc b/tensorflow/core/graph/mkl_tfconversion_pass.cc
index 599bb88f015bfc035b7666747571a652a954139d..5343e6802d1e75f516925d44ab680b96f4e157da 100644
--- a/tensorflow/core/graph/mkl_tfconversion_pass.cc
+++ b/tensorflow/core/graph/mkl_tfconversion_pass.cc
@@ -33,8 +33,8 @@ limitations under the License.
 #include "tensorflow/core/lib/hash/hash.h"
 #include "tensorflow/core/platform/logging.h"
 
-#include "tensorflow/core/graph/mkl_tfconversion_pass.h"
 #include "tensorflow/core/graph/mkl_graph_util.h"
+#include "tensorflow/core/graph/mkl_tfconversion_pass.h"
 
 namespace tensorflow {
 
@@ -152,12 +152,12 @@ Status MklToTfConversionPass::InsertConversionNodeOnEdge(
   string data_format;
 
   TF_CHECK_OK(GetNodeAttr(src->def(), "T", &src_datatype));
-  bool dst_dtype_found = GetNodeAttr(dst->def(), "T", &dst_datatype) ==
-                          Status::OK();
+  bool dst_dtype_found =
+      GetNodeAttr(dst->def(), "T", &dst_datatype) == Status::OK();
   // We compare source and destination datatypes only when both are found.
   if (dst_dtype_found && (src_datatype != dst_datatype)) {
-    string err_msg = "T attribute of " + src->name() + " and " +
-                      dst->name() + " do not match. Will not insert" +
+    string err_msg = "T attribute of " + src->name() + " and " + dst->name() +
+                     " do not match. Will not insert" +
                      " MklToTf node in such case.";
     return Status(error::Code::INVALID_ARGUMENT, err_msg.c_str());
   }
@@ -325,12 +325,12 @@ bool MklToTfConversionPass::RunPass(std::unique_ptr<Graph>* g) {
     // may not be Mkl node.
     DataType src_datatype;
     DataType dst_datatype;
-    bool src_is_mkl_op = (GetNodeAttr(src->def(), "T", &src_datatype) ==
-                            Status::OK() &&
-                          IsMklSupportedOp(src->type_string(), src_datatype));
-    bool dst_is_mkl_op = (GetNodeAttr(dst->def(), "T", &dst_datatype) ==
-                            Status::OK() &&
-                          IsMklSupportedOp(dst->type_string(), dst_datatype));
+    bool src_is_mkl_op =
+        (GetNodeAttr(src->def(), "T", &src_datatype) == Status::OK() &&
+         IsMklSupportedOp(src->type_string(), src_datatype));
+    bool dst_is_mkl_op =
+        (GetNodeAttr(dst->def(), "T", &dst_datatype) == Status::OK() &&
+         IsMklSupportedOp(dst->type_string(), dst_datatype));
 
     // Check if src with is Mkl-compliant, while dst is not Mkl-compliant.
     if (src_is_mkl_op && !dst_is_mkl_op) {
diff --git a/tensorflow/core/graph/testlib.cc b/tensorflow/core/graph/testlib.cc
index 172471e34bc5ce344a4a8db2d404b77b7406c99f..0d88d1ff723b94783693559926c51c6726a2341b 100644
--- a/tensorflow/core/graph/testlib.cc
+++ b/tensorflow/core/graph/testlib.cc
@@ -40,7 +40,7 @@ REGISTER_KERNEL_BUILDER(
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER_KERNEL_BUILDER(
     Name("HostConst").Device(DEVICE_SYCL).HostMemory("output"), HostConstantOp);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 // Register the HostConst Op
 // Returns a constant tensor on the host.  Useful for writing C++ tests
@@ -273,6 +273,16 @@ Node* Reverse(Graph* g, Node* tensor, Node* axis) {
   return Binary(g, "ReverseV2", tensor, axis);
 }
 
+Node* Roll(Graph* g, Node* input, Node* shift, Node* axis) {
+  Node* ret;
+  TF_CHECK_OK(NodeBuilder(g->NewName("n"), "Roll", g->op_registry())
+                  .Input(input)
+                  .Input(shift)
+                  .Input(axis)
+                  .Finalize(g, &ret));
+  return ret;
+}
+
 Node* Error(Graph* g, Node* input, const string& errmsg) {
   Node* ret;
   TF_CHECK_OK(NodeBuilder(g->NewName("n"), "Error")
diff --git a/tensorflow/core/graph/testlib.h b/tensorflow/core/graph/testlib.h
index 06597778bb204c83dae7699e1ffe0e2b196ac160..eb9038d619ed273bbfd2596bce964fda005b4ec1 100644
--- a/tensorflow/core/graph/testlib.h
+++ b/tensorflow/core/graph/testlib.h
@@ -117,6 +117,10 @@ Node* RandomGamma(Graph* g, Node* shape, Node* alpha);
 // Output dtype determined by lam.
 Node* RandomPoisson(Graph* g, Node* shape, Node* lam);
 
+// Rolls tensor by an offset of <shift> along the corresponding
+// <axis> dimensions.
+Node* Roll(Graph* g, Node* input, Node* shift, Node* axis);
+
 // Generates random parameters from the truncated standard normal distribution
 // of the nput shape
 Node* TruncatedNormal(Graph* g, Node* input, DataType dtype);
diff --git a/tensorflow/core/grappler/clusters/cluster.cc b/tensorflow/core/grappler/clusters/cluster.cc
index 01a618ed7775eee64ce40e283394c09622353157..39bfca244ed2d40544dd2a17a019dadbe50f6d29 100644
--- a/tensorflow/core/grappler/clusters/cluster.cc
+++ b/tensorflow/core/grappler/clusters/cluster.cc
@@ -23,8 +23,7 @@ Cluster::Cluster(int timeout_s) : timeout_s_(timeout_s) {
   DisableDetailedStats(false);
 }
 
-Cluster::~Cluster() {
-}
+Cluster::~Cluster() {}
 
 void Cluster::AllowSoftPlacement(bool soft_placement_state) {
   options_.config.set_allow_soft_placement(soft_placement_state);
diff --git a/tensorflow/core/grappler/clusters/single_machine.cc b/tensorflow/core/grappler/clusters/single_machine.cc
index 2712c5b67910c2d10a13237673cc671222955fbb..862ce4ae8883f394fd299914e245a69f1962f564 100644
--- a/tensorflow/core/grappler/clusters/single_machine.cc
+++ b/tensorflow/core/grappler/clusters/single_machine.cc
@@ -36,10 +36,7 @@ namespace grappler {
 static std::atomic<bool> already_provisioned(false);
 
 SingleMachine::SingleMachine(int timeout_s, int num_cpu_cores, int num_gpus)
-    : Cluster(timeout_s),
-      num_gpus_(num_gpus),
-      expected_init_time_s_(0),
-      closing_(false) {
+    : Cluster(timeout_s), expected_init_time_s_(0), closing_(false) {
   VLOG(1) << "Number of CPU cores: " << num_cpu_cores
           << " Number of GPUs: " << num_gpus;
   thread_pool_.reset(new thread::ThreadPool(
diff --git a/tensorflow/core/grappler/clusters/single_machine.h b/tensorflow/core/grappler/clusters/single_machine.h
index a254f72f0c7719e49d4f52e8cc42181a09071801..90d6a04cab650178db0dc14ac94564690b0d7bbb 100644
--- a/tensorflow/core/grappler/clusters/single_machine.h
+++ b/tensorflow/core/grappler/clusters/single_machine.h
@@ -64,7 +64,6 @@ class SingleMachine : public Cluster {
 
   Status ClearAllocatorStats() const;
 
-  const int num_gpus_;
   std::unique_ptr<Session> session_;
   std::vector<QueueRunnerDef> queue_runner_defs_;
   string last_graph_id_;
diff --git a/tensorflow/core/grappler/costs/analytical_cost_estimator_test.cc b/tensorflow/core/grappler/costs/analytical_cost_estimator_test.cc
index 1c2c1713834a11d0a7c85247e9a7e4cdf779c592..f24192247113bfe91884a9c557f46cc29986ff9a 100644
--- a/tensorflow/core/grappler/costs/analytical_cost_estimator_test.cc
+++ b/tensorflow/core/grappler/costs/analytical_cost_estimator_test.cc
@@ -102,7 +102,7 @@ TEST_F(AnalyticalCostEstimatorTest, SimpleTest) {
   Costs summary;
   TF_ASSERT_OK(estimator.PredictCosts(item.graph, &cost_graph, &summary));
 
-  EXPECT_EQ(Costs::NanoSeconds(9150), summary.execution_time);
+  EXPECT_EQ(Costs::NanoSeconds(9151), summary.execution_time);
 
   // Make this estimate accurate:
   // TODO(http://b/70031255): Accurate estimator for RandomUniform op needed
diff --git a/tensorflow/core/grappler/costs/cost_estimator.h b/tensorflow/core/grappler/costs/cost_estimator.h
index 852e69737baa14e0d05de1fdcb6fc24a143f6a2d..9e01ec5ff5b48b9f979695b0a4b7b089245145c0 100644
--- a/tensorflow/core/grappler/costs/cost_estimator.h
+++ b/tensorflow/core/grappler/costs/cost_estimator.h
@@ -78,6 +78,9 @@ struct Costs {
     MilliSeconds asMilliSeconds() const {
       return std::chrono::duration_cast<std::chrono::milliseconds>(*this);
     }
+    static NanoSeconds infinity() {
+      return NanoSeconds(std::chrono::nanoseconds::max());
+    }
   };
 
   // We store all our times in nanoseconds. If needs be, we can always switch to
@@ -85,10 +88,7 @@ struct Costs {
   typedef NanoSeconds Duration;
 
   // Overall cost of running the graph; latency.
-  // Mean
   Duration execution_time;
-  Duration min_execution_time;
-  Duration max_execution_time;
 
   // Computation cost of running the graph.
   Duration compute_time;
@@ -100,6 +100,8 @@ struct Costs {
   // requirements of a graph. For example, it might assume that all activations
   // are live for all of a graph's execution.
   int64 max_memory;  // Maximum main memory requirement in bytes over all ops.
+  int64 persistent_memory;
+  int64 temporary_memory;
 
   // These fields are used for TPU-related estimations. They are per-op
   // maximums, so each op is evaluated independently, but we want the maximum of
@@ -132,6 +134,8 @@ Costs::Costs() {
   compute_time = Duration::zero();
   memory_time = Duration::zero();
   max_memory = kMemoryUnknown;
+  persistent_memory = kMemoryUnknown;
+  temporary_memory = kMemoryUnknown;
   max_per_op_buffers = kMemoryUnknown;
   max_per_op_streaming = kMemoryUnknown;
 }
@@ -142,6 +146,8 @@ Costs Costs::ZeroCosts() {
   costs.compute_time = Duration::zero();
   costs.memory_time = Duration::zero();
   costs.max_memory = kZeroMemory;
+  costs.persistent_memory = kZeroMemory;
+  costs.temporary_memory = kZeroMemory;
   costs.max_per_op_buffers = kZeroMemory;
   costs.max_per_op_streaming = kZeroMemory;
   return costs;
diff --git a/tensorflow/core/grappler/costs/measuring_cost_estimator.cc b/tensorflow/core/grappler/costs/measuring_cost_estimator.cc
index 8fd1801863ad9aadd6e9f1bbde4b90600189d77c..ea4320687af366ccdd82e46cf28adf4ee9c100c0 100644
--- a/tensorflow/core/grappler/costs/measuring_cost_estimator.cc
+++ b/tensorflow/core/grappler/costs/measuring_cost_estimator.cc
@@ -117,8 +117,6 @@ Status MeasuringCostEstimator::PredictCosts(const GraphDef& optimized_graph,
     LOG(ERROR) << "Failed to measure graph performance: "
                << status.error_message();
     costs->execution_time = Costs::Duration::max();
-    costs->max_execution_time = Costs::Duration::max();
-    costs->min_execution_time = 0;
     return status;
   }
 
@@ -126,8 +124,6 @@ Status MeasuringCostEstimator::PredictCosts(const GraphDef& optimized_graph,
   // to filter out outliers.
   RobustStats stats(times);
   costs->execution_time = Costs::Duration(stats.mean());
-  costs->max_execution_time = Costs::Duration(stats.hi());
-  costs->min_execution_time = Costs::Duration(stats.lo());
 
   return Status::OK();
 }
diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
index 6bc136a3f89c9a1dbfd4be15c143d4c893897494..cf317374cfa2bfe1d587e8e4d54a1234717abaa9 100644
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
@@ -47,6 +47,8 @@ constexpr char kSize[] = "Size";
 constexpr char kStopGradient[] = "StopGradient";
 constexpr char kPreventGradient[] = "PreventGradient";
 
+static const Costs::Duration kMinComputeTime(1);
+
 namespace {
 
 string GetDataFormat(const OpInfo& op_features) {
@@ -163,18 +165,20 @@ OpLevelCostEstimator::OpLevelCostEstimator() {
       {kSparseMatMul, wrap(&OpLevelCostEstimator::PredictMatMul)},
       {kBatchMatMul, wrap(&OpLevelCostEstimator::PredictBatchMatMul)},
 
-      {kPlaceholder, wrap(&OpLevelCostEstimator::PredictNoOp)},
-      {kIdentity, wrap(&OpLevelCostEstimator::PredictNoOp)},
-      {kRefIdentity, wrap(&OpLevelCostEstimator::PredictNoOp)},
-      {kStopGradient, wrap(&OpLevelCostEstimator::PredictNoOp)},
-      {kPreventGradient, wrap(&OpLevelCostEstimator::PredictNoOp)},
       {kNoOp, wrap(&OpLevelCostEstimator::PredictNoOp)},
-      {kReshape, wrap(&OpLevelCostEstimator::PredictNoOp)},
-      {kRecv, wrap(&OpLevelCostEstimator::PredictNoOp)},
-      {kSend, wrap(&OpLevelCostEstimator::PredictNoOp)},
-      {kConst, wrap(&OpLevelCostEstimator::PredictNoOp)},
-      {kVariable, wrap(&OpLevelCostEstimator::PredictNoOp)},
-      {kVariableV2, wrap(&OpLevelCostEstimator::PredictNoOp)},
+
+      {kPlaceholder, wrap(&OpLevelCostEstimator::PredictIdentity)},
+      {kIdentity, wrap(&OpLevelCostEstimator::PredictIdentity)},
+      {kRefIdentity, wrap(&OpLevelCostEstimator::PredictIdentity)},
+      {kStopGradient, wrap(&OpLevelCostEstimator::PredictIdentity)},
+      {kPreventGradient, wrap(&OpLevelCostEstimator::PredictIdentity)},
+      {kReshape, wrap(&OpLevelCostEstimator::PredictIdentity)},
+      {kRecv, wrap(&OpLevelCostEstimator::PredictIdentity)},
+      {kSend, wrap(&OpLevelCostEstimator::PredictIdentity)},
+
+      {kConst, wrap(&OpLevelCostEstimator::PredictVariable)},
+      {kVariable, wrap(&OpLevelCostEstimator::PredictVariable)},
+      {kVariableV2, wrap(&OpLevelCostEstimator::PredictVariable)},
 
       {kRank, wrap(&OpLevelCostEstimator::PredictMetadata)},
       {kShape, wrap(&OpLevelCostEstimator::PredictMetadata)},
@@ -429,6 +433,7 @@ Costs OpLevelCostEstimator::PredictOpCountBasedCost(
     costs.execution_time = compute_cost + memory_cost;
   }
   costs.inaccurate = found_unknown_shapes;
+  costs.max_memory = total_output_size;
   return costs;
 }
 
@@ -885,6 +890,30 @@ Costs OpLevelCostEstimator::PredictNoOp(const OpContext& op_context) const {
   return Costs::ZeroCosts();
 }
 
+Costs OpLevelCostEstimator::PredictIdentity(const OpContext& op_context) const {
+  const auto& op_features = op_context.op_info;
+  VLOG(1) << "Op:" << op_features.op() << " Execution Time 0 (ns)";
+  Costs result = Costs::ZeroCosts();
+  result.max_memory = CalculateOutputSize(op_features, &result.inaccurate);
+  // Assign the minimum amount of time we can represent to the identity op since
+  // it tends to be really cheap.
+  result.compute_time = kMinComputeTime;
+  result.execution_time = result.compute_time;
+  return result;
+}
+
+Costs OpLevelCostEstimator::PredictVariable(const OpContext& op_context) const {
+  const auto& op_features = op_context.op_info;
+  VLOG(1) << "Op:" << op_features.op() << " Execution Time 0 (ns)";
+  Costs result = Costs::ZeroCosts();
+  result.persistent_memory =
+      CalculateOutputSize(op_features, &result.inaccurate);
+
+  result.compute_time = kMinComputeTime;
+  result.execution_time = result.execution_time;
+  return result;
+}
+
 Costs OpLevelCostEstimator::PredictBatchMatMul(
     const OpContext& op_context) const {
   const auto& op_features = op_context.op_info;
@@ -898,13 +927,12 @@ Costs OpLevelCostEstimator::PredictBatchMatMul(
 
 Costs OpLevelCostEstimator::PredictMetadata(const OpContext& op_context) const {
   const auto& op_features = op_context.op_info;
-  Costs costs;
+  Costs costs = Costs::ZeroCosts();
   costs.max_memory = CalculateOutputSize(op_features, &costs.inaccurate);
   // Metadata operations are so cheap we assume they take the minimum amount of
   // time we can represent (1 ns).
-  costs.execution_time = 1;
-  costs.compute_time = 1;
-  costs.memory_time = 0;
+  costs.compute_time = kMinComputeTime;
+  costs.execution_time = costs.compute_time;
 
   return costs;
 }
diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator.h b/tensorflow/core/grappler/costs/op_level_cost_estimator.h
index 5f541ccf04dc74eb868d26365a50d2e3542ea7d9..a292e5e97fe52383648d74b08bb7a384b6278446 100644
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator.h
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator.h
@@ -132,6 +132,8 @@ class OpLevelCostEstimator {
   Costs PredictConv2DBackpropFilter(const OpContext& op_context) const;
   Costs PredictMatMul(const OpContext& op_context) const;
   Costs PredictNoOp(const OpContext& op_context) const;
+  Costs PredictIdentity(const OpContext& op_context) const;
+  Costs PredictVariable(const OpContext& op_context) const;
   Costs PredictBatchMatMul(const OpContext& op_context) const;
   Costs PredictMetadata(const OpContext& op_context) const;
 
diff --git a/tensorflow/core/grappler/costs/op_performance_data.proto b/tensorflow/core/grappler/costs/op_performance_data.proto
index 1d623b8db8e5cc3b4e7e6b32d83695ab4ed4c0ec..37f9ebd6a146c8c0089857c7a41ba863b4c2fb1f 100644
--- a/tensorflow/core/grappler/costs/op_performance_data.proto
+++ b/tensorflow/core/grappler/costs/op_performance_data.proto
@@ -58,11 +58,18 @@ message LogNormalDistribution {
   double sigma = 2;
 }
 
+message SessionInfo {
+  int64 intra_op_parallelism = 1;
+}
+
 // Performance data for tensorflow operations
 message OpPerformance {
   // The op
   OpInfo op = 1;
 
+  // Information about the session configs.
+  SessionInfo session_info = 12;
+
   // The node name (optional). Makes it easier to associate the performance data
   // with a specific graph node.
   string node = 5;
diff --git a/tensorflow/core/grappler/costs/virtual_scheduler.cc b/tensorflow/core/grappler/costs/virtual_scheduler.cc
index d7d07ee7a55665a2d809588f45fbfd166bd2f76a..020492a3e9e23a8360a5e8804bc51ba6c5de67d1 100644
--- a/tensorflow/core/grappler/costs/virtual_scheduler.cc
+++ b/tensorflow/core/grappler/costs/virtual_scheduler.cc
@@ -323,8 +323,13 @@ Status VirtualScheduler::Init() {
   }
 
   // Get the nodes that would run to output fetch_nodes.
+  bool ill_formed = false;
   std::vector<const NodeDef*> nodes =
-      ComputeTransitiveFanin(graph, fetch_nodes);
+      ComputeTransitiveFanin(graph, fetch_nodes, &ill_formed);
+  if (ill_formed) {
+    return errors::InvalidArgument(
+        "Ill formed graph or invalid set of fetch nodes specified");
+  }
 
   // TODO(dyoon): this is a bit inefficient as name_to_node is already built in
   // ComputeTransitiveFanin().
diff --git a/tensorflow/core/grappler/costs/virtual_scheduler.h b/tensorflow/core/grappler/costs/virtual_scheduler.h
index 8ccc51f5451bb2b5052fd04100ba7684b0956cea..5116c8183cb4c51dc833988cbeb75a4a184e4c40 100644
--- a/tensorflow/core/grappler/costs/virtual_scheduler.h
+++ b/tensorflow/core/grappler/costs/virtual_scheduler.h
@@ -139,8 +139,8 @@ class FIFOManager : public ReadyNodeManager {
  public:
   FIFOManager() : ReadyNodeManager() {}
   ~FIFOManager() override {}
-  virtual void Init(
-      const std::unordered_map<const NodeDef*, NodeState>* node_state) {}
+  void Init(const std::unordered_map<const NodeDef*, NodeState>* node_state)
+      override {}
   void AddNode(const NodeDef* node) override { nodes_.push_back(node); }
   const NodeDef* GetCurrNode() override {
     CHECK(!nodes_.empty()) << "GetCurrNode(), but there's no ready node";
@@ -325,7 +325,7 @@ class VirtualScheduler {
   // Boolean field for whether the cost is accurate.
   std::map<string, std::pair<int, bool>> op_costs_;
 
-  Costs graph_costs_;                // Graph cost.
+  Costs graph_costs_;                   // Graph cost.
   std::map<string, Costs> op_to_cost_;  // Per-op cost.
 
   // Auxilliary data structures for constructing NodeState and DeviceState.
diff --git a/tensorflow/core/grappler/graph_view.h b/tensorflow/core/grappler/graph_view.h
index f4e2de75a60182f3b2bbc366c076052bd0fae118..173ce9c09c2fd98d855a801131ed16a796d9caac 100644
--- a/tensorflow/core/grappler/graph_view.h
+++ b/tensorflow/core/grappler/graph_view.h
@@ -46,6 +46,7 @@ class GraphView {
   };
 
   explicit GraphView(GraphDef* graph);
+  GraphDef* GetGraph() const { return graph_; }
   NodeDef* GetNode(const string& node_name) const;
   // Get the specified input port. Note that the special '-1' port_id can be
   // used to access the controlling nodes (i.e. the nodes connected to node_name
diff --git a/tensorflow/core/grappler/grappler_item.cc b/tensorflow/core/grappler/grappler_item.cc
index 149f6fc7353b3c96e9d780c20697873c15bccaa8..2f8549cf395f6b78154f7a6faf3fea06ea6c56c4 100644
--- a/tensorflow/core/grappler/grappler_item.cc
+++ b/tensorflow/core/grappler/grappler_item.cc
@@ -134,6 +134,7 @@ std::vector<const NodeDef*> ComputeTransitiveFanin(
     const NodeDef* node = name_to_node[NodeName(root)];
     if (!node) {
       *ill_formed = true;
+      VLOG(2) << "ComputeTransitiveFanin: problem with root node: " << root;
       return {};
     }
     queue.push_back(node);
@@ -153,6 +154,7 @@ std::vector<const NodeDef*> ComputeTransitiveFanin(
     for (const string& input : node->input()) {
       const NodeDef* in = name_to_node[NodeName(input)];
       if (!in) {
+        VLOG(2) << "ComputeTransitiveFanin: problem with node: " << input;
         *ill_formed = true;
         return {};
       }
diff --git a/tensorflow/core/grappler/inputs/file_input_yielder.h b/tensorflow/core/grappler/inputs/file_input_yielder.h
index a17e1c9ff2a5e1521250e604192d21650732e795..b597319261011e2537848a34167f69cf1e3002f0 100644
--- a/tensorflow/core/grappler/inputs/file_input_yielder.h
+++ b/tensorflow/core/grappler/inputs/file_input_yielder.h
@@ -18,8 +18,8 @@ limitations under the License.
 // that may be stored in the checkpoint are not restored in order to speedup the
 // initialization.
 
-#ifndef LEARNING_BRAIN_EXPERIMENTAL_GRAPPLER_INPUTS_FILE_INPUT_YIELDER_H_
-#define LEARNING_BRAIN_EXPERIMENTAL_GRAPPLER_INPUTS_FILE_INPUT_YIELDER_H_
+#ifndef TENSORFLOW_GRAPPLER_INPUTS_FILE_INPUT_YIELDER_H_
+#define TENSORFLOW_GRAPPLER_INPUTS_FILE_INPUT_YIELDER_H_
 
 #include <stddef.h>
 #include <limits>
@@ -53,4 +53,4 @@ class FileInputYielder : public InputYielder {
 }  // end namespace grappler
 }  // end namespace tensorflow
 
-#endif  // LEARNING_BRAIN_EXPERIMENTAL_GRAPPLER_INPUTS_FILE_INPUT_YIELDER_H_
+#endif  // TENSORFLOW_GRAPPLER_INPUTS_FILE_INPUT_YIELDER_H_
diff --git a/tensorflow/core/grappler/optimizers/BUILD b/tensorflow/core/grappler/optimizers/BUILD
index 791ad34bbed6a4c7d270f3a06ac34ed0f08b9b1a..8b9885e4c161a609b8f5c1d2bd5c63a9fbad3281 100644
--- a/tensorflow/core/grappler/optimizers/BUILD
+++ b/tensorflow/core/grappler/optimizers/BUILD
@@ -285,9 +285,11 @@ cc_library(
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:op_types",
         "//tensorflow/core/grappler:utils",
+        "//tensorflow/core/grappler/clusters:virtual_cluster",
         "//tensorflow/core/grappler/costs:graph_memory",
         "//tensorflow/core/grappler/costs:graph_properties",
         "//tensorflow/core/grappler/utils:topological_sort",
+        "//tensorflow/core/grappler/utils:traversal",
     ],
 )
 
diff --git a/tensorflow/core/grappler/optimizers/auto_parallel.h b/tensorflow/core/grappler/optimizers/auto_parallel.h
index c5d2d47782f0d5515e65e1f99b212315dcc13c0e..8d1098d87755c1257dfebe016a3baf86bfece677 100644
--- a/tensorflow/core/grappler/optimizers/auto_parallel.h
+++ b/tensorflow/core/grappler/optimizers/auto_parallel.h
@@ -16,8 +16,8 @@ limitations under the License.
 #ifndef TENSORFLOW_GRAPPLER_OPTIMIZERS_AUTO_PARALLEL_H_
 #define TENSORFLOW_GRAPPLER_OPTIMIZERS_AUTO_PARALLEL_H_
 
-#include "tensorflow/core/grappler/optimizers/graph_optimizer.h"
 #include "tensorflow/core/framework/variable.pb.h"
+#include "tensorflow/core/grappler/optimizers/graph_optimizer.h"
 #include "tensorflow/core/lib/core/status.h"
 
 namespace tensorflow {
diff --git a/tensorflow/core/grappler/optimizers/dependency_optimizer.cc b/tensorflow/core/grappler/optimizers/dependency_optimizer.cc
index 1f68ecbade9147b652ac970aa1c5ec4b056209c7..0842fc92a822ccf1d7e052e434307c2a974daef9 100644
--- a/tensorflow/core/grappler/optimizers/dependency_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/dependency_optimizer.cc
@@ -36,20 +36,20 @@ namespace grappler {
 
 namespace {
 
-int RemoveInput(NodeDef* node, const string& input, NodeMap* node_map) {
-  int num_removed = 0;
+bool RemoveInput(NodeDef* node, const string& input, NodeMap* node_map) {
+  bool removed_input = false;
   int pos = 0;
   while (pos < node->input_size()) {
     if (node->input(pos) == input) {
       node->mutable_input()->SwapElements(pos, node->input_size() - 1);
       node->mutable_input()->RemoveLast();
       node_map->RemoveOutput(NodeName(input), node->name());
+      removed_input = true;
     } else {
       ++pos;
     }
-    ++num_removed;
   }
-  return num_removed;
+  return removed_input;
 }
 
 // Remove duplicate control inputs.
@@ -58,11 +58,7 @@ void PruneControlInputs(NodeDef* node) {
   int pos = 0;
   while (pos < node->input_size()) {
     const string& input = node->input(pos);
-    // TODO(rmlarsen): Remove control inputs that also appears as a regular
-    // inputs. Currently, doing so breaks testControlFlowStrictness in
-    // python/framework/function_test.
-    //    if (!inputs.insert(NodeName(input)).second && IsControlInput(input)) {
-    if (IsControlInput(input) && !inputs.insert(input).second) {
+    if (!inputs.insert(NodeName(input)).second && IsControlInput(input)) {
       VLOG(1) << "**** Removing duplicate control input: " << input
               << " from node " << node->DebugString();
       node->mutable_input()->SwapElements(pos, node->input_size() - 1);
@@ -75,6 +71,43 @@ void PruneControlInputs(NodeDef* node) {
 
 }  // namespace
 
+bool DependencyOptimizer::SafeToRemoveIdentity(const NodeDef& node) {
+  if (!IsIdentity(node)) {
+    return true;
+  }
+  if (nodes_to_preserve_.find(node.name()) != nodes_to_preserve_.end()) {
+    return false;
+  }
+  if (!fetch_nodes_known_) {
+    // The output values of this node may be needed.
+    return false;
+  }
+  const NodeDef* input = node_map_->GetNode(NodeName(node.input(0)));
+  CHECK(input != nullptr) << "node = " << node.name()
+                          << " input = " << node.input(0);
+  // Don't remove Identity nodes corresponding to Variable reads or following
+  // Recv.
+  if (IsVariable(*input) || IsRecv(*input)) {
+    return false;
+  } else if (IsSwitch(*input)) {
+    // Don't turn Identity nodes following Switch into NoOp or remove them
+    // if it requires anchoring a control dependencies the Switch node, which
+    // is not valid.
+    if (StringPiece(node.name()).starts_with(kConstantFoldingCtrl)) {
+      // TODO(rmlarsen): Try to remove this artificial contraint.
+      return false;
+    }
+    for (auto consumer : node_map_->GetOutputs(node.name())) {
+      for (const string& consumer_input : consumer->input()) {
+        if (consumer_input == AsControlDependency(node.name())) {
+          return false;
+        }
+      }
+    }
+  }
+  return true;
+}
+
 bool DependencyOptimizer::SafeToConvertToNoOp(const NodeDef& node) {
   if (nodes_to_preserve_.find(node.name()) != nodes_to_preserve_.end()) {
     return false;
@@ -104,18 +137,8 @@ bool DependencyOptimizer::SafeToConvertToNoOp(const NodeDef& node) {
     return false;
   }
 
-  // Don't turn Identity nodes inserted by Grappler after Switch into NoOp,
-  // since we cannot anchor control dependencies on Switch nodes.
-  // Don't remove Identity nodes corresponding to Variable reads.
-  if (IsIdentity(node)) {
-    const NodeDef* input = node_map_->GetNode(NodeName(node.input(0)));
-    if (input != nullptr) {
-      if (IsVariable(*input) ||
-          (StringPiece(node.name()).starts_with(kConstantFoldingCtrl) &&
-           IsSwitch(*input))) {
-        return false;
-      }
-    }
+  if (!SafeToRemoveIdentity(node)) {
+    return false;
   }
 
   const std::unordered_set<string> do_not_rewrite_ops{
@@ -128,19 +151,22 @@ bool DependencyOptimizer::SafeToConvertToNoOp(const NodeDef& node) {
 void DependencyOptimizer::OptimizeNode(int node_idx,
                                        SetVector<int>* nodes_to_simplify,
                                        std::set<int>* nodes_to_delete) {
+  const bool is_aggressive = opt_level_ == RewriterConfig::AGGRESSIVE;
   NodeDef* node = optimized_graph_->mutable_node(node_idx);
-
+  const bool is_noop = IsNoOp(*node);
+  const bool is_identity = IsIdentity(*node);
+  const string node_name = node->name();
   // Constant nodes with no input control dependency are always executed early,
   // so we can prune all their output control dependencies.
   if (IsConstant(*node) && node->input_size() == 0) {
-    const std::set<NodeDef*> output_nodes = node_map_->GetOutputs(node->name());
+    const std::set<NodeDef*> output_nodes = node_map_->GetOutputs(node_name);
     for (NodeDef* fanout : output_nodes) {
       bool optimize_fanout = false;
       bool data_connection = false;
       for (int i = fanout->input_size() - 1; i >= 0; --i) {
         int pos;
         string input_name = ParseNodeName(fanout->input(i), &pos);
-        if (input_name == node->name()) {
+        if (input_name == node_name) {
           if (pos < 0) {
             fanout->mutable_input()->SwapElements(i, fanout->input_size() - 1);
             fanout->mutable_input()->RemoveLast();
@@ -153,22 +179,21 @@ void DependencyOptimizer::OptimizeNode(int node_idx,
       if (optimize_fanout) {
         nodes_to_simplify->PushBack(node_to_idx_[fanout]);
         if (!data_connection) {
-          node_map_->RemoveOutput(node->name(), fanout->name());
+          node_map_->RemoveOutput(node_name, fanout->name());
         }
       }
     }
-    if (node_map_->GetOutputs(node->name()).empty() && fetch_nodes_known_ &&
-        nodes_to_preserve_.find(node->name()) == nodes_to_preserve_.end()) {
+    if (node_map_->GetOutputs(node_name).empty() && fetch_nodes_known_ &&
+        nodes_to_preserve_.find(node_name) == nodes_to_preserve_.end()) {
       // Mark the node for deletion.
       nodes_to_delete->insert(node_to_idx_[node]);
     }
-
     return;
   }
 
   // Change ops that only have control dependencies as outputs to NoOps.
-  if (node->op() != "NoOp" && SafeToConvertToNoOp(*node)) {
-    VLOG(1) << "***** Replacing  " << node->name() << " (" << node->op()
+  if (!is_noop && SafeToConvertToNoOp(*node)) {
+    VLOG(1) << "***** Replacing  " << node_name << " (" << node->op()
             << ") with NoOp.";
     // The outputs of this node are not consumed. Replace its inputs with
     // control dependencies and replace the op itself with the NoOp op.
@@ -190,7 +215,7 @@ void DependencyOptimizer::OptimizeNode(int node_idx,
           old_input, optimized_graph_, node_map_.get());
       if (ctrl_inputs.insert(ctrl_input).second) {
         node->set_input(pos, ctrl_input);
-        node_map_->UpdateInput(node->name(), old_input, ctrl_input);
+        node_map_->UpdateInput(node_name, old_input, ctrl_input);
         const NodeDef* old_input_node = node_map_->GetNode(old_input);
         nodes_to_simplify->PushBack(node_to_idx_[old_input_node]);
       }
@@ -198,6 +223,8 @@ void DependencyOptimizer::OptimizeNode(int node_idx,
     }
     node->set_op("NoOp");
     node->clear_attr();
+    nodes_to_simplify->PushBack(node_to_idx_[node]);
+    return;
   }
 
   // Remove NoOp nodes if the product of their fan-in and fan-out is less than
@@ -226,9 +253,30 @@ void DependencyOptimizer::OptimizeNode(int node_idx,
   // a and x, respectively, are on the same device. Control edges across device
   // boundaries require inter-device communication (Send/Recv pairs to be
   // inserted in the graph), which is very costly.
+  //
+  // We also remove identity nodes, subject to the same constraints on number of
+  // resulting control edges and device boundary crossings:
+  //
+  // Case a)
+  //          +----------+ ---> a       +---+ ---> a
+  //    x --> | Identity | --^> b  ==>  | x | --^> b
+  //          |          | ...          |   | ...
+  //          +----------+ --^> c       +---+ --^> c
+  //
+  // Case b)
+  //    x ---> +----------+ ---> a      x ---> +---+
+  //    y --^> | Identity |        ==>  y --^> | a |
+  //    ...    |          |               ...  |   |
+  //    z --^> +----------+             z --^> +---+
+  //
+  // Case c)
+  //           +----------+             x ---> +---+
+  //    x ---> | Identity | ---> a ==>   \--^> | a |
+  //    y --^> |          | --^> b       /\    +---+
+  //           +----------+             y --^> b
 
-  if (node->op() == "NoOp") {
-    const auto& output_node_set = node_map_->GetOutputs(node->name());
+  if (is_noop || (is_identity && is_aggressive)) {
+    const auto& output_node_set = node_map_->GetOutputs(node_name);
     const std::vector<NodeDef*> output_nodes(output_node_set.begin(),
                                              output_node_set.end());
     const int num_outputs = output_nodes.size();
@@ -237,15 +285,14 @@ void DependencyOptimizer::OptimizeNode(int node_idx,
     if (num_inputs * num_outputs > num_inputs + num_outputs) {
       return;
     }
-    VLOG(1) << "***** Rerouting input around " << node->name();
     std::vector<NodeDef*> input_nodes;
     for (int i = 0; i < num_inputs; ++i) {
-      NodeDef* tmp = node_map_->GetNode(node->input(i));
-      CHECK_NE(tmp, nullptr);
-      input_nodes.push_back(tmp);
+      NodeDef* input_node = node_map_->GetNode(node->input(i));
+      CHECK_NE(input_node, nullptr);
+      input_nodes.push_back(input_node);
     }
 
-    // Make sure that we don't increase the number of control edges that cross
+    // Make sure that we don't increase the number of edges that cross
     // device boundaries.
     if ((num_inputs == 1 && num_outputs > 1 &&
          input_nodes[0]->device() != node->device()) ||
@@ -270,40 +317,75 @@ void DependencyOptimizer::OptimizeNode(int node_idx,
       if (num_cross_after > num_cross_before) {
         return;
       }
+      // To avoid potentially removing Identity nodes following _Recv nodes,
+      // we require that no device crossings occur in that case.
+      // TODO(rmlarsen): See if we can relax this condition.
+      if (is_identity && (num_cross_after > 0 || num_cross_before > 0)) {
+        return;
+      }
+    }
+    if (is_identity && !SafeToRemoveIdentity(*node)) {
+      return;
     }
+
+    VLOG(1) << "***** Rerouting input around\n" << node->DebugString();
+    // Now remove the node and re-wire its inputs to its outputs.
     for (auto consumer : output_nodes) {
       bool updated_consumer = false;
-      VLOG(1) << "***** Considering consumer  " << consumer->name() << "\n"
-              << consumer->DebugString();
+      VLOG(1) << "consumer before:\n" << consumer->DebugString();
       for (int i = 0; i < num_inputs; ++i) {
         const NodeDef* input = input_nodes[i];
         // Forward dependency from input to consumer if it doesn't already
         // depend on it.
-        if (node_map_->GetOutputs(input->name()).count(consumer) == 0) {
-          consumer->add_input(AsControlDependency(input->name()));
+        if (is_identity && i == 0) {
+          // Replace regular input from Identity node.
+          bool found_input = false;
+          string new_input;
+          const string& input_to_forward = node->input(0);
+          CHECK(!IsControlInput(input_to_forward));
+          for (int j = 0; j < consumer->input_size(); ++j) {
+            const string& old_input = consumer->input(j);
+            if (old_input == node_name) {
+              new_input = input_to_forward;
+              node_map_->UpdateInput(consumer->name(), old_input, new_input);
+              consumer->set_input(j, new_input);
+              found_input = true;
+            } else if (old_input == AsControlDependency(NodeName(node_name))) {
+              new_input = AsControlDependency(NodeName(input_to_forward));
+              node_map_->UpdateInput(consumer->name(), old_input, new_input);
+              consumer->set_input(j, new_input);
+              found_input = true;
+            }
+          }
+          CHECK(found_input);
           updated_consumer = true;
-          node_map_->AddOutput(input->name(), consumer->name());
-          nodes_to_simplify->PushBack(node_to_idx_[input]);
+        } else {
+          // Forward dependency from input to consumer if it doesn't already
+          // depend on it.
+          if (node_map_->GetOutputs(input->name()).count(consumer) == 0) {
+            consumer->add_input(AsControlDependency(input->name()));
+            node_map_->AddOutput(input->name(), consumer->name());
+            nodes_to_simplify->PushBack(node_to_idx_[input]);
+            updated_consumer = true;
+          }
         }
       }
       // Remove dependency on node from consumer.
-      updated_consumer |= RemoveInput(
-          consumer, AsControlDependency(node->name()), node_map_.get());
+      updated_consumer |= RemoveInput(consumer, AsControlDependency(node_name),
+                                      node_map_.get());
       if (updated_consumer) {
-        VLOG(1) << "***** Updated consumer  " << consumer->name() << " ("
-                << consumer->op() << ")";
         nodes_to_simplify->PushBack(node_to_idx_[consumer]);
       }
+      VLOG(1) << "consumer after:\n" << consumer->DebugString();
     }
-
-    node_map_->RemoveOutputs(node->name());
+    node_map_->RemoveOutputs(node_name);
     if (fetch_nodes_known_ &&
-        nodes_to_preserve_.find(node->name()) == nodes_to_preserve_.end()) {
+        nodes_to_preserve_.find(node_name) == nodes_to_preserve_.end()) {
       // Mark the node for deletion.
       nodes_to_delete->insert(node_idx);
 
-      // Unconnect the node from its inputs to enable further optimizations.
-      node_map_->RemoveInputs(node->name());
+      // Disconnect the node from its inputs to enable further optimizations.
+      node_map_->RemoveInputs(node_name);
       node->clear_input();
     }
   }
@@ -334,13 +416,18 @@ Status DependencyOptimizer::OptimizeDependencies() {
   std::set<int> nodes_to_delete;
   for (int i = 0; i < optimized_graph_->node_size(); ++i) {
     const NodeDef& node = optimized_graph_->node(i);
-    if (node.op() == "NoOp" || IsConstant(node) || SafeToConvertToNoOp(node)) {
+    if (IsNoOp(node) || IsIdentity(node) || IsConstant(node) ||
+        SafeToConvertToNoOp(node)) {
       nodes_to_simplify.PushBack(i);
     }
   }
   while (!nodes_to_simplify.Empty()) {
-    OptimizeNode(nodes_to_simplify.PopBack(), &nodes_to_simplify,
-                 &nodes_to_delete);
+    int node_to_simplify = nodes_to_simplify.PopBack();
+    // Discard nodes that were marked for deletion already.
+    while (nodes_to_delete.find(node_to_simplify) != nodes_to_delete.end()) {
+      node_to_simplify = nodes_to_simplify.PopBack();
+    }
+    OptimizeNode(node_to_simplify, &nodes_to_simplify, &nodes_to_delete);
   }
 
   if (fetch_nodes_known_) {
@@ -435,9 +522,10 @@ Status DependencyOptimizer::TransitiveReduction() {
       if (longest_distance[target] > 1) {
         const int input_slot = control_output.second;
         control_edges_to_remove[target].emplace(input_slot, source);
-        VLOG(1) << "Removing edge from:\n"
-                << optimized_graph_->node(source).DebugString() << "\n\nto:\n\n"
-                << optimized_graph_->node(target).DebugString();
+        //        VLOG(1) << "Removing edge from:\n"
+        //                << optimized_graph_->node(source).DebugString() <<
+        //                "\n\nto:\n\n"
+        //                << optimized_graph_->node(target).DebugString();
       }
     }
   }
@@ -477,8 +565,8 @@ Status DependencyOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
   *optimized_graph_ = item.graph;
   nodes_to_preserve_ = item.NodesToPreserve();
   fetch_nodes_known_ = !item.fetch.empty();
-
   CleanControlInputs();
+
   const int num_iterations = 2;
   for (int iteration = 0; iteration < num_iterations; ++iteration) {
     Status topo_sort_status;
@@ -495,9 +583,12 @@ Status DependencyOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
     } else {
       LOG(ERROR) << topo_sort_status.error_message();
     }
-
-    // Turn nodes with only control outputs into NoOps, prune NoOps.
+    // Turn nodes with only control outputs into NoOps, prune NoOp and Identity
+    // nodes.
     TF_RETURN_IF_ERROR(OptimizeDependencies());
+
+    // Dedup control inputs.
+    CleanControlInputs();
   }
 
   return Status::OK();
diff --git a/tensorflow/core/grappler/optimizers/dependency_optimizer.h b/tensorflow/core/grappler/optimizers/dependency_optimizer.h
index 02d8a0f32a9bbe4e49c484ece601e219257908c0..0f47528a0435d3e90d92b07306d7b1a4a072ce27 100644
--- a/tensorflow/core/grappler/optimizers/dependency_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/dependency_optimizer.h
@@ -43,6 +43,9 @@ class DependencyOptimizer : public GraphOptimizer {
                 const GraphDef& optimized_graph, double result) override;
 
  private:
+  // Returns true if node is not an Identity node or if it is an Identity
+  // that is safe to remove.
+  bool SafeToRemoveIdentity(const NodeDef& node);
   // Returns true if it is safe to convert node to NoOp.
   bool SafeToConvertToNoOp(const NodeDef& node);
   // Removes all duplicate control dependencies.
diff --git a/tensorflow/core/grappler/optimizers/dependency_optimizer_test.cc b/tensorflow/core/grappler/optimizers/dependency_optimizer_test.cc
index f5027a4a99e4f28b4b49df914e9247a008036c20..b8facb9deae20fdf7d3208bb5d1e94c3f883576e 100644
--- a/tensorflow/core/grappler/optimizers/dependency_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/dependency_optimizer_test.cc
@@ -167,14 +167,16 @@ TEST_F(DependencyOptimizerTest, ChangeToNoop_SwitchIdentity) {
       ops::Const(scope.WithOpName("c2").WithControlDependencies(ctrl_dep_id),
                  {1.0f, 2.0f}, {1, 2});
   Output neg1 = ops::Neg(scope.WithOpName("neg1"), s.output_false);
+  Output neg2 = ops::Neg(scope.WithOpName("neg2"), ctrl_dep_id);
 
   GrapplerItem item;
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
   item.fetch.push_back("c1");
   item.fetch.push_back("c2");
   item.fetch.push_back("neg1");
+  item.fetch.push_back("neg2");
 
-  DependencyOptimizer optimizer;
+  DependencyOptimizer optimizer(RewriterConfig::AGGRESSIVE);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
@@ -323,25 +325,148 @@ TEST_F(DependencyOptimizerTest, RemoveNoOps_SingleInputOrOutput) {
   }
 }
 
+TEST_F(DependencyOptimizerTest, RemoveIdentity) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output x = ops::RandomUniform(s.WithOpName("x"), {1, 2}, DT_FLOAT);
+  Output y = ops::RandomUniform(s.WithOpName("y"), {1, 2}, DT_FLOAT);
+  Output z = ops::RandomUniform(s.WithOpName("z"), {1, 2}, DT_FLOAT);
+
+  // Identity nodes to be removed.
+  // Case a) with a single input- and multiple outputs.
+  auto id_a = ops::Identity(s.WithOpName("id_a"), x);
+  // Case b) with multiple inputs and a single output.
+  auto id_b = ops::Identity(
+      s.WithOpName("id_b").WithControlDependencies(y).WithControlDependencies(
+          z),
+      x);
+  // Case c) with two inputs and two outputs.
+  auto id_c = ops::Identity(s.WithOpName("id_c").WithControlDependencies(y), x);
+
+  // Output for Case a.
+  Output a_a = ops::Identity(s.WithOpName("a_a"), id_a);
+  Output a_b = ops::Identity(s.WithOpName("a_b"), id_a);
+  Output a_c =
+      ops::Identity(s.WithOpName("a_c").WithControlDependencies(id_a), z);
+  Output a_d =
+      ops::Identity(s.WithOpName("a_d").WithControlDependencies(id_a), z);
+  // Output for Case b.
+  Output b_a = ops::Identity(s.WithOpName("b_a"), id_b);
+  // Output for Case c.
+  Output c_a = ops::Identity(s.WithOpName("c_a"), id_c);
+  Output c_b =
+      ops::Identity(s.WithOpName("c_b").WithControlDependencies(id_c), z);
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  item.fetch = {"a_a", "a_b", "a_c", "a_d", "b_a", "c_a", "c_b"};
+
+  DependencyOptimizer optimizer(RewriterConfig::AGGRESSIVE);
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  EXPECT_EQ(item.graph.node_size() - 3, output.node_size());
+  for (const NodeDef& node : output.node()) {
+    EXPECT_NE("id_a", node.name());
+    EXPECT_NE("id_b", node.name());
+    EXPECT_NE("id_c", node.name());
+    if (node.name() == "a_a" || node.name() == "a_b") {
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("x", node.input(0));
+    }
+    if (node.name() == "a_c" || node.name() == "a_d") {
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("z", node.input(0));
+      EXPECT_EQ("^x", node.input(1));
+    }
+    if (node.name() == "b_a") {
+      EXPECT_EQ(3, node.input_size());
+      EXPECT_EQ("x", node.input(0));
+      EXPECT_EQ("^y", node.input(1));
+      EXPECT_EQ("^z", node.input(2));
+    }
+    if (node.name() == "c_a") {
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("x", node.input(0));
+      EXPECT_EQ("^y", node.input(1));
+    }
+    if (node.name() == "c_b") {
+      EXPECT_EQ(3, node.input_size());
+      EXPECT_EQ("z", node.input(0));
+      EXPECT_EQ("^x", node.input(1));
+      EXPECT_EQ("^y", node.input(2));
+    }
+  }
+}
+
+TEST_F(DependencyOptimizerTest, RemoveIdentity_RepeatedInputs) {
+  // Corner cases with repeated inputs.
+  tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
+  ops::Variable x(scope.WithOpName("x"), {}, DT_BOOL);
+  ops::Variable y(scope.WithOpName("y"), {}, DT_BOOL);
+  ops::Switch sw(scope.WithOpName("switch"), x, x);
+  // id0 should be removed.
+  Output id0 = ops::Identity(scope.WithOpName("id0"), sw.output_true);
+  // id1 should not be removed, since it would anchor a control dependency
+  // on the switch.
+  Output id1 = ops::Identity(scope.WithOpName("id1"), sw.output_false);
+  Output or0 = ops::LogicalOr(scope.WithOpName("or0"), id0, id0);
+  Output or1 = ops::LogicalOr(scope.WithOpName("or1"), id0, y);
+  Output or2 = ops::LogicalOr(
+      scope.WithOpName("or2").WithControlDependencies(id1), y, y);
+
+  GrapplerItem item;
+  TF_CHECK_OK(scope.ToGraphDef(&item.graph));
+  item.fetch.push_back("or0");
+  item.fetch.push_back("or1");
+  item.fetch.push_back("or2");
+  DependencyOptimizer optimizer(RewriterConfig::AGGRESSIVE);
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  EXPECT_EQ(item.graph.node_size() - 1, output.node_size());
+  for (const NodeDef& node : output.node()) {
+    EXPECT_NE("id0", node.name());
+    if (node.name() == "or0") {
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("switch:1", node.input(0));
+      EXPECT_EQ("switch:1", node.input(1));
+    }
+    if (node.name() == "or1") {
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("switch:1", node.input(0));
+      EXPECT_EQ("y", node.input(1));
+    }
+    if (node.name() == "or2") {
+      // or1 should be unchanged.
+      EXPECT_EQ(3, node.input_size());
+      EXPECT_EQ("y", node.input(0));
+      EXPECT_EQ("y", node.input(1));
+      EXPECT_EQ("^id1", node.input(2));
+    }
+  }
+}
+
 TEST_F(DependencyOptimizerTest, Transitive_Reduction_Simple) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   Output c = ops::Const(s.WithOpName("c"), {1.0f, 2.0f}, {1, 2});
   Output x = ops::Square(s.WithOpName("x"), c);
-  Output id1 = ops::Identity(s.WithOpName("id1"), x);
-  Output id2 =
-      ops::Identity(s.WithOpName("id2").WithControlDependencies({x}), id1);
+  Output neg1 = ops::Neg(s.WithOpName("neg1"), x);
+  Output neg2 =
+      ops::Neg(s.WithOpName("neg2").WithControlDependencies({x}), neg1);
 
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
-  item.fetch.push_back("id2");
-  DependencyOptimizer optimizer;
+  item.fetch.push_back("neg2");
+  DependencyOptimizer optimizer(RewriterConfig::AGGRESSIVE);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
   EXPECT_EQ(4, output.node_size());
-  EXPECT_EQ("id2", output.node(3).name());
+  EXPECT_EQ("neg2", output.node(3).name());
   EXPECT_EQ(1, output.node(3).input_size());
-  EXPECT_EQ("id1", output.node(3).input(0));
+  EXPECT_EQ("neg1", output.node(3).input(0));
 }
 
 TEST_F(DependencyOptimizerTest, ChangeToNoop_Identity) {
@@ -356,20 +481,21 @@ TEST_F(DependencyOptimizerTest, ChangeToNoop_Identity) {
   Output grappler_added_id = ops::Identity(
       scope.WithOpName("ConstantFoldingCtrl/switch_1"), s.output_true);
   Output c1 = ops::Const(scope.WithOpName("c1")
-                             .WithControlDependencies(id0)
                              .WithControlDependencies(id_after_var)
                              .WithControlDependencies(grappler_added_id),
                          {1.0f, 2.0f}, {1, 2});
   Output id1 = ops::Identity(scope.WithOpName("id1"), c1);
+  Output id2 = ops::Identity(scope.WithOpName("id2"), id0);
   Output fetch =
       ops::Identity(scope.WithOpName("fetch").WithControlDependencies(id1), c1);
 
   GrapplerItem item;
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
   item.fetch.push_back("c1");
+  item.fetch.push_back("id2");
   item.fetch.push_back("fetch");
 
-  DependencyOptimizer optimizer;
+  DependencyOptimizer optimizer(RewriterConfig::AGGRESSIVE);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
@@ -377,8 +503,8 @@ TEST_F(DependencyOptimizerTest, ChangeToNoop_Identity) {
   EXPECT_EQ(item.graph.node_size() - 2, output.node_size());
   for (int i = 0; i < output.node_size(); ++i) {
     const NodeDef& node = output.node(i);
-    // "id0" and "id1" but neither "ConstantFoldingCtrl/switch_1" nor
-    // "id_after_var" should be eliminated.
+    // "id0" and "id1" but neither "ConstantFoldingCtrl/switch_1",
+    // "id_after_var, nor "id2"" should be eliminated.
     EXPECT_NE("id0", node.name());
     EXPECT_NE("id1", node.name());
     if (node.name() == "c1") {
diff --git a/tensorflow/core/grappler/optimizers/layout_optimizer.cc b/tensorflow/core/grappler/optimizers/layout_optimizer.cc
index 50e6ba4a6483cf55e32e3d04f1b3af42c48d9f87..433b3564fe562c782afac77087d120f2ccebf218 100644
--- a/tensorflow/core/grappler/optimizers/layout_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/layout_optimizer.cc
@@ -2041,17 +2041,6 @@ class DataLayoutOptimizer : GraphProcessor {
   const LayoutOptimizer::TuningConfig& config_;
 };
 
-int GetNumTranspose(const GraphDef& graph) {
-  int number = 0;
-  for (const auto& node : graph.node()) {
-    if (IsTranspose(node)) {
-      number++;
-    }
-  }
-  VLOG(1) << "Number of Transpose nodes: " << number;
-  return number;
-}
-
 int GetNumGPUs(const Cluster& cluster) {
   auto devices = cluster.GetDevices();
   int num_gpus = 0;
@@ -2076,6 +2065,7 @@ Status LayoutOptimizer::Tune(const GrapplerItem& item,
                              const TuningConfig& config, GraphDef* output) {
   auto status = graph_properties.AnnotateOutputShapes(output);
   if (!status.ok()) {
+    VLOG(1) << "Annotate shape return status: " << status.ToString();
     *output = item.graph;
     return status;
   }
@@ -2100,6 +2090,7 @@ Status LayoutOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
   GraphProperties graph_properties(item);
   auto status = graph_properties.InferStatically(false);
   if (!status.ok()) {
+    VLOG(1) << "Infer shape return status: " << status.ToString();
     *output = item.graph;
     return status;
   }
diff --git a/tensorflow/core/grappler/optimizers/memory_optimizer.cc b/tensorflow/core/grappler/optimizers/memory_optimizer.cc
index f537ecc41b964fb6c5f2e24891891c9407fcffef..ffa03db2623ed92a6324d0f83f1919c3f0b547f7 100644
--- a/tensorflow/core/grappler/optimizers/memory_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/memory_optimizer.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
+#include "tensorflow/core/grappler/clusters/virtual_cluster.h"
 #include "tensorflow/core/grappler/costs/graph_memory.h"
 #include "tensorflow/core/grappler/costs/graph_properties.h"
 #include "tensorflow/core/grappler/graph_view.h"
@@ -34,6 +35,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/optimizers/static_schedule.h"
 #include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/grappler/utils/topological_sort.h"
+#include "tensorflow/core/grappler/utils/traversal.h"
 #include "tensorflow/core/protobuf/rewriter_config.pb.h"
 
 namespace tensorflow {
@@ -496,7 +498,7 @@ bool SchedulingPass(Cluster* cluster, GrapplerItem* item) {
     if (!IsAddN(node)) {
       continue;
     }
-    // There is nothing to gain by optimizing nodes with 2 inputs of fewer.
+    // There is nothing to gain by optimizing nodes with 2 or fewer inputs.
     if (view.NumFanins(node, false) <= 2) {
       continue;
     }
@@ -558,6 +560,54 @@ bool SchedulingPass(Cluster* cluster, GrapplerItem* item) {
       VLOG(1) << "Missing properties for " << node->name();
       continue;
     }
+
+    // Compute a topological ordering for the node fanin.
+    std::unordered_map<NodeDef*, int> topo_order;
+    ReverseDfs(view, {node}, nullptr,
+               [&topo_order](NodeDef* n) {
+                 int topo_index = topo_order.size();
+                 topo_order[n] = topo_index;
+               },
+               nullptr);
+
+    std::vector<int> input_topo_index;
+
+    for (int i = 0; i < node->input_size(); ++i) {
+      const string& input = node->input(i);
+      const string node_name = NodeName(input);
+      NodeDef* node = view.GetNode(node_name);
+      input_topo_index.push_back(topo_order.at(node));
+    }
+    int min_input_topo_index = INT_MAX;
+    int min_input_id = -1;
+    for (int i = 0; i < node->input_size(); ++i) {
+      if (IsControlInput(node->input(i))) {
+        // control inputs are always last.
+        break;
+      }
+      const int current = input_topo_index[i];
+      if (current < min_input_topo_index) {
+        min_input_topo_index = current;
+        min_input_id = i;
+      }
+    }
+    CHECK_LE(0, min_input_id);
+    std::vector<string> pre_ctrl_deps;
+    std::vector<string> post_ctrl_deps;
+    for (int i = node->input_size() - 1; i >= 0; --i) {
+      if (!IsControlInput(node->input(i))) {
+        // control inputs are always last.
+        break;
+      }
+      if (input_topo_index[i] < min_input_topo_index) {
+        // These control dependencies can be executed before the node.
+        pre_ctrl_deps.push_back(node->input(i));
+      } else {
+        // These control dependencies should be executed after the node.
+        post_ctrl_deps.push_back(node->input(i));
+      }
+    }
+
     const TensorShapeProto& shape =
         properties.GetOutputProperties(node->name())[0].shape();
     DataType dtype = node->attr().at("T").type();
@@ -572,13 +622,19 @@ bool SchedulingPass(Cluster* cluster, GrapplerItem* item) {
     *(*tmp_var->mutable_attr())["shape"].mutable_shape() = shape;
     (*tmp_var->mutable_attr())["var_name"].set_s(tmp_var->name());
 
+    for (const string& ctrl_dep : pre_ctrl_deps) {
+      *tmp_var->add_input() = ctrl_dep;
+    }
+    *tmp_var->add_input() =
+        AsControlDependency(NodeName(node->input(min_input_id)));
+
     // Initialize it to zero
     NodeDef* zeros = item->graph.add_node();
     zeros->set_name(strings::StrCat(node->name(), "/tmp_var_zeros"));
     zeros->set_op("ZerosLike");
     zeros->set_device(device);
     (*zeros->mutable_attr())["T"].set_type(dtype);
-    *zeros->add_input() = node->input(0);
+    *zeros->add_input() = node->input(min_input_id);
 
     NodeDef* initialize = item->graph.add_node();
     initialize->set_name(strings::StrCat(node->name(), "/tmp_var_initializer"));
@@ -592,9 +648,7 @@ bool SchedulingPass(Cluster* cluster, GrapplerItem* item) {
     std::vector<NodeDef*> accumulates;
     for (int i = 0; i < node->input_size(); ++i) {
       const string& input = node->input(i);
-      if (IsControlInput(input)) {
-        *zeros->add_input() = input;
-      } else {
+      if (!IsControlInput(input)) {
         NodeDef* accumulate = item->graph.add_node();
         accumulate->set_name(
             strings::StrCat(node->name(), "/tmp_var_accum_", i));
@@ -617,6 +671,10 @@ bool SchedulingPass(Cluster* cluster, GrapplerItem* item) {
     for (const NodeDef* accum : accumulates) {
       *node->add_input() = AsControlDependency(accum->name());
     }
+    for (const string& ctrl_dep : post_ctrl_deps) {
+      *node->add_input() = ctrl_dep;
+    }
+
     updated_graph = true;
   }
 
@@ -828,8 +886,7 @@ static NodeDef* FindSwapOutTrigger(
   const std::unordered_set<GraphView::InputPort, GraphView::HashPort>& fanout =
       view.GetFanout(generator);
   NodeDef* trigger = nullptr;
-  Costs::NanoSeconds earliest_fanout(
-      static_cast<double>(std::numeric_limits<int64>::max() >> 2));
+  Costs::NanoSeconds earliest_fanout(Costs::NanoSeconds::infinity());
 
   for (const auto& port : fanout) {
     if (port.node == node) {
@@ -861,6 +918,15 @@ static bool IsSwappable(GraphView::InputPort input) {
   return !IsRefType(dtype);
 }
 
+struct MemInfo {
+  GraphView::OutputPort port;
+  int64 memory_used;
+  std::vector<GraphView::InputPort> uses_left;
+  double fitness;
+
+  bool operator<(const MemInfo& other) const { return fitness < other.fitness; }
+};
+
 static bool IdentifySwappingCandidates(
     Cluster* cluster, GrapplerItem* item, std::unordered_set<string>* skip_list,
     std::unordered_map<NodeDef*, SwapInfo>* nodes_to_swap) {
@@ -890,31 +956,56 @@ static bool IdentifySwappingCandidates(
       continue;
     }
     int64 required_savings = mem_usage.used_memory - prop.memory_size();
-    // TODO(bsteiner): sort the tensors by how long they're live.
 
-    std::unordered_map<string, Costs::NanoSeconds> execution_times;
+    std::unordered_map<string, Costs::NanoSeconds> op_completion_times;
     {
-      std::unordered_map<const NodeDef*, Costs::NanoSeconds>
-          tmp_execution_times;
-      if (!EstimateEarliestExecutionTimes(*item, cluster, &tmp_execution_times)
-               .ok()) {
+      VirtualCluster vcluster(cluster->GetDevices());
+      if (!vcluster.Provision().ok()) {
+        return false;
+      }
+      if (!vcluster.Initialize(*item).ok()) {
+        return false;
+      }
+      RunMetadata metadata;
+      Status s = vcluster.Run(item->graph, item->feed, item->fetch, &metadata);
+      if (!s.ok() && s.code() != error::RESOURCE_EXHAUSTED) {
         return false;
       }
-      for (const auto& exec_time : tmp_execution_times) {
-        execution_times.emplace(exec_time.first->name(), exec_time.second);
+
+      for (const auto& dev_stats : metadata.step_stats().dev_stats()) {
+        for (const auto& node_stats : dev_stats.node_stats()) {
+          Costs::NanoSeconds exec_time =
+              Costs::NanoSeconds(1) +
+              Costs::MicroSeconds(node_stats.all_start_micros() +
+                                  node_stats.op_end_rel_micros());
+          op_completion_times.emplace(node_stats.node_name(), exec_time);
+        }
+      }
+    }
+
+    Costs::Duration peak_time = -1;
+    for (const auto& live_tensor : mem_usage.live_tensors) {
+      if (live_tensor.allocation_time > peak_time) {
+        peak_time = live_tensor.allocation_time;
       }
     }
 
+    std::vector<MemInfo> mem_state;
+
     GraphView graph(&item->graph);
     for (const auto& live_tensor : mem_usage.live_tensors) {
+      if (live_tensor.memory_used <= 1024) {
+        // Don't bother with small tensors.
+        continue;
+      }
       if (live_tensor.deallocation_time - live_tensor.allocation_time <=
           Costs::Duration(1e6)) {
         // Not enough time to swap.
         VLOG(1) << "Not enough time to swap: skipping " << live_tensor.node;
         continue;
       }
-      if (live_tensor.memory_used <= 1024) {
-        // Don't bother with small tensors.
+
+      if (skip_list->find(live_tensor.node) != skip_list->end()) {
         continue;
       }
       GraphView::OutputPort port =
@@ -922,56 +1013,77 @@ static bool IdentifySwappingCandidates(
       if (!IsSwappable(graph, port)) {
         continue;
       }
-      Costs::NanoSeconds execution_time(-1);
-      GraphView::InputPort fanout_to_swap;
+      MemInfo mem_info;
+      mem_info.port = port;
+      mem_info.memory_used = live_tensor.memory_used;
+      Costs::Duration allocation_time = live_tensor.allocation_time;
+      Costs::Duration earliest_use(Costs::Duration::infinity());
+      bool valid = true;
       for (GraphView::InputPort input : graph.GetFanout(port)) {
-        if (skip_list->find(input.node->name()) != skip_list->end()) {
+        // Get execution time.
+        auto it = op_completion_times.find(input.node->name());
+        if (it == op_completion_times.end()) {
+          valid = false;
+          break;
+        }
+        if (it->second <= peak_time) {
           continue;
         }
+
+        if (skip_list->find(input.node->name()) != skip_list->end()) {
+          valid = false;
+          break;
+        }
         string input_name =
             strings::StrCat(input.node->name(), ":", input.port_id);
         if (skip_list->find(input_name) != skip_list->end()) {
-          continue;
+          valid = false;
+          break;
         }
         if (!IsSwappable(input)) {
-          continue;
-        }
-        auto it = execution_times.find(input.node->name());
-        if (it != execution_times.end()) {
-          if (it->second > execution_time) {
-            fanout_to_swap = input;
-            execution_time = it->second;
-          }
+          valid = false;
+          break;
         }
+
+        // Set earliest use time that's after peak.
+        mem_info.uses_left.emplace_back(input);
+        earliest_use = std::min(earliest_use, it->second);
       }
-      // Annotate the fanout to request the tensor to be swapped if it's not
-      // already been done.
-      bool found = false;
-      if (!fanout_to_swap.node) {
-        continue;
-      }
-      auto it = fanout_to_swap.node->attr().find("_swap_to_host");
-      if (it != fanout_to_swap.node->attr().end()) {
-        const AttrValue& val = it->second;
-        for (int port_id : val.list().i()) {
-          if (port_id == fanout_to_swap.port_id) {
-            found = true;
-            break;
-          }
-        }
+      if (valid && !mem_info.uses_left.empty()) {
+        // Compute the fitness: we need the tensor to be generated way away of
+        // the time of peak memory usage (to ensure there is enough time to swap
+        // it out). We also need to ensure it's used way after the peak time, to
+        // ensure that swapping the tensor back in won't recreate the memory
+        // bottleneck. Last but not least, we want the tensor to have as few
+        // remaining uses as possible.
+        mem_info.fitness = std::pow((earliest_use - peak_time).count(), 2);
+        mem_info.fitness /= std::pow(mem_info.uses_left.size(), 2);
+        mem_info.fitness += std::pow((allocation_time - peak_time).count(), 2);
+        mem_info.fitness = -mem_info.fitness;
+        mem_state.push_back(mem_info);
       }
-      if (!found) {
+    }
+
+    // Sort by fitness
+    std::sort(mem_state.begin(), mem_state.end());
+
+    for (const MemInfo& mem_info : mem_state) {
+      for (const GraphView::InputPort fanout_to_swap : mem_info.uses_left) {
+        VLOG(1) << "Will swap fanout " << fanout_to_swap.node->name() << ":"
+                << fanout_to_swap.port_id << " of tensor "
+                << mem_info.port.node->name() << ":" << mem_info.port.port_id
+                << " of size " << mem_info.memory_used;
+
         (*nodes_to_swap)[fanout_to_swap.node].inputs_to_swap.push_back(
             fanout_to_swap.port_id);
-        required_savings -= live_tensor.memory_used;
-        updated_graph = true;
-        if (required_savings < 0) {
-          break;
-        }
+      }
+      required_savings -= mem_info.memory_used;
+      updated_graph = true;
+      if (required_savings < 0) {
+        break;
       }
     }
   }
-
   return updated_graph;
 }
 
@@ -1011,7 +1123,7 @@ bool SwappingPass(RewriterConfig::MemOptType optimization_level,
   }
   for (auto& swap : nodes_to_swap) {
     const NodeDef* node = swap.first;
-    std::vector<OpInfo::TensorProperties> props =
+    const std::vector<OpInfo::TensorProperties>& props =
         properties.GetInputProperties(node->name());
     SwapInfo& swap_info = swap.second;
     int64 bytes_to_swap = 0;
diff --git a/tensorflow/core/grappler/optimizers/memory_optimizer_test.cc b/tensorflow/core/grappler/optimizers/memory_optimizer_test.cc
index dd2d20d8d682856a8a94f99e4ca2aa706331d9d4..f5d9c87992655c7fbd94919ca5a31f64207cd79c 100644
--- a/tensorflow/core/grappler/optimizers/memory_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/memory_optimizer_test.cc
@@ -337,8 +337,9 @@ TEST_F(MemoryOptimizerTest, UnswappableInputs) {
   for (const auto& node : output.node()) {
     if (node.name() == "e") {
       // The d node isn't swappable.
-      EXPECT_EQ(4, node.input_size());
+      EXPECT_EQ(5, node.input_size());
       EXPECT_EQ("d", node.input(2));
+      EXPECT_EQ("^swap_out_d_2", node.input(4));
     }
   }
 }
diff --git a/tensorflow/core/grappler/utils/BUILD b/tensorflow/core/grappler/utils/BUILD
index 534f7a063fe90bf72f8a2afba7ae8f75b8472a36..137d51790d300603b9a9addd0f63c431c8105600 100644
--- a/tensorflow/core/grappler/utils/BUILD
+++ b/tensorflow/core/grappler/utils/BUILD
@@ -99,3 +99,29 @@ tf_cc_test(
         "//tensorflow/core:test_main",
     ],
 )
+
+cc_library(
+    name = "traversal",
+    srcs = ["traversal.cc"],
+    hdrs = ["traversal.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/grappler:graph_view",
+        "//tensorflow/core/grappler:op_types",
+        "//tensorflow/core/grappler:utils",
+    ],
+)
+
+tf_cc_test(
+    name = "traversal_test",
+    srcs = ["traversal_test.cc"],
+    deps = [
+        ":traversal",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
diff --git a/tensorflow/core/grappler/utils/traversal.cc b/tensorflow/core/grappler/utils/traversal.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f44f53c4e63805544fa480628e805303064edb3d
--- /dev/null
+++ b/tensorflow/core/grappler/utils/traversal.cc
@@ -0,0 +1,80 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/utils/traversal.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+
+namespace tensorflow {
+namespace grappler {
+
+void ReverseDfs(const GraphView& graph_view, const std::vector<NodeDef*>& from,
+                const std::function<void(NodeDef*)>& pre_order,
+                const std::function<void(NodeDef*)>& post_order,
+                const std::function<void(NodeDef*, NodeDef*)>& on_back_edge) {
+  // Stack of work to do.
+  struct StackElem {
+    NodeDef* node;
+    bool children_visited;
+    NodeDef* src;
+  };
+  std::vector<StackElem> stack;
+
+  stack.reserve(from.size());
+  for (NodeDef* node : from) {
+    stack.push_back(StackElem{node, false});
+  }
+
+  enum NodeState { NOT_VISITED = 0, VISITING = 1, DONE = 2 };
+  std::unordered_map<NodeDef*, NodeState> node_state;
+  while (!stack.empty()) {
+    StackElem w = stack.back();
+    stack.pop_back();
+
+    if (w.children_visited) {
+      // We've processed all the children of this node
+      node_state[w.node] = DONE;
+      if (post_order) {
+        post_order(w.node);
+      }
+      continue;
+    }
+
+    auto& rslt = node_state[w.node];
+    if (rslt == DONE) {
+      continue;
+    } else if (rslt == VISITING) {
+      // Loop detected
+      if (on_back_edge) {
+        on_back_edge(w.src, w.node);
+      }
+      continue;
+    }
+    rslt = VISITING;
+    if (pre_order) {
+      pre_order(w.node);
+    }
+
+    // Enqueue the node again with the children_visited flag set to true.
+    stack.push_back(StackElem{w.node, true, w.src});
+
+    // Now enqueu the node children.
+    for (const auto fanin : graph_view.GetFanins(*w.node, true)) {
+      stack.push_back(StackElem{fanin.node, false, w.node});
+    }
+  }
+}
+
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/utils/traversal.h b/tensorflow/core/grappler/utils/traversal.h
new file mode 100644
index 0000000000000000000000000000000000000000..bb3fa090e8fdaf12ed6dcb18eb1511c55496a125
--- /dev/null
+++ b/tensorflow/core/grappler/utils/traversal.h
@@ -0,0 +1,39 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_GRAPPLER_UTILS_TRAVERSAL_H_
+#define TENSORFLOW_CORE_GRAPPLER_UTILS_TRAVERSAL_H_
+
+#include <functional>
+#include "tensorflow/core/grappler/graph_view.h"
+
+namespace tensorflow {
+namespace grappler {
+
+// Traverse the graph in reverse dfs order, starting from the list of nodes
+// specified in the 'from' argument. The pre_order and post_order functors will
+// be called on each reachable node (including the 'from' nodes) in pre and post
+// order. If loops are found, the on_back_edge functor will be called on the
+// corresponding back edges. Moreover, the pre and post order will assume that
+// these back edges will be cut.
+void ReverseDfs(const GraphView& graph_view, const std::vector<NodeDef*>& from,
+                const std::function<void(NodeDef*)>& pre_order,
+                const std::function<void(NodeDef*)>& post_order,
+                const std::function<void(NodeDef*, NodeDef*)>& on_back_edge);
+
+}  // namespace grappler
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_GRAPPLER_UTILS_TRAVERSAL_H_
diff --git a/tensorflow/core/grappler/utils/traversal_test.cc b/tensorflow/core/grappler/utils/traversal_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cc68bd1a9637cb6f61955e8fa5d495a34f19cb09
--- /dev/null
+++ b/tensorflow/core/grappler/utils/traversal_test.cc
@@ -0,0 +1,101 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/utils/traversal.h"
+//#include "tensorflow/core/framework/node_def.pb.h"
+//#include "tensorflow/core/lib/core/status_test_util.h"
+//#include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace grappler {
+namespace {
+
+class TraversalTest : public ::testing::Test {
+ protected:
+  static NodeDef CreateNode(const string& name,
+                            const std::vector<string>& inputs) {
+    return CreateNode(name, "", inputs);
+  }
+  static NodeDef CreateNode(const string& name, const string& op,
+                            const std::vector<string>& inputs) {
+    NodeDef node;
+    node.set_name(name);
+    if (!op.empty()) {
+      node.set_op(op);
+    }
+    for (const string& input : inputs) {
+      node.add_input(input);
+    }
+    return node;
+  }
+};
+
+TEST_F(TraversalTest, ReverseDfsNoLoop) {
+  GraphDef graph;
+  *graph.add_node() = CreateNode("2", {"5"});
+  *graph.add_node() = CreateNode("0", {"5", "4"});
+  *graph.add_node() = CreateNode("1", {"4", "3"});
+  *graph.add_node() = CreateNode("3", {"2"});
+  *graph.add_node() = CreateNode("5", {});
+  *graph.add_node() = CreateNode("4", {});
+
+  std::vector<NodeDef*> start_nodes = {graph.mutable_node(1),
+                                       graph.mutable_node(2)};
+  std::vector<string> pre_order;
+  std::vector<string> post_order;
+  bool found_back_edge = false;
+  ReverseDfs(
+      GraphView(&graph), start_nodes,
+      [&pre_order](NodeDef* n) { pre_order.push_back(n->name()); },
+      [&post_order](NodeDef* n) { post_order.push_back(n->name()); },
+      [&found_back_edge](NodeDef*, NodeDef*) { found_back_edge = true; });
+
+  EXPECT_EQ(std::vector<string>({"1", "4", "3", "2", "5", "0"}), pre_order);
+  EXPECT_EQ(std::vector<string>({"4", "5", "2", "3", "1", "0"}), post_order);
+  EXPECT_FALSE(found_back_edge);
+}
+
+TEST_F(TraversalTest, ReverseDfsWithLoop) {
+  GraphDef graph;
+  // Create a loop
+  *graph.add_node() = CreateNode("2", "Merge", {"1", "5"});
+  *graph.add_node() = CreateNode("3", "Switch", {"2"});
+  *graph.add_node() = CreateNode("4", "Identity", {"3"});
+  *graph.add_node() = CreateNode("5", "NextIteration", {"4"});
+  *graph.add_node() = CreateNode("1", "Enter", {});
+  *graph.add_node() = CreateNode("6", "Exit", {"3"});
+
+  std::vector<NodeDef*> start_nodes = {graph.mutable_node(5)};
+  std::vector<string> pre_order;
+  std::vector<string> post_order;
+  std::vector<string> back_edges;
+  ReverseDfs(
+      GraphView(&graph), start_nodes,
+      [&pre_order](NodeDef* n) { pre_order.push_back(n->name()); },
+      [&post_order](NodeDef* n) { post_order.push_back(n->name()); },
+      [&back_edges](NodeDef* src, NodeDef* dst) {
+        back_edges.push_back(strings::StrCat(src->name(), "->", dst->name()));
+      });
+
+  EXPECT_EQ(std::vector<string>({"6", "3", "2", "1", "5", "4"}), pre_order);
+  EXPECT_EQ(std::vector<string>({"1", "4", "5", "2", "3", "6"}), post_order);
+  EXPECT_EQ(std::vector<string>({"4->3"}), back_edges);
+}
+
+}  // namespace
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index fd99409c9b35ae0ee2a3cbd9da9067fdc6434a8f..e7192ec42fe9b50f39207d739b440d3381bb78ad 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -629,6 +629,7 @@ cc_library(
         ":transpose_op",
         ":unique_op",
         ":unpack_op",
+        ":unravel_index_op",
         ":where_op",
     ],
 )
@@ -883,6 +884,12 @@ tf_kernel_library(
     deps = ARRAY_DEPS + [":split_lib"],
 )
 
+tf_kernel_library(
+    name = "unravel_index_op",
+    prefix = "unravel_index_op",
+    deps = ARRAY_DEPS,
+)
+
 tf_kernel_library(
     name = "where_op",
     srcs = ["where_op.cc"],
@@ -2582,6 +2589,45 @@ tf_cc_tests(
     ],
 )
 
+cc_library(
+    name = "manip",
+    deps = [
+        ":roll_op",
+    ],
+)
+
+MANIP_DEPS = [
+    "//tensorflow/core:framework",
+    "//tensorflow/core:lib",
+    "//tensorflow/core:manip_ops_op_lib",
+    "//third_party/eigen3",
+]
+
+tf_kernel_library(
+    name = "roll_op",
+    prefix = "roll_op",
+    deps = MANIP_DEPS,
+)
+
+tf_cc_test(
+    name = "roll_op_test",
+    size = "small",
+    srcs = ["roll_op_test.cc"],
+    deps = [
+        ":ops_testutil",
+        ":ops_util",
+        ":roll_op",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
 MATH_DEPS = [
     ":bounds_check",
     ":fill_functor",
diff --git a/tensorflow/core/kernels/adjust_contrast_op.cc b/tensorflow/core/kernels/adjust_contrast_op.cc
index 37976f71837cb365cd9d232c7c1e102ec5bfe338..72155fd037378fc3d93c02e9b893a6671e9659a6 100644
--- a/tensorflow/core/kernels/adjust_contrast_op.cc
+++ b/tensorflow/core/kernels/adjust_contrast_op.cc
@@ -40,8 +40,8 @@ typedef Eigen::SyclDevice SYCLDevice;
 template <typename Device, typename T>
 class AdjustContrastOp : public OpKernel {
  public:
-  explicit AdjustContrastOp(OpKernelConstruction* context) : OpKernel(context) {
-  }
+  explicit AdjustContrastOp(OpKernelConstruction* context)
+      : OpKernel(context) {}
 
   void Compute(OpKernelContext* context) override {
     const Tensor& input = context->input(0);
diff --git a/tensorflow/core/kernels/adjust_contrast_op_test.cc b/tensorflow/core/kernels/adjust_contrast_op_test.cc
index 0fc03b5a236b2d63fc731f232acebdcbd1ca2532..7522b320400b034aa882efb82efab8d0419d8144 100644
--- a/tensorflow/core/kernels/adjust_contrast_op_test.cc
+++ b/tensorflow/core/kernels/adjust_contrast_op_test.cc
@@ -29,8 +29,7 @@ limitations under the License.
 
 namespace tensorflow {
 
-class AdjustContrastOpTest : public OpsTestBase {
-};
+class AdjustContrastOpTest : public OpsTestBase {};
 
 TEST_F(AdjustContrastOpTest, Simple_1113) {
   TF_EXPECT_OK(NodeDefBuilder("adjust_contrast_op", "AdjustContrastv2")
diff --git a/tensorflow/core/kernels/adjust_saturation_op.cc b/tensorflow/core/kernels/adjust_saturation_op.cc
index 4643d4e6efda2157458a557819873c8cb7546e1a..f0c6ae499d4c209ef1556890e87f63085de7ea75 100644
--- a/tensorflow/core/kernels/adjust_saturation_op.cc
+++ b/tensorflow/core/kernels/adjust_saturation_op.cc
@@ -192,8 +192,9 @@ class AdjustSaturationOp<CPUDevice> : public AdjustSaturationOpBase {
     const DeviceBase::CpuWorkerThreads& worker_threads =
         *context->device()->tensorflow_cpu_worker_threads();
     Shard(worker_threads.num_threads, worker_threads.workers, channel_count,
-          kCostPerChannel, [channel_count, &input_data, &output_data, scale_h](
-                               int64 start_channel, int64 end_channel) {
+          kCostPerChannel,
+          [channel_count, &input_data, &output_data, scale_h](
+              int64 start_channel, int64 end_channel) {
             const float* p = input_data.data() + start_channel * kChannelSize;
             float* q = output_data.data() + start_channel * kChannelSize;
             for (int i = start_channel; i < end_channel; i++) {
diff --git a/tensorflow/core/kernels/aggregate_ops_cpu.h b/tensorflow/core/kernels/aggregate_ops_cpu.h
index dfa3fe585e375ada0c5d3d0b3061d05d8a4efabd..aa1cead928aa25e9cf8d9c8d6d43091bf93583ee 100644
--- a/tensorflow/core/kernels/aggregate_ops_cpu.h
+++ b/tensorflow/core/kernels/aggregate_ops_cpu.h
@@ -25,7 +25,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 namespace tensorflow {
 
@@ -201,7 +201,7 @@ struct Add7Functor<SYCLDevice, T> {
                   typename TTypes<T>::ConstFlat in6,
                   typename TTypes<T>::ConstFlat in7) {
     Add7EigenImpl<SYCLDevice, T>::Compute(d, out, in1, in2, in3, in4, in5, in6,
-                                         in7);
+                                          in7);
   }
 };
 
@@ -214,7 +214,7 @@ struct Add8Functor<SYCLDevice, T> {
       typename TTypes<T>::ConstFlat in5, typename TTypes<T>::ConstFlat in6,
       typename TTypes<T>::ConstFlat in7, typename TTypes<T>::ConstFlat in8) {
     Add8EigenImpl<SYCLDevice, T>::Compute(d, out, in1, in2, in3, in4, in5, in6,
-                                         in7, in8);
+                                          in7, in8);
   }
 };
 
@@ -227,7 +227,7 @@ struct Add8pFunctor<SYCLDevice, T> {
       typename TTypes<T>::ConstFlat in5, typename TTypes<T>::ConstFlat in6,
       typename TTypes<T>::ConstFlat in7, typename TTypes<T>::ConstFlat in8) {
     Add8pEigenImpl<SYCLDevice, T>::Compute(d, out, in1, in2, in3, in4, in5, in6,
-                                          in7, in8);
+                                           in7, in8);
   }
 };
 
@@ -241,10 +241,10 @@ struct Add9Functor<SYCLDevice, T> {
       typename TTypes<T>::ConstFlat in7, typename TTypes<T>::ConstFlat in8,
       typename TTypes<T>::ConstFlat in9) {
     Add9EigenImpl<SYCLDevice, T>::Compute(d, out, in1, in2, in3, in4, in5, in6,
-                                         in7, in8, in9);
+                                          in7, in8, in9);
   }
 };
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace functor
 
diff --git a/tensorflow/core/kernels/attention_ops.cc b/tensorflow/core/kernels/attention_ops.cc
index cc8f122cab357ed0c8243ba990b3b85dd7ddcb2f..ce2fce92e4ee8cbd7bdc578d92103a5bd5da0629 100644
--- a/tensorflow/core/kernels/attention_ops.cc
+++ b/tensorflow/core/kernels/attention_ops.cc
@@ -52,8 +52,9 @@ class ExtractGlimpseOp : public OpKernel {
     const int64 batch_size = input_shape.dim_size(0);
 
     const Tensor& window_size = context->input(1);
-    OP_REQUIRES(context, (window_size.shape().dims() == 1) &&
-                             window_size.shape().dim_size(0) == 2,
+    OP_REQUIRES(context,
+                (window_size.shape().dims() == 1) &&
+                    window_size.shape().dim_size(0) == 2,
                 errors::InvalidArgument(
                     "input must be a vector of size 2 (height, width)",
                     window_size.shape().DebugString()));
diff --git a/tensorflow/core/kernels/avgpooling_op.h b/tensorflow/core/kernels/avgpooling_op.h
index dea2683184a06308bff7ead2b772aab466b90b34..f5e81dbc0930888ab9258d5d5b5d52fdeb0afc01 100644
--- a/tensorflow/core/kernels/avgpooling_op.h
+++ b/tensorflow/core/kernels/avgpooling_op.h
@@ -48,9 +48,8 @@ struct SpatialAvgPooling {
 
 typedef Eigen::GpuDevice GPUDevice;
 
-// Launch a custom GPU kernels from Yanqing for the avgpooling backward operation
-// that works NHWC data formats.
-// Arguments:
+// Launch a custom GPU kernels from Yanqing for the avgpooling backward
+// operation that works NHWC data formats. Arguments:
 //   top_diff: backprop to the output of the pooling layer
 //   num: number of input batches
 //   height: input height
diff --git a/tensorflow/core/kernels/avgpooling_op_gpu.cu.cc b/tensorflow/core/kernels/avgpooling_op_gpu.cu.cc
index 2be330d1427b28a01635cc1db5fd10096f2a8abe..6537b42f1ed8856a5f701023eb5fc55ded278ec8 100644
--- a/tensorflow/core/kernels/avgpooling_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/avgpooling_op_gpu.cu.cc
@@ -71,8 +71,8 @@ __global__ void AvePoolBackwardNHWC(const int nthreads,
         hstart = max(hstart, 0);
         wstart = max(wstart, 0);
         int pool_size = (hend - hstart) * (wend - wstart);
-        gradient +=
-            top_diff_slice[(ph * pooled_width + pw) * channels] / dtype(pool_size);
+        gradient += top_diff_slice[(ph * pooled_width + pw) * channels] /
+                    dtype(pool_size);
       }
     }
     bottom_diff[index] = gradient;
@@ -90,11 +90,11 @@ bool RunAvePoolBackwardNHWC(const T* const top_diff, const int num,
                             const GPUDevice& d) {
   int x_size = num * height * width * channels;
   CudaLaunchConfig config = GetCudaLaunchConfig(x_size, d);
-  AvePoolBackwardNHWC<
-      T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-      config.virtual_thread_count, top_diff, num, height, width, channels,
-      pooled_height, pooled_width, kernel_h, kernel_w, stride_h, stride_w,
-      pad_t, pad_t, bottom_diff);
+  AvePoolBackwardNHWC<T>
+      <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+          config.virtual_thread_count, top_diff, num, height, width, channels,
+          pooled_height, pooled_width, kernel_h, kernel_w, stride_h, stride_w,
+          pad_t, pad_t, bottom_diff);
 
   return d.ok();
 }
diff --git a/tensorflow/core/kernels/barrier_ops.cc b/tensorflow/core/kernels/barrier_ops.cc
index d0bbea9fe27856cc0dedb4570d285bd872741099..944564dfba62f257ae45b3c5c25d0de64fa0b773 100644
--- a/tensorflow/core/kernels/barrier_ops.cc
+++ b/tensorflow/core/kernels/barrier_ops.cc
@@ -111,13 +111,14 @@ class Barrier : public ResourceBase {
       mutex_lock lock(mu_);
       if (closed_) {
         OP_REQUIRES_ASYNC(
-            ctx, !cancel_pending_enqueues_ &&
-                     (num_inserted == 0 || !incomplete_.empty()),
+            ctx,
+            !cancel_pending_enqueues_ &&
+                (num_inserted == 0 || !incomplete_.empty()),
             errors::Cancelled(
                 "Barrier ", name_, " is closed.  Pending enqueues cancelled: ",
-                cancel_pending_enqueues_, ".  Number of new insertions: ",
-                num_inserted, ".  Number of incomplete keys: ",
-                incomplete_.size(), "."),
+                cancel_pending_enqueues_,
+                ".  Number of new insertions: ", num_inserted,
+                ".  Number of incomplete keys: ", incomplete_.size(), "."),
             callback);
       }
 
@@ -128,9 +129,10 @@ class Barrier : public ResourceBase {
 
       for (int i = 0; i < num_inserted; ++i) {
         OP_REQUIRES_OK_ASYNC(
-            ctx, InsertOneLocked<T>(ctx, keys, values, element_shape,
-                                    component_index, i, &ready_tuples,
-                                    &new_elements),
+            ctx,
+            InsertOneLocked<T>(ctx, keys, values, element_shape,
+                               component_index, i, &ready_tuples,
+                               &new_elements),
             callback);
       }
 
@@ -317,8 +319,9 @@ class Barrier : public ResourceBase {
         return errors::Cancelled(
             "Barrier ", name_,
             " is closed, but attempted to insert a brand new key: ",
-            keys_vec(i), ".  Pending enqueues cancelled: ",
-            cancel_pending_enqueues_, ".  Insertion index: ", i,
+            keys_vec(i),
+            ".  Pending enqueues cancelled: ", cancel_pending_enqueues_,
+            ".  Insertion index: ", i,
             ".  Number of incomplete keys: ", incomplete_.size(), ".");
       }
     } else {
@@ -532,13 +535,14 @@ class InsertManyOp : public BarrierOpKernel {
     OP_REQUIRES_ASYNC(
         ctx, component_index_ < barrier->num_components(),
         errors::InvalidArgument("The component ID is out of range ",
-                                component_index_, " > num_components", " (= ",
-                                barrier->num_components(), ")"),
+                                component_index_, " > num_components",
+                                " (= ", barrier->num_components(), ")"),
         callback);
     OP_REQUIRES_OK_ASYNC(
-        ctx, ctx->MatchSignature({DT_STRING_REF, DT_STRING,
-                                  barrier->component_type(component_index_)},
-                                 {}),
+        ctx,
+        ctx->MatchSignature({DT_STRING_REF, DT_STRING,
+                             barrier->component_type(component_index_)},
+                            {}),
         callback);
 
     const Tensor* keys;
diff --git a/tensorflow/core/kernels/batch_kernels.cc b/tensorflow/core/kernels/batch_kernels.cc
index 5b4e1a809fa4b9e3d5c5e1b877233b31826bd386..c447db842d326ff79825be78cb77ec6aadc9b9eb 100644
--- a/tensorflow/core/kernels/batch_kernels.cc
+++ b/tensorflow/core/kernels/batch_kernels.cc
@@ -13,22 +13,20 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_util.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/batching_util/shared_batch_scheduler.h"
 #include "tensorflow/core/kernels/batching_util/periodic_function.h"
+#include "tensorflow/core/kernels/batching_util/shared_batch_scheduler.h"
 #include "tensorflow/core/kernels/concat_lib.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/kernels/split_lib.h"
 #include "tensorflow/core/lib/random/random.h"
 #include "tensorflow/core/platform/macros.h"
 
-
 namespace tensorflow {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
diff --git a/tensorflow/core/kernels/batch_matmul_op_impl.h b/tensorflow/core/kernels/batch_matmul_op_impl.h
index 93c391831982c529fb8e270f6eb0cac8063bffbf..43e716c542ac42835baabde057e45534d5442010 100644
--- a/tensorflow/core/kernels/batch_matmul_op_impl.h
+++ b/tensorflow/core/kernels/batch_matmul_op_impl.h
@@ -41,7 +41,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 namespace {
 
@@ -429,14 +429,13 @@ template <typename Scalar>
 struct LaunchBatchMatMul<SYCLDevice, Scalar> {
   static void Launch(OpKernelContext* context, const Tensor& in_x,
                      const Tensor& in_y, bool adj_x, bool adj_y, Tensor* out) {
-
-  // Number of matrix multiplies i.e. size of the batch.
-  const int64 batch_size = in_x.dim_size(0);
-  ParallelMatMulKernelSYCL<Scalar>::Run(context, in_x, in_y, adj_x, adj_y, out,
-                                        0, batch_size);
+    // Number of matrix multiplies i.e. size of the batch.
+    const int64 batch_size = in_x.dim_size(0);
+    ParallelMatMulKernelSYCL<Scalar>::Run(context, in_x, in_y, adj_x, adj_y,
+                                          out, 0, batch_size);
   }
 };
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 template <typename Device, typename Scalar>
 class BatchMatMul : public OpKernel {
@@ -462,10 +461,10 @@ class BatchMatMul : public OpKernel {
     TensorShape out_shape;
     for (int i = 0; i < ndims - 2; ++i) {
       OP_REQUIRES(ctx, in0.dim_size(i) == in1.dim_size(i),
-                  errors::InvalidArgument("In[0].dim(", i, ") and In[1].dim(",
-                                          i, ") must be the same: ",
-                                          in0.shape().DebugString(), " vs ",
-                                          in1.shape().DebugString()));
+                  errors::InvalidArgument(
+                      "In[0].dim(", i, ") and In[1].dim(", i,
+                      ") must be the same: ", in0.shape().DebugString(), " vs ",
+                      in1.shape().DebugString()));
       out_shape.AddDim(in0.dim_size(i));
     }
     auto n = (ndims == 2) ? 1 : out_shape.num_elements();
@@ -507,12 +506,12 @@ class BatchMatMul : public OpKernel {
   bool adj_y_;
 };
 
-#define REGISTER_BATCH_MATMUL_CPU(TYPE)                                              \
+#define REGISTER_BATCH_MATMUL_CPU(TYPE)                                 \
   REGISTER_KERNEL_BUILDER(                                              \
       Name("BatchMatMul").Device(DEVICE_CPU).TypeConstraint<TYPE>("T"), \
       BatchMatMul<CPUDevice, TYPE>)
 
-#define REGISTER_BATCH_MATMUL_GPU(TYPE)                                              \
+#define REGISTER_BATCH_MATMUL_GPU(TYPE)                                 \
   REGISTER_KERNEL_BUILDER(                                              \
       Name("BatchMatMul").Device(DEVICE_GPU).TypeConstraint<TYPE>("T"), \
       BatchMatMul<GPUDevice, TYPE>)
@@ -522,5 +521,5 @@ class BatchMatMul : public OpKernel {
   REGISTER_KERNEL_BUILDER(                                               \
       Name("BatchMatMul").Device(DEVICE_SYCL).TypeConstraint<TYPE>("T"), \
       BatchMatMul<SYCLDevice, TYPE>)
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/batch_matmul_op_real.cc b/tensorflow/core/kernels/batch_matmul_op_real.cc
index 8d155ca62b297a4bf59f62159d6b62b01f777721..7e1e2aa4ec135872993f2e7738c7e863416eee87 100644
--- a/tensorflow/core/kernels/batch_matmul_op_real.cc
+++ b/tensorflow/core/kernels/batch_matmul_op_real.cc
@@ -35,5 +35,5 @@ TF_CALL_half(REGISTER_BATCH_MATMUL_GPU);
 #ifdef TENSORFLOW_USE_SYCL
 TF_CALL_float(REGISTER_BATCH_MATMUL_SYCL);
 TF_CALL_double(REGISTER_BATCH_MATMUL_SYCL);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/batch_matmul_op_test.cc b/tensorflow/core/kernels/batch_matmul_op_test.cc
index 7923f34155b57cb79894936cb4ea0f485f92d99b..c3932cd7b9023482316807c73bfd52da3a4a3f7a 100644
--- a/tensorflow/core/kernels/batch_matmul_op_test.cc
+++ b/tensorflow/core/kernels/batch_matmul_op_test.cc
@@ -53,9 +53,10 @@ static Graph* BatchMatmul(int b, int m, int k, int n, bool adjoint_a,
 /* Uncomment to enable benchmarks for double & complex types: */
 // BM_BatchMatmulDev(B, M, K, N, TA, TB, std::complex<float>, DT_COMPLEX64,
 // gpu);
-// BM_BatchMatmulDev(M, K, N, TA, TB, double, DT_DOUBLE, cpu);                    \
-// BM_BatchMatmulDev(M, K, N, TA, TB, std::complex<double>, DT_COMPLEX128, cpu);  \
-// BM_BatchMatmulDev(M, K, N, TA, TB, double, DT_DOUBLE, gpu);                    \
+// BM_BatchMatmulDev(M, K, N, TA, TB, double, DT_DOUBLE, cpu); \
+// BM_BatchMatmulDev(M, K, N, TA, TB, std::complex<double>, DT_COMPLEX128, cpu);
+// \
+// BM_BatchMatmulDev(M, K, N, TA, TB, double, DT_DOUBLE, gpu); \
 // BM_BatchMatmulDev(M, K, N, TA, TB, std::complex<double>, DT_COMPLEX128, gpu);
 
 // Typical fully connected layers
diff --git a/tensorflow/core/kernels/batch_norm_op.cc b/tensorflow/core/kernels/batch_norm_op.cc
index d3ed617f713094cb94c1a87dc0c36c3d44d97918..c34ea14bf6007f6951733990c0a01999ac838b75 100644
--- a/tensorflow/core/kernels/batch_norm_op.cc
+++ b/tensorflow/core/kernels/batch_norm_op.cc
@@ -30,7 +30,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 template <typename Device, typename T>
 class BatchNormOp : public OpKernel {
diff --git a/tensorflow/core/kernels/batch_norm_op_test.cc b/tensorflow/core/kernels/batch_norm_op_test.cc
index 5e3fcd2114a12709fb306ebadfd21a56b514e0c0..45ddc8532955578b5fca7ea372703f88b6b84f77 100644
--- a/tensorflow/core/kernels/batch_norm_op_test.cc
+++ b/tensorflow/core/kernels/batch_norm_op_test.cc
@@ -54,7 +54,7 @@ TEST_F(BatchNormOpTest, Simple) {
   Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 1, 6, 2}));
   test::FillValues<float>(
       &expected, {-17.86f, -22.00f, -15.87f, -20.59f, -13.87f, -19.18f, -21.86f,
-                  -33.31f, -23.85f, -34.72f, -25.85f, -36.13f });
+                  -33.31f, -23.85f, -34.72f, -25.85f, -36.13f});
   test::ExpectTensorNear<float>(expected, *GetOutput(0), 0.01);
 }
 
diff --git a/tensorflow/core/kernels/batch_util.cc b/tensorflow/core/kernels/batch_util.cc
index 7f2df95e2d55ac93f8a934010244dcbd1dcd28c8..87d455faa709cd44b4a04bceaabe8c9a5b6f884f 100644
--- a/tensorflow/core/kernels/batch_util.cc
+++ b/tensorflow/core/kernels/batch_util.cc
@@ -19,6 +19,8 @@ limitations under the License.
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/core/errors.h"
 
+#define TF_CALL_DATASET_TYPES(m) TF_CALL_ALL_TYPES(m) TF_CALL_QUANTIZED_TYPES(m)
+
 namespace tensorflow {
 namespace batch_util {
 
@@ -61,6 +63,21 @@ Status HandleElementToSlice<string>(Tensor element, Tensor* parent, int64 index,
   return Status::OK();
 }
 
+template <>
+Status HandleElementToSlice<Variant>(Tensor element, Tensor* parent,
+                                     int64 index, bool can_move) {
+  auto parent_as_matrix = parent->flat_outer_dims<Variant>();
+  auto element_flat = element.flat<Variant>();
+  if (can_move) {
+    for (int64 i = 0; i < element.NumElements(); ++i) {
+      parent_as_matrix(index, i) = std::move(element_flat(i));
+    }
+  } else {
+    parent_as_matrix.chip(index, 0) = element_flat;
+  }
+  return Status::OK();
+}
+
 // TODO(jsimsa): Add HandleElementToSlice<variant> specialization that moves
 // the data when possible.
 
@@ -115,5 +132,101 @@ Status CopySliceToElement(const Tensor& parent, Tensor* element, int64 index) {
   }
 }
 
+// The following five functions are copied from padding_fifo_queue.cc.
+// TODO(mrry): Reconcile these functions with the similar methods in the
+// queue implementation.
+Status ValidateElementToLargerSlice(const Tensor& element, Tensor* parent) {
+  DCHECK_NE(parent->dim_size(0), 0);
+  if (element.NumElements() > (parent->NumElements() / parent->dim_size(0))) {
+    TensorShape chip_shape = parent->shape();
+    chip_shape.RemoveDim(0);
+    return errors::Internal(
+        "HandleElementToLargerSlice Cannot copy slice: number of entries in "
+        "element is greater than number of elements in parent slice.  ",
+        "Shapes are: [element]: ", element.shape().DebugString(),
+        ", [parent slice]: ", chip_shape.DebugString());
+  }
+  return Status::OK();
+}
+
+template <typename T, int NDIMS>
+Status HandleElementToLargerSlice(const Tensor& element, Tensor* parent,
+                                  int index) {
+  TF_RETURN_IF_ERROR(ValidateElementToLargerSlice(element, parent));
+  if (element.NumElements() == 0) {
+    return Status::OK();
+  }
+  auto element_t = element.tensor<T, NDIMS>();
+  auto parent_t = parent->tensor<T, NDIMS + 1>();
+  Eigen::DSizes<Eigen::DenseIndex, NDIMS + 1> slice_indices;
+  slice_indices[0] = index;
+  Eigen::DSizes<Eigen::DenseIndex, NDIMS + 1> slice_size;
+  slice_size[0] = 1;
+  for (size_t i = 1; i < slice_size.size(); ++i) {
+    slice_size[i] = element_t.dimension(i - 1);
+  }
+  parent_t.slice(slice_indices, slice_size) = element_t.reshape(slice_size);
+  return Status::OK();
+}
+
+template <int NDIMS>
+Status HandleElementToLargerSliceWithRank(const Tensor& element, Tensor* parent,
+                                          int index) {
+#define HANDLE_TYPE(T)                                                   \
+  case DataTypeToEnum<T>::value: {                                       \
+    return HandleElementToLargerSlice<T, NDIMS>(element, parent, index); \
+  }
+
+  switch (element.dtype()) {
+    TF_CALL_DATASET_TYPES(HANDLE_TYPE);
+#undef HANDLE_TYPE
+    default:
+      return errors::Unimplemented(
+          "HandleElementToLargerSliceWithRank Unhandled data type: ",
+          element.dtype());
+  }
+}
+
+Status CopyElementToLargerSlice(const Tensor& element, Tensor* parent,
+                                int index) {
+  if (parent->dims() != element.dims() + 1) {
+    return errors::Internal(
+        "Mismatched ranks.  Element's rank is: ", element.dims(),
+        " but element is meant to be a slice in output Tensor having rank: ",
+        parent->dims(), " (should be: ", element.dims() + 1, ")");
+  }
+
+#define HANDLE_DIMS(NDIMS)                                                  \
+  case NDIMS: {                                                             \
+    TF_RETURN_IF_ERROR(                                                     \
+        HandleElementToLargerSliceWithRank<NDIMS>(element, parent, index)); \
+    return Status::OK();                                                    \
+  }
+
+  switch (element.dims()) {
+    HANDLE_DIMS(0);
+    HANDLE_DIMS(1);
+    HANDLE_DIMS(2);
+    HANDLE_DIMS(3);
+    HANDLE_DIMS(4);
+#undef HANDLE_DIMS
+    default:
+      return errors::Unimplemented("CopyElementToLargerSlice Unhandled rank: ",
+                                   element.dims());
+  }
+}
+
+Status SetElementZero(Tensor* element, const Tensor& padding) {
+#define HANDLE_TYPE(T)                                     \
+  if (element->dtype() == DataTypeToEnum<T>::value) {      \
+    element->flat<T>().setConstant(padding.scalar<T>()()); \
+    return Status::OK();                                   \
+  }
+  TF_CALL_DATASET_TYPES(HANDLE_TYPE);
+#undef HANDLE_TYPE
+  return errors::Unimplemented("SetElementZero Unhandled data type: ",
+                               element->dtype());
+}
+
 }  // namespace batch_util
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/batch_util.h b/tensorflow/core/kernels/batch_util.h
index 0d634ae7b07ee641eb13167d6f9fcb9ed5f0d974..a47bf1935db611417cea1d98ed8aff496efbf689 100644
--- a/tensorflow/core/kernels/batch_util.h
+++ b/tensorflow/core/kernels/batch_util.h
@@ -32,6 +32,16 @@ Status CopyElementToSlice(Tensor element, Tensor* parent, int64 index);
 // Copies the index^th slice of parent (in the 0th dimension) into element.
 Status CopySliceToElement(const Tensor& parent, Tensor* element, int64 index);
 
+// Zero-initializes the tensor `element` using the scalar stored in `padding`.
+// Both `element` and `padding` must have matching `dtype`.
+Status SetElementZero(Tensor* element, const Tensor& padding);
+
+// Copies `element` into a (0th dimension) slice of `parent`, assuming
+// the shape of `element` is strictly not larger along any axis than a
+// slice.
+Status CopyElementToLargerSlice(const Tensor& element, Tensor* parent,
+                                int index);
+
 }  // namespace batch_util
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/kernels/batching_util/periodic_function.h b/tensorflow/core/kernels/batching_util/periodic_function.h
index dbf1733dcc399522a673e5724dfeb62446f72a0f..36a4019002aa55c26fb5419c7a4d17562a367de8 100644
--- a/tensorflow/core/kernels/batching_util/periodic_function.h
+++ b/tensorflow/core/kernels/batching_util/periodic_function.h
@@ -114,7 +114,7 @@ class PeriodicFunction {
   void RunLoop(int64 start) LOCKS_EXCLUDED(mutex_);
 
   const std::function<void()> function_;  // Actual client function
-  const int64 interval_micros_;    // Interval between calls.
+  const int64 interval_micros_;           // Interval between calls.
   const Options options_;
 
   // Protects state below.
diff --git a/tensorflow/core/kernels/batching_util/shared_batch_scheduler_test.cc b/tensorflow/core/kernels/batching_util/shared_batch_scheduler_test.cc
index d73dcf0fa0e1b2b387b3ed53acd63d5c65683fd4..d5ea2b648f35efd03c04d00abc838edadd37570e 100644
--- a/tensorflow/core/kernels/batching_util/shared_batch_scheduler_test.cc
+++ b/tensorflow/core/kernels/batching_util/shared_batch_scheduler_test.cc
@@ -55,15 +55,14 @@ Status ScheduleTask(size_t task_size, BatchScheduler<FakeTask>* scheduler) {
 // use the clock to be destroyed.
 std::unique_ptr<Thread> CreateFakeClockAdvancerThread(
     test_util::FakeClockEnv* env, Notification* start, Notification* stop) {
-  return std::unique_ptr<Thread>(
-      Env::Default()->StartThread({}, "FakeClockAdvancerThread",
-                                  [env, start, stop] {
-                                    start->WaitForNotification();
-                                    while (!stop->HasBeenNotified()) {
-                                      env->AdvanceByMicroseconds(10);
-                                      Env::Default()->SleepForMicroseconds(10);
-                                    }
-                                  }));
+  return std::unique_ptr<Thread>(Env::Default()->StartThread(
+      {}, "FakeClockAdvancerThread", [env, start, stop] {
+        start->WaitForNotification();
+        while (!stop->HasBeenNotified()) {
+          env->AdvanceByMicroseconds(10);
+          Env::Default()->SleepForMicroseconds(10);
+        }
+      }));
 }
 
 TEST(SharedBatchSchedulerTest, Basic) {
@@ -258,7 +257,7 @@ TEST(SharedBatchSchedulerTest, ObeysTimeout) {
 TEST(SharedBatchSchedulerTest, ObeysTimeoutWithRealClock) {
   Notification first_batch_processed, second_batch_processed;
   auto callback = [&first_batch_processed, &second_batch_processed](
-      std::unique_ptr<Batch<FakeTask>> batch) {
+                      std::unique_ptr<Batch<FakeTask>> batch) {
     ASSERT_TRUE(batch->IsClosed());
     if (batch->size() == 1) {
       first_batch_processed.Notify();
@@ -301,7 +300,7 @@ TEST(SharedBatchSchedulerTest,
   {
     Notification first_batch_processed, second_batch_processed;
     auto callback = [&first_batch_processed, &second_batch_processed](
-        std::unique_ptr<Batch<FakeTask>> batch) {
+                        std::unique_ptr<Batch<FakeTask>> batch) {
       ASSERT_TRUE(batch->IsClosed());
       if (batch->size() == 1) {
         first_batch_processed.Notify();
@@ -349,7 +348,7 @@ TEST(SharedBatchSchedulerTest, Fairness) {
     auto queue_0_callback = [&queue_0_first_batch_scheduled,
                              &queue_0_first_batch_proceed,
                              &queue_0_second_batch_scheduled](
-        std::unique_ptr<Batch<FakeTask>> batch) {
+                                std::unique_ptr<Batch<FakeTask>> batch) {
       if (!queue_0_first_batch_scheduled.HasBeenNotified()) {
         queue_0_first_batch_scheduled.Notify();
         queue_0_first_batch_proceed.WaitForNotification();
@@ -467,7 +466,7 @@ TEST(SharedBatchSchedulerTest, ConstMethods) {
 TEST(SharedBatchSchedulerTest, OneFullQueueDoesntBlockOtherQueues) {
   Notification queue_0_processing, queue_0_proceed;
   auto queue_0_callback = [&queue_0_processing, &queue_0_proceed](
-      std::unique_ptr<Batch<FakeTask>> batch) {
+                              std::unique_ptr<Batch<FakeTask>> batch) {
     if (!queue_0_processing.HasBeenNotified()) {
       queue_0_processing.Notify();
       queue_0_proceed.WaitForNotification();
diff --git a/tensorflow/core/kernels/batchtospace_op.cc b/tensorflow/core/kernels/batchtospace_op.cc
index c1c0d6d329206088acaa009b3ffe695661527e44..b07c5fd718daea802a08650f97ccff393914e208 100644
--- a/tensorflow/core/kernels/batchtospace_op.cc
+++ b/tensorflow/core/kernels/batchtospace_op.cc
@@ -56,9 +56,10 @@ static void BatchToSpaceOpCompute(OpKernelContext* context,
       errors::InvalidArgument("input rank should be >= ", 1 + block_dims,
                               " instead of ", orig_input_tensor.dims()));
 
-  OP_REQUIRES(context, TensorShapeUtils::IsMatrix(orig_crops.shape()) &&
-                           block_dims == orig_crops.dim_size(0) &&
-                           2 == orig_crops.dim_size(1),
+  OP_REQUIRES(context,
+              TensorShapeUtils::IsMatrix(orig_crops.shape()) &&
+                  block_dims == orig_crops.dim_size(0) &&
+                  2 == orig_crops.dim_size(1),
               errors::InvalidArgument("crops should have shape [", block_dims,
                                       ", 2] instead of ",
                                       orig_crops.shape().DebugString()));
diff --git a/tensorflow/core/kernels/bcast_ops.cc b/tensorflow/core/kernels/bcast_ops.cc
index 7fc4b1762d0e56271bef586f0f8db0a2a66ff87d..8e4f08e473060b50d387d53aab89c10d0a26b93a 100644
--- a/tensorflow/core/kernels/bcast_ops.cc
+++ b/tensorflow/core/kernels/bcast_ops.cc
@@ -13,11 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/util/bcast.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/bcast.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/kernels/bias_op_gpu.cu.cc b/tensorflow/core/kernels/bias_op_gpu.cu.cc
index 42f3db1d79d4e0b0406f8c5c9abb423c03f30ab6..754b93b073a36d0925a0339956b8224878b849e1 100644
--- a/tensorflow/core/kernels/bias_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/bias_op_gpu.cu.cc
@@ -77,14 +77,14 @@ void BiasGPU<T>::compute(const GPUDevice& d, const T* input, const T* bias,
   }
   CudaLaunchConfig config = GetCudaLaunchConfig(total_count, d);
   if (data_format == FORMAT_NHWC) {
-    BiasNHWCKernel<
-        T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-        config.virtual_thread_count, input, bias, output, bias_size);
+    BiasNHWCKernel<T>
+        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+            config.virtual_thread_count, input, bias, output, bias_size);
   } else {
-    BiasNCHWKernel<
-        T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-        config.virtual_thread_count, input, bias, output, bias_size,
-        image_size);
+    BiasNCHWKernel<T>
+        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+            config.virtual_thread_count, input, bias, output, bias_size,
+            image_size);
   }
 }
 
@@ -173,19 +173,13 @@ __global__ void BiasGradNCHW_SharedAtomics(const T* output_backprop,
   // Accumulate the results in the shared memory into the first element.
   // No syncthreads is needed since this is only in the same warp.
   int32 thread_index = threadIdx.x;
-  if (thread_index < 16) {
-    s_data[thread_index] += s_data[thread_index + 16];
-    __syncwarp(0xFFFF);
-    if (thread_index < 8) s_data[thread_index] += s_data[thread_index + 8];
-    __syncwarp(0xFF);
-    if (thread_index < 4) s_data[thread_index] += s_data[thread_index + 4];
-    __syncwarp(0xF);
-    if (thread_index < 2) s_data[thread_index] += s_data[thread_index + 2];
-    __syncwarp(0x3);
+  if (thread_index < 32) {
+    AccT data = s_data[thread_index];
+    for (int32 delta = warpSize / 2; delta > 0; delta /= 2) {
+      data += CudaShuffleXorSync(kCudaWarpAll, data, delta);
+    }
     if (thread_index == 0) {
-      T val = T(s_data[0] + s_data[1]);
-      // The first thread writes out the accumulated result to global location.
-      CudaAtomicAdd(bias_backprop + bias_index, val);
+      CudaAtomicAdd(bias_backprop + bias_index, T(data));
     }
   }
 }
@@ -212,10 +206,10 @@ void BiasGradGPU<T>::compute(const GPUDevice& d, const T* output_backprop,
   // Check if we have enough shared memory.
   if (shared_memory_size <= max_shared_memory_size) {
     if (data_format == FORMAT_NHWC) {
-      BiasGradNHWC_SharedAtomics<
-          T><<<config.block_count, config.thread_per_block, shared_memory_size,
-               d.stream()>>>(total_count, output_backprop, bias_backprop,
-                             bias_size);
+      BiasGradNHWC_SharedAtomics<T>
+          <<<config.block_count, config.thread_per_block, shared_memory_size,
+             d.stream()>>>(total_count, output_backprop, bias_backprop,
+                           bias_size);
     } else {
       // Round up the block count to multiple of bias_size.
       int group_size = (config.block_count + bias_size - 1) / bias_size;
@@ -223,23 +217,24 @@ void BiasGradGPU<T>::compute(const GPUDevice& d, const T* output_backprop,
       if (config.thread_per_block < kWarpSize) {
         config.thread_per_block = kWarpSize;
       }
-      BiasGradNCHW_SharedAtomics<
-          T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-          output_backprop, bias_backprop, batch, bias_size, image_size,
-          group_size);
+      BiasGradNCHW_SharedAtomics<T>
+          <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+              output_backprop, bias_backprop, batch, bias_size, image_size,
+              group_size);
     }
   } else {
     // Note that even if we don't have enough shared memory to fit the entire
     // output block, it is possible to process one group of elements at a time.
     // But for now, we simply fall back to the naive implementation.
     if (data_format == FORMAT_NHWC) {
-      BiasGradNHWC_Naive<
-          T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-          total_count, output_backprop, bias_backprop, bias_size);
+      BiasGradNHWC_Naive<T>
+          <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+              total_count, output_backprop, bias_backprop, bias_size);
     } else {
-      BiasGradNCHW_Naive<
-          T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-          total_count, output_backprop, bias_backprop, bias_size, image_size);
+      BiasGradNCHW_Naive<T>
+          <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+              total_count, output_backprop, bias_backprop, bias_size,
+              image_size);
     }
   }
 }
diff --git a/tensorflow/core/kernels/bounds_check.h b/tensorflow/core/kernels/bounds_check.h
index e35f42ad4173348f63445030aef6c6de2b1de9a7..c8c60c55241ab2b1b3a426560959fed7ea893129 100644
--- a/tensorflow/core/kernels/bounds_check.h
+++ b/tensorflow/core/kernels/bounds_check.h
@@ -48,7 +48,7 @@ EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC const T SubtleMustCopy(const T &x) {
   auto *to_x = reinterpret_cast<const volatile T *>(&x);
   return *to_x;
 }
-}  // namespace tensorflow::internal
+}  // namespace internal
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_UTIL_BOUNDS_CHECK_H_
diff --git a/tensorflow/core/kernels/candidate_sampler_ops.cc b/tensorflow/core/kernels/candidate_sampler_ops.cc
index e937c4f11ba34e16d319b7b4dec317e81b6b8b2c..654d99301af5f528e4360d70edf4cadd4165382d 100644
--- a/tensorflow/core/kernels/candidate_sampler_ops.cc
+++ b/tensorflow/core/kernels/candidate_sampler_ops.cc
@@ -126,13 +126,13 @@ REGISTER_KERNEL_BUILDER(Name("UniformCandidateSampler").Device(DEVICE_CPU),
 REGISTER_KERNEL_BUILDER(Name("LogUniformCandidateSampler").Device(DEVICE_CPU),
                         SimpleCandidateSamplerOp<LogUniformSampler>);
 
-REGISTER_KERNEL_BUILDER(Name("LearnedUnigramCandidateSampler")
-                            .Device(DEVICE_CPU),
-                        SimpleCandidateSamplerOp<UnigramSampler>);
+REGISTER_KERNEL_BUILDER(
+    Name("LearnedUnigramCandidateSampler").Device(DEVICE_CPU),
+    SimpleCandidateSamplerOp<UnigramSampler>);
 
-REGISTER_KERNEL_BUILDER(Name("ThreadUnsafeUnigramCandidateSampler")
-                            .Device(DEVICE_CPU),
-                        SimpleCandidateSamplerOp<ThreadUnsafeUnigramSampler>);
+REGISTER_KERNEL_BUILDER(
+    Name("ThreadUnsafeUnigramCandidateSampler").Device(DEVICE_CPU),
+    SimpleCandidateSamplerOp<ThreadUnsafeUnigramSampler>);
 
 class AllCandidateSamplerOp : public BaseCandidateSamplerOp {
  public:
@@ -197,8 +197,9 @@ class ComputeAccidentalHitsOp : public OpKernel {
   void Compute(OpKernelContext* context) override {
     const Tensor& in_true_candidates = context->input(0);
     const TensorShape& in_true_candidates_shape = in_true_candidates.shape();
-    OP_REQUIRES(context, TensorShapeUtils::IsMatrix(in_true_candidates_shape) &&
-                             in_true_candidates_shape.dim_size(1) == num_true_,
+    OP_REQUIRES(context,
+                TensorShapeUtils::IsMatrix(in_true_candidates_shape) &&
+                    in_true_candidates_shape.dim_size(1) == num_true_,
                 errors::InvalidArgument(
                     "true_candidates must be a batch_size * num_true matrix"));
 
diff --git a/tensorflow/core/kernels/cast_op.cc b/tensorflow/core/kernels/cast_op.cc
index f16abb2b79fe24bfbe2711de03c7dfd0847b3003..626db9131aee28be13391ff9c1c92bf9f2d35dd0 100644
--- a/tensorflow/core/kernels/cast_op.cc
+++ b/tensorflow/core/kernels/cast_op.cc
@@ -36,7 +36,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 #define CURRY_TYPES2(FN, arg0)   \
   FN(arg0, bool);                \
@@ -223,11 +223,11 @@ class SyclCastOp : public CastOpBase {
   }
 };
 
-#define REGISTER_CAST_SYCL(srctype, dsttype)                    \
-  REGISTER_KERNEL_BUILDER(Name("Cast")                          \
-                              .TypeConstraint<srctype>("SrcT")  \
-                              .TypeConstraint<dsttype>("DstT")  \
-                              .Device(DEVICE_SYCL),             \
+#define REGISTER_CAST_SYCL(srctype, dsttype)                   \
+  REGISTER_KERNEL_BUILDER(Name("Cast")                         \
+                              .TypeConstraint<srctype>("SrcT") \
+                              .TypeConstraint<dsttype>("DstT") \
+                              .Device(DEVICE_SYCL),            \
                           SyclCastOp)
 CURRY_TYPES2(REGISTER_CAST_SYCL, bool);
 CURRY_TYPES2(REGISTER_CAST_SYCL, int32);
@@ -237,7 +237,7 @@ CURRY_TYPES2(REGISTER_CAST_SYCL, double);
 
 #undef REGISTER_CAST_SYCL
 
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 #undef CURRY_TYPES2
 
@@ -250,6 +250,5 @@ REGISTER_KERNEL_BUILDER(
 REGISTER_KERNEL_BUILDER(
     Name("_HostCast").Device(DEVICE_SYCL).HostMemory("x").HostMemory("y"),
     CpuCastOp);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // end namespace tensorflow
-
diff --git a/tensorflow/core/kernels/cast_op.h b/tensorflow/core/kernels/cast_op.h
index 8fedf2c271c2caf60a83fb1f4146dd94821c4643..fd4e75d26f02dc75e13c8781049c904587d10afd 100644
--- a/tensorflow/core/kernels/cast_op.h
+++ b/tensorflow/core/kernels/cast_op.h
@@ -131,7 +131,8 @@ struct scalar_cast_op<::tensorflow::bfloat16, float> {
     p[0] = a.value;
     p[1] = 0;
 #else
-    static_assert(::tensorflow::port::kLittleEndian, "Not a little endian system!");
+    static_assert(::tensorflow::port::kLittleEndian,
+                  "Not a little endian system!");
     p[0] = 0;
     p[1] = a.value;
 #endif
diff --git a/tensorflow/core/kernels/cast_op_impl.h b/tensorflow/core/kernels/cast_op_impl.h
index 470e9e08041e808f7459b3c654d55b82fde629a9..3ae9f2ab4d9c102941927215441b4c02625387f0 100644
--- a/tensorflow/core/kernels/cast_op_impl.h
+++ b/tensorflow/core/kernels/cast_op_impl.h
@@ -41,25 +41,25 @@ struct CastFunctor<Eigen::SyclDevice, O, I> {
     o.device(d) = i.template cast<O>();
   }
 };
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace functor
 
-#define CURRY_TYPES3_NO_HALF(FN, arg0, arg1)   \
-  FN(arg0, arg1, bool);                        \
-  FN(arg0, arg1, uint8);                       \
-  FN(arg0, arg1, int8);                        \
-  FN(arg0, arg1, uint16);                      \
-  FN(arg0, arg1, int16);                       \
-  FN(arg0, arg1, int32);                       \
-  FN(arg0, arg1, int64);                       \
-  FN(arg0, arg1, float);                       \
-  FN(arg0, arg1, double);                      \
-  FN(arg0, arg1, std::complex<float>);         \
+#define CURRY_TYPES3_NO_HALF(FN, arg0, arg1) \
+  FN(arg0, arg1, bool);                      \
+  FN(arg0, arg1, uint8);                     \
+  FN(arg0, arg1, int8);                      \
+  FN(arg0, arg1, uint16);                    \
+  FN(arg0, arg1, int16);                     \
+  FN(arg0, arg1, int32);                     \
+  FN(arg0, arg1, int64);                     \
+  FN(arg0, arg1, float);                     \
+  FN(arg0, arg1, double);                    \
+  FN(arg0, arg1, std::complex<float>);       \
   FN(arg0, arg1, std::complex<double>)
 
-#define CURRY_TYPES3(FN, arg0, arg1)           \
-  CURRY_TYPES3_NO_HALF(FN, arg0, arg1)         \
+#define CURRY_TYPES3(FN, arg0, arg1)   \
+  CURRY_TYPES3_NO_HALF(FN, arg0, arg1) \
   FN(arg0, arg1, Eigen::half);
 
 #define CAST_CASE(DEVICE, IN, OUT)                                         \
diff --git a/tensorflow/core/kernels/cast_op_test.cc b/tensorflow/core/kernels/cast_op_test.cc
index a106f287c1845a108f596b960b65a6392c35b071..057e209a71903ad24e2d4f757e4d2a3bc4357a76 100644
--- a/tensorflow/core/kernels/cast_op_test.cc
+++ b/tensorflow/core/kernels/cast_op_test.cc
@@ -107,10 +107,10 @@ static void BM_gpu_float_int64(int iters, int num) {
   testing::UseRealTime();
 #if GOOGLE_CUDA
   test::Benchmark("gpu", Cast<float, int64>(num)).Run(iters);
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 #ifdef TENSORFLOW_USE_SYCL
   test::Benchmark("sycl", Cast<float, int64>(num)).Run(iters);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }
 BENCHMARK(BM_gpu_float_int64)->Arg(64 << 10)->Arg(32 << 20);
 
@@ -130,10 +130,10 @@ static void BM_gpu_bool_float(int iters, int num) {
   testing::UseRealTime();
 #if GOOGLE_CUDA
   test::Benchmark("gpu", Cast<bool, float>(num)).Run(iters);
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 #ifdef TENSORFLOW_USE_SYCL
   test::Benchmark("sycl", Cast<bool, float>(num)).Run(iters);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }
 BENCHMARK(BM_gpu_bool_float)->Arg(64 << 10)->Arg(32 << 20);
 
@@ -180,7 +180,7 @@ static void BM_gpu_float_half(int iters, int num) {
   testing::UseRealTime();
 #if GOOGLE_CUDA
   test::Benchmark("gpu", Cast<float, Eigen::half>(num)).Run(iters);
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 }
 BENCHMARK(BM_gpu_float_half)->Arg(64 << 10)->Arg(32 << 20);
 
@@ -191,7 +191,7 @@ static void BM_gpu_half_float(int iters, int num) {
   testing::UseRealTime();
 #if GOOGLE_CUDA
   test::Benchmark("gpu", Cast<Eigen::half, float>(num)).Run(iters);
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 }
 BENCHMARK(BM_gpu_half_float)->Arg(64 << 10)->Arg(32 << 20);
 
diff --git a/tensorflow/core/kernels/colorspace_op.cc b/tensorflow/core/kernels/colorspace_op.cc
index ba100b32e7d8cfcd6a0138a09062910743d6d2eb..9cc2e67bbe1f6919d581def55eb4315f7b908ca3 100644
--- a/tensorflow/core/kernels/colorspace_op.cc
+++ b/tensorflow/core/kernels/colorspace_op.cc
@@ -107,14 +107,14 @@ class HSVToRGBOp : public OpKernel {
   }
 };
 
-#define REGISTER_CPU(T)                                       \
-  REGISTER_KERNEL_BUILDER(Name("RGBToHSV").Device(DEVICE_CPU) \
-                              .TypeConstraint<T>("T"),        \
-                          RGBToHSVOp<CPUDevice, T>);          \
-  template class RGBToHSVOp<CPUDevice, T>;                    \
-  REGISTER_KERNEL_BUILDER(Name("HSVToRGB").Device(DEVICE_CPU) \
-                              .TypeConstraint<T>("T"),        \
-                          HSVToRGBOp<CPUDevice, T>);          \
+#define REGISTER_CPU(T)                                           \
+  REGISTER_KERNEL_BUILDER(                                        \
+      Name("RGBToHSV").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
+      RGBToHSVOp<CPUDevice, T>);                                  \
+  template class RGBToHSVOp<CPUDevice, T>;                        \
+  REGISTER_KERNEL_BUILDER(                                        \
+      Name("HSVToRGB").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
+      HSVToRGBOp<CPUDevice, T>);                                  \
   template class HSVToRGBOp<CPUDevice, T>;
 TF_CALL_float(REGISTER_CPU);
 TF_CALL_double(REGISTER_CPU);
@@ -123,40 +123,39 @@ TF_CALL_double(REGISTER_CPU);
 // Forward declarations of the function specializations for GPU (to prevent
 // building the GPU versions here, they will be built compiling _gpu.cu.cc).
 namespace functor {
-#define DECLARE_GPU(T)                                        \
-  template <>                                                 \
-  void RGBToHSV<GPUDevice, T>::operator()(const GPUDevice& d, \
-      TTypes<T, 2>::ConstTensor input_data,                   \
-      TTypes<T, 1>::Tensor range,                             \
-      TTypes<T, 2>::Tensor output_data);                      \
-  extern template struct RGBToHSV<GPUDevice, T>;              \
-  template <>                                                 \
-  void HSVToRGB<GPUDevice, T>::operator()(const GPUDevice& d, \
-      TTypes<T, 2>::ConstTensor input_data,                   \
-      TTypes<T, 2>::Tensor output_data);                      \
+#define DECLARE_GPU(T)                                               \
+  template <>                                                        \
+  void RGBToHSV<GPUDevice, T>::operator()(                           \
+      const GPUDevice& d, TTypes<T, 2>::ConstTensor input_data,      \
+      TTypes<T, 1>::Tensor range, TTypes<T, 2>::Tensor output_data); \
+  extern template struct RGBToHSV<GPUDevice, T>;                     \
+  template <>                                                        \
+  void HSVToRGB<GPUDevice, T>::operator()(                           \
+      const GPUDevice& d, TTypes<T, 2>::ConstTensor input_data,      \
+      TTypes<T, 2>::Tensor output_data);                             \
   extern template struct HSVToRGB<GPUDevice, T>;
 TF_CALL_float(DECLARE_GPU);
 TF_CALL_double(DECLARE_GPU);
 }  // namespace functor
-#define REGISTER_GPU(T)                                       \
-  REGISTER_KERNEL_BUILDER(Name("RGBToHSV").Device(DEVICE_GPU) \
-                              .TypeConstraint<T>("T"),        \
-                          RGBToHSVOp<GPUDevice, T>);          \
-  REGISTER_KERNEL_BUILDER(Name("HSVToRGB").Device(DEVICE_GPU) \
-                              .TypeConstraint<T>("T"),        \
-                          HSVToRGBOp<GPUDevice, T>);
+#define REGISTER_GPU(T)                                           \
+  REGISTER_KERNEL_BUILDER(                                        \
+      Name("RGBToHSV").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
+      RGBToHSVOp<GPUDevice, T>);                                  \
+  REGISTER_KERNEL_BUILDER(                                        \
+      Name("HSVToRGB").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
+      HSVToRGBOp<GPUDevice, T>);
 TF_CALL_float(REGISTER_GPU);
 TF_CALL_double(REGISTER_GPU);
 #endif
 
 #ifdef TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL(T)                                       \
-  REGISTER_KERNEL_BUILDER(Name("RGBToHSV").Device(DEVICE_SYCL) \
-                              .TypeConstraint<T>("T"),         \
-                          RGBToHSVOp<SYCLDevice, T>);          \
-  REGISTER_KERNEL_BUILDER(Name("HSVToRGB").Device(DEVICE_SYCL) \
-                              .TypeConstraint<T>("T"),         \
-                          HSVToRGBOp<SYCLDevice, T>);
+#define REGISTER_SYCL(T)                                           \
+  REGISTER_KERNEL_BUILDER(                                         \
+      Name("RGBToHSV").Device(DEVICE_SYCL).TypeConstraint<T>("T"), \
+      RGBToHSVOp<SYCLDevice, T>);                                  \
+  REGISTER_KERNEL_BUILDER(                                         \
+      Name("HSVToRGB").Device(DEVICE_SYCL).TypeConstraint<T>("T"), \
+      HSVToRGBOp<SYCLDevice, T>);
 TF_CALL_float(REGISTER_SYCL);
 TF_CALL_double(REGISTER_SYCL);
 #endif
diff --git a/tensorflow/core/kernels/colorspace_op.h b/tensorflow/core/kernels/colorspace_op.h
index c5721ef6dd067e9df0b1c23ac471667edee06fb3..90bfce14194bb04a3ebe8418fcc4d1beaab4fc2b 100644
--- a/tensorflow/core/kernels/colorspace_op.h
+++ b/tensorflow/core/kernels/colorspace_op.h
@@ -54,10 +54,9 @@ struct RGBToHSV {
     // TODO(wicke): all these assignments are only necessary because a combined
     // expression is larger than kernel parameter space. A custom kernel is
     // probably in order.
-    H.device(d) = (R == V).select(norm * (G - B),
-                                  (G == V).select(
-                                      norm * (B - R) + T(2) / T(6),
-                                      norm * (R - G) + T(4) / T(6)));
+    H.device(d) = (R == V).select(
+        norm * (G - B), (G == V).select(norm * (B - R) + T(2) / T(6),
+                                        norm * (R - G) + T(4) / T(6)));
     H.device(d) = (range > T(0)).select(H, H.constant(T(0)));
     H.device(d) = (H < T(0)).select(H + T(1), H);
   }
diff --git a/tensorflow/core/kernels/colorspace_op_gpu.cu.cc b/tensorflow/core/kernels/colorspace_op_gpu.cu.cc
index e19d0b14d5df5c125c3fb071ea6ae6580fba8c6a..61f9ba44c46f1cee87a72349f8e4ebdd6d2e750f 100644
--- a/tensorflow/core/kernels/colorspace_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/colorspace_op_gpu.cu.cc
@@ -17,8 +17,8 @@ limitations under the License.
 
 #define EIGEN_USE_GPU
 
-#include "tensorflow/core/kernels/colorspace_op.h"
 #include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/kernels/colorspace_op.h"
 
 namespace tensorflow {
 
@@ -29,6 +29,6 @@ typedef Eigen::GpuDevice GPUDevice;
   template class functor::HSVToRGB<GPUDevice, T>;
 TF_CALL_float(INSTANTIATE_GPU);
 TF_CALL_double(INSTANTIATE_GPU);
-}
+}  // namespace tensorflow
 
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/colorspace_op_test.cc b/tensorflow/core/kernels/colorspace_op_test.cc
index 8c6fb732abf40c52c0a9e9a5c338de859c669838..bd82826770f192acd50ca4212a475881fe5c34fc 100644
--- a/tensorflow/core/kernels/colorspace_op_test.cc
+++ b/tensorflow/core/kernels/colorspace_op_test.cc
@@ -224,34 +224,34 @@ class HSVToRGBOpTest : public OpsTestBase {
   }
 };
 
-#define TEST_COLORSPACE(test, dt)                               \
-  TEST_F(test, CheckBlack) {                                    \
-    MakeOp(dt);                                                 \
-    CheckBlack(dt);                                             \
-  }                                                             \
-  TEST_F(test, CheckGray) {                                     \
-    MakeOp(dt);                                                 \
-    CheckGray(dt);                                              \
-  }                                                             \
-  TEST_F(test, CheckWhite) {                                    \
-    MakeOp(dt);                                                 \
-    CheckWhite(dt);                                             \
-  }                                                             \
-  TEST_F(test, CheckRedMax) {                                   \
-    MakeOp(dt);                                                 \
-    CheckRedMax(dt);                                            \
-  }                                                             \
-  TEST_F(test, CheckGreenMax) {                                 \
-    MakeOp(dt);                                                 \
-    CheckGreenMax(dt);                                          \
-  }                                                             \
-  TEST_F(test, CheckBlueMax) {                                  \
-    MakeOp(dt);                                                 \
-    CheckBlueMax(dt);                                           \
-  }                                                             \
-  TEST_F(test, CheckNegativeDifference) {                       \
-    MakeOp(dt);                                                 \
-    CheckNegativeDifference(dt);                                \
+#define TEST_COLORSPACE(test, dt)         \
+  TEST_F(test, CheckBlack) {              \
+    MakeOp(dt);                           \
+    CheckBlack(dt);                       \
+  }                                       \
+  TEST_F(test, CheckGray) {               \
+    MakeOp(dt);                           \
+    CheckGray(dt);                        \
+  }                                       \
+  TEST_F(test, CheckWhite) {              \
+    MakeOp(dt);                           \
+    CheckWhite(dt);                       \
+  }                                       \
+  TEST_F(test, CheckRedMax) {             \
+    MakeOp(dt);                           \
+    CheckRedMax(dt);                      \
+  }                                       \
+  TEST_F(test, CheckGreenMax) {           \
+    MakeOp(dt);                           \
+    CheckGreenMax(dt);                    \
+  }                                       \
+  TEST_F(test, CheckBlueMax) {            \
+    MakeOp(dt);                           \
+    CheckBlueMax(dt);                     \
+  }                                       \
+  TEST_F(test, CheckNegativeDifference) { \
+    MakeOp(dt);                           \
+    CheckNegativeDifference(dt);          \
   }
 
 typedef RGBToHSVOpTest<float> rgb_to_hsv_float;
diff --git a/tensorflow/core/kernels/compare_and_bitpack_op.cc b/tensorflow/core/kernels/compare_and_bitpack_op.cc
index 9f626a274a4d36b568cc6e25af2e572a35ae3694..39e4f24ed51fda76dfb21481197644db1cdd7001 100644
--- a/tensorflow/core/kernels/compare_and_bitpack_op.cc
+++ b/tensorflow/core/kernels/compare_and_bitpack_op.cc
@@ -110,7 +110,20 @@ struct ComputeShard<T,
       typename TTypes<bool>::ConstMatrix input,
       typename TTypes<uint8>::Matrix output, bool /*thresh*/, int64 start,
       int64 limit) {
-    // NOTE(ebrevdo): This assumes memory is little-endian.
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+    for (int64 i = start; i < limit; ++i) {
+      uint8* out = output.data() + i;
+      const int64 block = *reinterpret_cast<const int64*>(input.data() + 8 * i);
+      *out =
+          ((((block & (1LL << (7 * 8))) >> (7 * 8 - 7))) |
+           (((block & (1LL << (6 * 8))) >> (6 * 8 - 6))) |
+           (((block & (1LL << (5 * 8))) >> (5 * 8 - 5))) |
+           (((block & (1LL << (4 * 8))) >> (4 * 8 - 4))) |
+           (((block & (1LL << (3 * 8))) >> (3 * 8 - 3))) |
+           (((block & (1LL << (2 * 8))) >> (2 * 8 - 2))) |
+           (((block & (1LL << 8)) >> (1 * 8 - 1))) | (((block & (1LL)))));
+    }
+#else
     for (int64 i = start; i < limit; ++i) {
       uint8* out = output.data() + i;
       const int64 block = *reinterpret_cast<const int64*>(input.data() + 8 * i);
@@ -123,6 +136,7 @@ struct ComputeShard<T,
            (((block & (1LL << (2 * 8))) >> (2 * 8 - 5))) |
            (((block & (1LL << 8)) >> (1 * 8 - 6))) | (((block & (1LL)) << 7)));
     }
+#endif
   }
 };
 
diff --git a/tensorflow/core/kernels/concat_lib.h b/tensorflow/core/kernels/concat_lib.h
index 526f9420d72fa25ff21bf60b3594649fa1faa0ba..16784c4770eb8626c11dc47104fea3af6c5edc07 100644
--- a/tensorflow/core/kernels/concat_lib.h
+++ b/tensorflow/core/kernels/concat_lib.h
@@ -41,10 +41,11 @@ namespace tensorflow {
 
 // Assumes all inputs are nonempty
 template <typename T>
-void ConcatCPU(DeviceBase* d,
-               const std::vector<
-                   std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>& inputs,
-               typename TTypes<T, 2>::Matrix* output);
+void ConcatCPU(
+    DeviceBase* d,
+    const std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>&
+        inputs,
+    typename TTypes<T, 2>::Matrix* output);
 #if GOOGLE_CUDA
 template <typename T>
 void ConcatGPU(
@@ -57,11 +58,12 @@ void ConcatGPU(
 
 #ifdef TENSORFLOW_USE_SYCL
 template <typename T>
-void ConcatSYCL(const Eigen::SyclDevice& d,
-               const std::vector<
-                   std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>& inputs,
-               typename TTypes<T, 2>::Matrix* output);
-#endif // TENSORFLOW_USE_SYCL
+void ConcatSYCL(
+    const Eigen::SyclDevice& d,
+    const std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>&
+        inputs,
+    typename TTypes<T, 2>::Matrix* output);
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_KERNELS_CONCAT_LIB_H_
diff --git a/tensorflow/core/kernels/concat_lib_cpu.cc b/tensorflow/core/kernels/concat_lib_cpu.cc
index 43731114c0b9a87598da19466c0fd9c7e05644bb..fc5a3e62885c92ec16a906df5a6e2d6245ccbbd6 100644
--- a/tensorflow/core/kernels/concat_lib_cpu.cc
+++ b/tensorflow/core/kernels/concat_lib_cpu.cc
@@ -48,10 +48,11 @@ struct MemCpyCopier<ResourceHandle> {
 }  // namespace
 
 template <typename T>
-void ConcatCPU(DeviceBase* d,
-               const std::vector<
-                   std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>& inputs,
-               typename TTypes<T, 2>::Matrix* output) {
+void ConcatCPU(
+    DeviceBase* d,
+    const std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>&
+        inputs,
+    typename TTypes<T, 2>::Matrix* output) {
   if (std::is_same<T, string>::value) {
     // use a large cost here to force strings to be handled by separate threads
     ConcatCPUImpl<T>(d, inputs, 100000, MemCpyCopier<T>(), output);
@@ -86,21 +87,22 @@ TF_CALL_variant(REGISTER)
 
 #ifdef TENSORFLOW_USE_SYCL
 template <typename T>
-void ConcatSYCL(const Eigen::SyclDevice& d,
-               const std::vector<
-                   std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>& inputs,
-               typename TTypes<T, 2>::Matrix* output) {
+void ConcatSYCL(
+    const Eigen::SyclDevice& d,
+    const std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>&
+        inputs,
+    typename TTypes<T, 2>::Matrix* output) {
   ConcatSYCLImpl<T>(d, inputs, sizeof(T) /* cost_per_unit */, MemCpyCopier<T>(),
-                   output);
+                    output);
 }
-#define REGISTER_SYCL(T)                                                      \
- template void ConcatSYCL<T>(                                                 \
-     const Eigen::SyclDevice&,                                                \
-     const std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>&, \
-     typename TTypes<T, 2>::Matrix* output);
+#define REGISTER_SYCL(T)                                                       \
+  template void ConcatSYCL<T>(                                                 \
+      const Eigen::SyclDevice&,                                                \
+      const std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>&, \
+      typename TTypes<T, 2>::Matrix* output);
 
 TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL)
 
 #undef REGISTER_SYCL
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/concat_lib_cpu.h b/tensorflow/core/kernels/concat_lib_cpu.h
index 6a933efde4b6ababf35c83c94d233e4aa2552d84..720b5065377b49859fdecc2634d14fe308432fe3 100644
--- a/tensorflow/core/kernels/concat_lib_cpu.h
+++ b/tensorflow/core/kernels/concat_lib_cpu.h
@@ -15,9 +15,9 @@ limitations under the License.
 
 #define EIGEN_USE_THREADS
 
-#include "tensorflow/core/kernels/concat_lib.h"
 #include <vector>
 #include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/kernels/concat_lib.h"
 #include "tensorflow/core/util/work_sharder.h"
 
 namespace tensorflow {
@@ -73,7 +73,7 @@ void ConcatCPUImpl(
 
   // Sharded mode.
   auto work = [&row_size, &sizes, &inputs, &output, &copier, &num_inputs](
-      int64 start, int64 end) {
+                  int64 start, int64 end) {
     int64 skipped_rows = start / row_size;
     T* out = output->data() + skipped_rows * row_size;
     T* out_start = output->data() + start;
@@ -160,5 +160,5 @@ void ConcatSYCLImpl(
     }
   }
 }
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/concat_op.cc b/tensorflow/core/kernels/concat_op.cc
index ae1b5da32ea12d94a01ae67563f03dda42d6ead4..7011550f7e161c9727b8d31eff0917964b09044e 100644
--- a/tensorflow/core/kernels/concat_op.cc
+++ b/tensorflow/core/kernels/concat_op.cc
@@ -37,7 +37,7 @@ typedef Eigen::GpuDevice GPUDevice;
 #endif  // GOOGLE_CUDA
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 enum AxisArgumentName { NAME_IS_AXIS, NAME_IS_CONCAT_DIM };
 
@@ -71,8 +71,9 @@ class ConcatBaseOp : public OpKernel {
     const TensorShape& input_shape = values[0].shape();
 
     int32 axis = concat_dim < 0 ? concat_dim + input_dims : concat_dim;
-    OP_REQUIRES(c, (0 <= axis && axis < input_dims) ||
-                       (allow_legacy_scalars() && concat_dim == 0),
+    OP_REQUIRES(c,
+                (0 <= axis && axis < input_dims) ||
+                    (allow_legacy_scalars() && concat_dim == 0),
                 errors::InvalidArgument(
                     "ConcatOp : Expected concatenating dimensions in the range "
                     "[",
@@ -97,8 +98,8 @@ class ConcatBaseOp : public OpKernel {
           c, in.dims() == input_dims || (input_is_scalar && in_is_scalar),
           errors::InvalidArgument(
               "ConcatOp : Ranks of all input tensors should match: shape[0] = ",
-              input_shape.DebugString(), " vs. shape[", i, "] = ",
-              in.shape().DebugString()));
+              input_shape.DebugString(), " vs. shape[", i,
+              "] = ", in.shape().DebugString()));
       for (int j = 0; j < input_dims; ++j) {
         if (j == axis) {
           continue;
@@ -107,8 +108,8 @@ class ConcatBaseOp : public OpKernel {
             c, in.dim_size(j) == input_shape.dim_size(j),
             errors::InvalidArgument(
                 "ConcatOp : Dimensions of inputs should match: shape[0] = ",
-                input_shape.DebugString(), " vs. shape[", i, "] = ",
-                in.shape().DebugString()));
+                input_shape.DebugString(), " vs. shape[", i,
+                "] = ", in.shape().DebugString()));
       }
       if (in.NumElements() > 0) {
         int64 inputs_flat_dim1 = in.NumElements() / inputs_flat_dim0;
@@ -142,7 +143,7 @@ class ConcatBaseOp : public OpKernel {
         ConcatSYCL<T>(c->eigen_sycl_device(), inputs_flat, &output_flat);
         return;
       }
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
       ConcatCPU<T>(c->device(), inputs_flat, &output_flat);
     }
   }
@@ -252,7 +253,7 @@ REGISTER_KERNEL_BUILDER(Name("ConcatV2")
                         ConcatV2Op<CPUDevice, int32>);
 
 #undef REGISTER_SYCL
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 class ConcatOffsetOp : public OpKernel {
  public:
@@ -347,5 +348,5 @@ REGISTER_KERNEL_BUILDER(Name("ConcatOffset")
                             .HostMemory("shape")
                             .HostMemory("offset"),
                         ConcatOffsetOp);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/concat_op_test.cc b/tensorflow/core/kernels/concat_op_test.cc
index c5bded9dafcdaf7264649e00db2ea2766db8eea9..e3ba8ae9f691c8ec9be79952d7f97801552b2a56 100644
--- a/tensorflow/core/kernels/concat_op_test.cc
+++ b/tensorflow/core/kernels/concat_op_test.cc
@@ -157,7 +157,8 @@ BENCHMARK(BM_MemcpyAlternativeDim0)->Arg(1000)->Arg(100000)->Arg(1000000);
 BENCHMARK(BM_MemcpyAlternativeDim1)->Arg(1000)->Arg(100000)->Arg(1000000);
 
 typedef Eigen::TensorMap<Eigen::Tensor<bfloat16, 1, Eigen::RowMajor>,
-                         Eigen::Unaligned> EigenMap;
+                         Eigen::Unaligned>
+    EigenMap;
 static void MemcpyManyAlternative1(int iters, int dim2) {
   testing::StopTiming();
 
diff --git a/tensorflow/core/kernels/conditional_accumulator_base.h b/tensorflow/core/kernels/conditional_accumulator_base.h
index 794ac6fa6de1eb06fcfa614bbfa472814d630d99..c7c7c983691c6f5257622940d183d06304ee74f1 100644
--- a/tensorflow/core/kernels/conditional_accumulator_base.h
+++ b/tensorflow/core/kernels/conditional_accumulator_base.h
@@ -160,7 +160,7 @@ class ConditionalAccumulatorBase : public ResourceBase {
  * Modifications to convenience macros defined in core/framework/op_kernel.h.
  * The below macros return a boolean if the test fails, so that the calling
  * function can get an indication that a failure has occurred.
-*/
+ */
 #define OP_REQUIRES_BOOLEAN(CTX, EXP, STATUS)          \
   do {                                                 \
     if (!TF_PREDICT_TRUE(EXP)) {                       \
diff --git a/tensorflow/core/kernels/conditional_accumulator_op.cc b/tensorflow/core/kernels/conditional_accumulator_op.cc
index fa37916eaba4106fe8067b739e77e7f91631b1e9..e13bf8a4c63ebe86fbf3fcf2fdd50f928298d01b 100644
--- a/tensorflow/core/kernels/conditional_accumulator_op.cc
+++ b/tensorflow/core/kernels/conditional_accumulator_op.cc
@@ -99,9 +99,10 @@ class AccumulatorTakeGradientOp
                       ConditionalAccumulatorBase* accumulator,
                       DoneCallback callback) override {
     // Check signature
-    OP_REQUIRES_OK_ASYNC(ctx, ctx->MatchSignature({DT_STRING_REF, DT_INT32},
-                                                  {accumulator->dtype()}),
-                         callback);
+    OP_REQUIRES_OK_ASYNC(
+        ctx,
+        ctx->MatchSignature({DT_STRING_REF, DT_INT32}, {accumulator->dtype()}),
+        callback);
   }
 
  private:
@@ -111,5 +112,4 @@ class AccumulatorTakeGradientOp
 REGISTER_KERNEL_BUILDER(Name("AccumulatorTakeGradient").Device(DEVICE_CPU),
                         AccumulatorTakeGradientOp);
 
-
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/constant_op.cc b/tensorflow/core/kernels/constant_op.cc
index 59f9f69315e1a1b8740ee787fa93df686dfa01d8..920cd87858ab62357d6d65e0d4db4c26d157a75c 100644
--- a/tensorflow/core/kernels/constant_op.cc
+++ b/tensorflow/core/kernels/constant_op.cc
@@ -146,7 +146,6 @@ typedef Eigen::GpuDevice GPUDevice;
 typedef Eigen::SyclDevice SYCLDevice;
 #endif  // TENSORFLOW_USE_SYCL
 
-
 template <typename Device, typename T, typename Index>
 class FillOp : public OpKernel {
  public:
diff --git a/tensorflow/core/kernels/control_flow_ops.cc b/tensorflow/core/kernels/control_flow_ops.cc
index 8fe82d118a702ec6809d6f4f4385fa3dc0949037..7d5d54e5bece7d448e7c11c6061109e9e8554008 100644
--- a/tensorflow/core/kernels/control_flow_ops.cc
+++ b/tensorflow/core/kernels/control_flow_ops.cc
@@ -113,47 +113,47 @@ REGISTER_GPU_HOST_REF_KERNEL(string);
 #undef REGISTER_GPU_HOST_REF_KERNEL
 
 #ifdef TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_SWITCH(type)                       \
-  REGISTER_KERNEL_BUILDER(Name("Switch")                 \
-                              .Device(DEVICE_SYCL)       \
-                              .HostMemory("pred")        \
-                              .TypeConstraint<type>("T"),\
+#define REGISTER_SYCL_SWITCH(type)                        \
+  REGISTER_KERNEL_BUILDER(Name("Switch")                  \
+                              .Device(DEVICE_SYCL)        \
+                              .HostMemory("pred")         \
+                              .TypeConstraint<type>("T"), \
                           SwitchOp)
 TF_CALL_REAL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_SWITCH);
 
-#define REGISTER_SYCL_REF_SWITCH(type)                     \
-  REGISTER_KERNEL_BUILDER(Name("RefSwitch")                \
-                              .Device(DEVICE_SYCL)         \
-                              .HostMemory("pred")          \
-                              .TypeConstraint<type>("T"),  \
+#define REGISTER_SYCL_REF_SWITCH(type)                    \
+  REGISTER_KERNEL_BUILDER(Name("RefSwitch")               \
+                              .Device(DEVICE_SYCL)        \
+                              .HostMemory("pred")         \
+                              .TypeConstraint<type>("T"), \
                           SwitchOp)
 TF_CALL_REAL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_REF_SWITCH);
 
 #undef REGISTER_SYCL_SWITCH
 #undef REGISTER_SYCL_REF_SWITCH
 
-#define REGISTER_SYCL_HOST_KERNEL(type)                  \
-  REGISTER_KERNEL_BUILDER(Name("Switch")                 \
-                              .Device(DEVICE_SYCL)       \
-                              .HostMemory("data")        \
-                              .HostMemory("pred")        \
-                              .HostMemory("output_false")\
-                              .HostMemory("output_true") \
-                              .TypeConstraint<type>("T"),\
+#define REGISTER_SYCL_HOST_KERNEL(type)                   \
+  REGISTER_KERNEL_BUILDER(Name("Switch")                  \
+                              .Device(DEVICE_SYCL)        \
+                              .HostMemory("data")         \
+                              .HostMemory("pred")         \
+                              .HostMemory("output_false") \
+                              .HostMemory("output_true")  \
+                              .TypeConstraint<type>("T"), \
                           SwitchOp)
 
 REGISTER_SYCL_HOST_KERNEL(bool);
 REGISTER_SYCL_HOST_KERNEL(string);
 REGISTER_SYCL_HOST_KERNEL(int32);
 
-#define REGISTER_SYCL_HOST_REF_KERNEL(type)                \
-  REGISTER_KERNEL_BUILDER(Name("RefSwitch")                \
-                              .Device(DEVICE_SYCL)         \
-                              .HostMemory("data")          \
-                              .HostMemory("pred")          \
-                              .HostMemory("output_false")  \
-                              .HostMemory("output_true")   \
-                              .TypeConstraint<type>("T"),  \
+#define REGISTER_SYCL_HOST_REF_KERNEL(type)               \
+  REGISTER_KERNEL_BUILDER(Name("RefSwitch")               \
+                              .Device(DEVICE_SYCL)        \
+                              .HostMemory("data")         \
+                              .HostMemory("pred")         \
+                              .HostMemory("output_false") \
+                              .HostMemory("output_true")  \
+                              .TypeConstraint<type>("T"), \
                           SwitchOp)
 
 REGISTER_SYCL_HOST_REF_KERNEL(int32);
@@ -162,7 +162,7 @@ REGISTER_SYCL_HOST_REF_KERNEL(string);
 
 #undef REGISTER_SYCL_HOST_KERNEL
 #undef REGISTER_SYCL_HOST_REF_KERNEL
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 class RefSelectOp : public OpKernel {
  public:
@@ -282,7 +282,7 @@ TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_REF_KERNEL);
 
 #undef REGISTER_SYCL_KERNEL
 #undef REGISTER_SYCL_REF_KERNEL
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 // Special GPU kernels for int32 and string.
 // TODO(b/25387198): Also enable int32 in device memory. This kernel
@@ -331,7 +331,7 @@ REGISTER_SYCL_HOST_KERNEL(string);
 REGISTER_SYCL_HOST_KERNEL(ResourceHandle);
 
 #undef REGISTER_SYCL_HOST_KERNEL
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 void EnterOp::Compute(OpKernelContext* context) {
   if (IsRefType(context->input_dtype(0))) {
@@ -360,14 +360,14 @@ REGISTER_GPU_REF_KERNEL(bool);
 #undef REGISTER_GPU_REF_KERNEL
 
 #ifdef TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNEL(type)  \
-  REGISTER_KERNEL_BUILDER(          \
+#define REGISTER_SYCL_KERNEL(type) \
+  REGISTER_KERNEL_BUILDER(         \
       Name("Enter").Device(DEVICE_SYCL).TypeConstraint<type>("T"), EnterOp)
 REGISTER_SYCL_KERNEL(bool);
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL);
 
-#define REGISTER_SYCL_REF_KERNEL(type)  \
-  REGISTER_KERNEL_BUILDER(              \
+#define REGISTER_SYCL_REF_KERNEL(type) \
+  REGISTER_KERNEL_BUILDER(             \
       Name("RefEnter").Device(DEVICE_SYCL).TypeConstraint<type>("T"), EnterOp)
 REGISTER_SYCL_REF_KERNEL(bool);
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_REF_KERNEL);
@@ -398,7 +398,7 @@ REGISTER_SYCL_HOST_KERNEL(ResourceHandle);
 
 #undef REGISTER_SYCL_HOST_KERNEL
 #undef REGISTER_SYCL_HOST_REF_KERNEL
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 // Special GPU kernels for int32 and string.
 // TODO(b/25387198): Also enable int32 in device memory. This kernel
@@ -455,10 +455,10 @@ REGISTER_GPU_REF_KERNEL(bool);
 #undef REGISTER_GPU_REF_KERNEL
 
 #ifdef TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNEL(type)                                           \
-  REGISTER_KERNEL_BUILDER(                                                   \
-      Name("Exit").Device(DEVICE_SYCL).TypeConstraint<type>("T"), ExitOp);   \
-  REGISTER_KERNEL_BUILDER(                                                   \
+#define REGISTER_SYCL_KERNEL(type)                                         \
+  REGISTER_KERNEL_BUILDER(                                                 \
+      Name("Exit").Device(DEVICE_SYCL).TypeConstraint<type>("T"), ExitOp); \
+  REGISTER_KERNEL_BUILDER(                                                 \
       Name("RefExit").Device(DEVICE_SYCL).TypeConstraint<type>("T"), ExitOp);
 REGISTER_SYCL_KERNEL(bool);
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL);
@@ -483,7 +483,7 @@ TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL);
 REGISTER_SYCL_HOST_KERNEL(int32);
 REGISTER_SYCL_HOST_KERNEL(string);
 #undef REGISTER_SYCL_HOST_KERNEL
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 // Special GPU kernels for int32 and string.
 // TODO(b/25387198): Also enable int32 in device memory. This kernel
@@ -556,12 +556,12 @@ REGISTER_GPU_HOST_KERNEL(string);
 #undef REGISTER_GPU_HOST_KERNEL
 
 #ifdef TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNEL(type)                                           \
-  REGISTER_KERNEL_BUILDER(                                                   \
-      Name("NextIteration").Device(DEVICE_SYCL).TypeConstraint<type>("T"),   \
-      NextIterationOp);                                                      \
-  REGISTER_KERNEL_BUILDER(                                                   \
-      Name("RefNextIteration").Device(DEVICE_SYCL).TypeConstraint<type>("T"),\
+#define REGISTER_SYCL_KERNEL(type)                                            \
+  REGISTER_KERNEL_BUILDER(                                                    \
+      Name("NextIteration").Device(DEVICE_SYCL).TypeConstraint<type>("T"),    \
+      NextIterationOp);                                                       \
+  REGISTER_KERNEL_BUILDER(                                                    \
+      Name("RefNextIteration").Device(DEVICE_SYCL).TypeConstraint<type>("T"), \
       NextIterationOp)
 REGISTER_SYCL_KERNEL(bool);
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL);
@@ -585,7 +585,7 @@ TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL);
 REGISTER_SYCL_HOST_KERNEL(int32);
 REGISTER_SYCL_HOST_KERNEL(string);
 #undef REGISTER_SYCL_HOST_KERNEL
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 // A LoopCond op has one input and one output. The input is a boolean
 // scalar representing the taken branches of the "pivot" Switch that
@@ -619,7 +619,7 @@ REGISTER_KERNEL_BUILDER(Name("LoopCond")
                             .HostMemory("input")
                             .HostMemory("output"),
                         LoopCondOp);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 // ControlTrigger kernels
 REGISTER_KERNEL_BUILDER(Name("ControlTrigger").Device(DEVICE_CPU),
@@ -631,7 +631,7 @@ REGISTER_KERNEL_BUILDER(Name("ControlTrigger").Device(DEVICE_GPU),
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER_KERNEL_BUILDER(Name("ControlTrigger").Device(DEVICE_SYCL),
                         ControlTriggerOp);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 // When called, abort op will abort the current process. This can be used to
 // abort remote PSs when needed.
diff --git a/tensorflow/core/kernels/control_flow_ops_test.cc b/tensorflow/core/kernels/control_flow_ops_test.cc
index affa0e8ca6b9d053702f8b203321d6ee2954878e..a2f7bd406929ec516d67dfc76767532cf2bac28c 100644
--- a/tensorflow/core/kernels/control_flow_ops_test.cc
+++ b/tensorflow/core/kernels/control_flow_ops_test.cc
@@ -91,6 +91,7 @@ class KilledBySignal {
  public:
   explicit KilledBySignal(int signum) : signum_(signum) {}
   bool operator()(int exit_status) const { return exit_status == signum_; }
+
  private:
   const int signum_;
 };
diff --git a/tensorflow/core/kernels/conv_ops.cc b/tensorflow/core/kernels/conv_ops.cc
index 985586d6262b18e89b5fc5246cc00b10ba4924a7..dbddaf3dc640dcf2cad8f6ba7dd00aaa33a30e0c 100644
--- a/tensorflow/core/kernels/conv_ops.cc
+++ b/tensorflow/core/kernels/conv_ops.cc
@@ -688,7 +688,7 @@ void LaunchConv2DOp<GPUDevice, T>::operator()(
   static int64 ConvolveScratchSize = GetCudnnWorkspaceLimit(
       // default value is in bytes despite the name of the environment variable
       "TF_CUDNN_WORKSPACE_LIMIT_IN_MB", 1LL << 32  // 4GB
-      );
+  );
 
   int device_id = stream->parent()->device_ordinal();
   DataType dtype = input.dtype();
diff --git a/tensorflow/core/kernels/conv_ops_fused.cc b/tensorflow/core/kernels/conv_ops_fused.cc
index 291ebf2298762d25e2d44aa5b82ffd495ea92c0e..1b40ad81f413a726d14c5496f669923ab9254dce 100644
--- a/tensorflow/core/kernels/conv_ops_fused.cc
+++ b/tensorflow/core/kernels/conv_ops_fused.cc
@@ -679,8 +679,9 @@ class FusedResizeConv2DUsingGemmOp : public OpKernel {
 
     const int dims = resized_shape.dims();
     OP_REQUIRES(
-        context, TensorShapeUtils::IsMatrix(paddings.shape()) &&
-                     paddings.dim_size(1) == 2,
+        context,
+        TensorShapeUtils::IsMatrix(paddings.shape()) &&
+            paddings.dim_size(1) == 2,
         errors::InvalidArgument("paddings must be a matrix with 2 columns: ",
                                 paddings.shape().DebugString()));
     const int fixed_dims =
@@ -715,20 +716,22 @@ class FusedResizeConv2DUsingGemmOp : public OpKernel {
       const int32 after =
           paddings_matrix(d, 1);  // Pad after existing elements.
       OP_REQUIRES(context, before >= 0 && after >= 0,
-                  errors::InvalidArgument("paddings must be non-negative: ",
-                                          before, " ", after));
+                  errors::InvalidArgument(
+                      "paddings must be non-negative: ", before, " ", after));
       if (offset_ == 0) {  // SYMMETRIC mode.
         OP_REQUIRES(
-            context, before <= resized_shape.dim_size(d) &&
-                         after <= resized_shape.dim_size(d),
+            context,
+            before <= resized_shape.dim_size(d) &&
+                after <= resized_shape.dim_size(d),
             errors::InvalidArgument("paddings must be no greater "
                                     "than the dimension size: ",
                                     before, ", ", after, " greater than ",
                                     resized_shape.dim_size(d)));
       } else if (offset_ == 1) {  // REFLECT mode.
         OP_REQUIRES(
-            context, before < resized_shape.dim_size(d) &&
-                         after < resized_shape.dim_size(d),
+            context,
+            before < resized_shape.dim_size(d) &&
+                after < resized_shape.dim_size(d),
             errors::InvalidArgument("paddings must be less than"
                                     " the dimension size: ",
                                     before, ", ", after, " not less than ",
@@ -767,18 +770,19 @@ class FusedResizeConv2DUsingGemmOp : public OpKernel {
     // We only check the first three dims, since the depth is accessed as an
     // int64 below.
     for (int i = 0; i < 3; i++) {
-      OP_REQUIRES(context, FastBoundsCheck(filter.dim_size(i),
-                                           std::numeric_limits<int>::max()),
-                  errors::InvalidArgument("filter too large"));
+      OP_REQUIRES(
+          context,
+          FastBoundsCheck(filter.dim_size(i), std::numeric_limits<int>::max()),
+          errors::InvalidArgument("filter too large"));
     }
 
     // The last dimension for input is in_depth. It must be the same as the
     // filter's in_depth.
     const int64 in_depth = padded_shape.dim_size(3);
-    OP_REQUIRES(
-        context, in_depth == filter.dim_size(2),
-        errors::InvalidArgument("input and filter must have the same depth: ",
-                                in_depth, " vs ", filter.dim_size(2)));
+    OP_REQUIRES(context, in_depth == filter.dim_size(2),
+                errors::InvalidArgument(
+                    "input and filter must have the same depth: ", in_depth,
+                    " vs ", filter.dim_size(2)));
 
     // The last dimension for filter is out_depth.
     const int out_depth = static_cast<int>(filter.dim_size(3));
@@ -786,9 +790,10 @@ class FusedResizeConv2DUsingGemmOp : public OpKernel {
     // The second dimension for input is rows/height.
     // The first dimension for filter is rows/height.
     const int64 padded_rows_raw = padded_shape.dim_size(1);
-    OP_REQUIRES(context, FastBoundsCheck(padded_rows_raw,
-                                         std::numeric_limits<int>::max()),
-                errors::InvalidArgument("Input rows too large"));
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(padded_rows_raw, std::numeric_limits<int>::max()),
+        errors::InvalidArgument("Input rows too large"));
     const int padded_rows = static_cast<int>(padded_rows_raw);
     const int filter_rows = static_cast<int>(filter.dim_size(0));
     const int resized_rows = static_cast<int>(resized_shape.dim_size(1));
@@ -796,9 +801,10 @@ class FusedResizeConv2DUsingGemmOp : public OpKernel {
     // The third dimension for input is columns/width.
     // The second dimension for filter is columns/width.
     const int64 padded_cols_raw = padded_shape.dim_size(2);
-    OP_REQUIRES(context, FastBoundsCheck(padded_cols_raw,
-                                         std::numeric_limits<int>::max()),
-                errors::InvalidArgument("Input cols too large"));
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(padded_cols_raw, std::numeric_limits<int>::max()),
+        errors::InvalidArgument("Input cols too large"));
     const int padded_cols = static_cast<int>(padded_cols_raw);
     const int filter_cols = static_cast<int>(filter.dim_size(1));
     const int resized_cols = static_cast<int>(resized_shape.dim_size(2));
@@ -864,24 +870,26 @@ class FusedResizeConv2DUsingGemmOp : public OpKernel {
   TF_DISALLOW_COPY_AND_ASSIGN(FusedResizeConv2DUsingGemmOp);
 };
 
-#define REGISTER_FUSED(T)                                                    \
-  REGISTER_KERNEL_BUILDER(                                                   \
-      Name("FusedResizeAndPadConv2D")                                        \
-          .Device(DEVICE_CPU)                                                \
-          .TypeConstraint<T>("T"),                                           \
-      FusedResizeConv2DUsingGemmOp<                                          \
-          T, FusedResizeAndPadConvFunctor<T, T, T, FastGemmFunctor<T, T, T>, \
-                                          BILINEAR>,                         \
+#define REGISTER_FUSED(T)                                                 \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("FusedResizeAndPadConv2D")                                     \
+          .Device(DEVICE_CPU)                                             \
+          .TypeConstraint<T>("T"),                                        \
+      FusedResizeConv2DUsingGemmOp<                                       \
+          T,                                                              \
+          FusedResizeAndPadConvFunctor<T, T, T, FastGemmFunctor<T, T, T>, \
+                                       BILINEAR>,                         \
           true>);
 
 TF_CALL_float(REGISTER_FUSED);
 
-#define REGISTER_PAD_ONLY_FUSED(T)                                           \
-  REGISTER_KERNEL_BUILDER(                                                   \
-      Name("FusedPadConv2D").Device(DEVICE_CPU).TypeConstraint<T>("T"),      \
-      FusedResizeConv2DUsingGemmOp<                                          \
-          T, FusedResizeAndPadConvFunctor<T, T, T, FastGemmFunctor<T, T, T>, \
-                                          NEAREST>,                          \
+#define REGISTER_PAD_ONLY_FUSED(T)                                        \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("FusedPadConv2D").Device(DEVICE_CPU).TypeConstraint<T>("T"),   \
+      FusedResizeConv2DUsingGemmOp<                                       \
+          T,                                                              \
+          FusedResizeAndPadConvFunctor<T, T, T, FastGemmFunctor<T, T, T>, \
+                                       NEAREST>,                          \
           false>);
 
 TF_CALL_float(REGISTER_PAD_ONLY_FUSED);
diff --git a/tensorflow/core/kernels/conv_ops_gpu.h b/tensorflow/core/kernels/conv_ops_gpu.h
index 57e196c67cf067bc716d8253f05fc759eaeeba8d..f0085be3a53b71af85d4c5f4bbcc6b07cd982ca8 100644
--- a/tensorflow/core/kernels/conv_ops_gpu.h
+++ b/tensorflow/core/kernels/conv_ops_gpu.h
@@ -27,7 +27,6 @@ limitations under the License.
 
 namespace tensorflow {
 
-
 // Get the Cudnn workspace limit from the environment variable, which is in MB.
 // Return the workspace memory limit in bytes. If no value is set, return the
 // default value.
diff --git a/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc b/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc
index af6013c9747a717b95138c960abcdcc96f4dac73..a376534badc73065e3ec01972dde85da7bbdb0f8 100644
--- a/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc
+++ b/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc
@@ -25,9 +25,9 @@ limitations under the License.
 #include "cuda/include/cuda.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/kernels/conv_2d.h"
+#include "tensorflow/core/lib/math/math_util.h"
 #include "tensorflow/core/util/cuda_kernel_helper.h"
 #include "tensorflow/core/util/tensor_format.h"
-#include "tensorflow/core/lib/math/math_util.h"
 
 namespace tensorflow {
 
@@ -252,11 +252,14 @@ __global__ void SwapDimension1And2InTensor3UsingTiles(
   int x = threadIdx.x;
 
   Dimension<3> output_dims = {
-      input_dims[0], input_dims[2], input_dims[1],
+      input_dims[0],
+      input_dims[2],
+      input_dims[1],
   };
 
   Dimension<3> input_dims_in_tiles = {
-      input_dims[0], (input_dims[1] + TileSizeI - 1) / TileSizeI,
+      input_dims[0],
+      (input_dims[1] + TileSizeI - 1) / TileSizeI,
       (input_dims[2] + TileSizeJ - 1) / TileSizeJ,
   };
 
@@ -264,7 +267,8 @@ __global__ void SwapDimension1And2InTensor3UsingTiles(
       FlatToTensorIndex(blockIdx.x, input_dims_in_tiles);
 
   Index<3> input_tile_origin = {
-      input_tile_index[0], input_tile_index[1] * TileSizeI,
+      input_tile_index[0],
+      input_tile_index[1] * TileSizeI,
       input_tile_index[2] * TileSizeJ,
   };
 
@@ -322,11 +326,14 @@ __global__ void SwapDimension1And2InTensor3UsingTiles(
   __syncthreads();
 
   Index<3> output_tile_index = {
-      input_tile_index[0], input_tile_index[2], input_tile_index[1],
+      input_tile_index[0],
+      input_tile_index[2],
+      input_tile_index[1],
   };
 
   Index<3> output_tile_origin = {
-      output_tile_index[0], output_tile_index[1] * TileSizeJ,
+      output_tile_index[0],
+      output_tile_index[1] * TileSizeJ,
       output_tile_index[2] * TileSizeI,
   };
 
@@ -641,8 +648,9 @@ struct BatchNarrowMatrixTransposeDispatcher {
     static_assert(
         (TileLongSide & (TileLongSide - 1)) == 0,
         "The length of the longer side of the tile is always a power of 2.");
-    bool request_satisfied = max(tile_size_i, tile_size_j) <= TileLongSide &&
-                             min(tile_size_i, tile_size_j) <= TileShortSide;
+    bool request_satisfied =
+        std::max(tile_size_i, tile_size_j) <= TileLongSide &&
+        std::min(tile_size_i, tile_size_j) <= TileShortSide;
 
     if (request_satisfied) {
       LaunchBatchNarrowMatrixTransposeKernel<T, TileLongSide, TileShortSide>(
@@ -655,7 +663,7 @@ struct BatchNarrowMatrixTransposeDispatcher {
     // determine whether it is the long side or the short side that falls short
     // of the request and increase that parameter accordingly.
     const bool long_side_request_not_satisfied =
-        max(tile_size_i, tile_size_j) > TileLongSide;
+        std::max(tile_size_i, tile_size_j) > TileLongSide;
 
     if (long_side_request_not_satisfied) {
       BatchNarrowMatrixTransposeDispatcher<
@@ -683,8 +691,9 @@ struct BatchNarrowMatrixTransposeDispatcher<
     static_assert(
         (TileLongSide & (TileLongSide - 1)) == 0,
         "The length of the longer side of the tile is always a power of 2.");
-    bool request_satisfied = max(tile_size_i, tile_size_j) <= TileLongSide &&
-                             min(tile_size_i, tile_size_j) <= TileShortSide;
+    bool request_satisfied =
+        std::max(tile_size_i, tile_size_j) <= TileLongSide &&
+        std::min(tile_size_i, tile_size_j) <= TileShortSide;
 
     if (request_satisfied) {
       LaunchBatchNarrowMatrixTransposeKernel<T, TileLongSide, TileShortSide>(
@@ -799,7 +808,7 @@ struct TransposeElemType<16> {
 // A helper function to make RunSwapDimension1And2InTensor3 concise. This
 // helper function looks at the data type and input matrix sizes and decides
 // the thread numbers and tile sizes to use.
-template <typename T, bool conjugate = false >
+template <typename T, bool conjugate = false>
 void SwapDimension1And2InTensor3WithNarrowMatrices(
     const GPUDevice& d, const T* input, const Dimension<3>& input_dims,
     T* output, const int kMinDimensionToUseTiles) {
@@ -809,7 +818,7 @@ void SwapDimension1And2InTensor3WithNarrowMatrices(
   int tile_long_side_len = 0;
   int tile_short_side_len = 0;
   float lowest_cost = std::numeric_limits<float>::max();
-  int data_long_side = max(input_dims[1], input_dims[2]);
+  int data_long_side = std::max(input_dims[1], input_dims[2]);
 
   for (auto tile_size_pair : tile_spec) {
     int proposed_tile_long_side_len = tile_size_pair.first;
@@ -854,12 +863,14 @@ void SwapDimension1And2InTensor3WithNarrowMatrices(
   // Truncate the shorter size requested according to the manual limit set in
   // tile_spec to make sure that we do not launch configurations violating
   // hardware limits.
-  requested_tile_size_i = requested_tile_size_i == tile_long_side_len
-                              ? tile_long_side_len
-                              : min(requested_tile_size_i, tile_short_side_len);
-  requested_tile_size_j = requested_tile_size_j == tile_long_side_len
-                              ? tile_long_side_len
-                              : min(requested_tile_size_j, tile_short_side_len);
+  requested_tile_size_i =
+      requested_tile_size_i == tile_long_side_len
+          ? tile_long_side_len
+          : std::min(requested_tile_size_i, tile_short_side_len);
+  requested_tile_size_j =
+      requested_tile_size_j == tile_long_side_len
+          ? tile_long_side_len
+          : std::min(requested_tile_size_j, tile_short_side_len);
 
   Dimension<3> input_dims_in_tiles = {
       input_dims[0],
@@ -902,19 +913,21 @@ void RunSwapDimension1And2InTensor3(const GPUDevice& d, const T* input,
     constexpr int kNumThreads = 256;
 
     Dimension<3> input_dims_in_tiles = {
-        input_dims[0], MathUtil::CeilOfRatio<int>(input_dims[1], kTileSize),
+        input_dims[0],
+        MathUtil::CeilOfRatio<int>(input_dims[1], kTileSize),
         MathUtil::CeilOfRatio<int>(input_dims[2], kTileSize),
     };
 
     int total_tiles_count = input_dims_in_tiles[0] * input_dims_in_tiles[1] *
                             input_dims_in_tiles[2];
-    SwapDimension1And2InTensor3UsingTiles<T, kNumThreads, kTileSize, kTileSize, conjugate>
+    SwapDimension1And2InTensor3UsingTiles<T, kNumThreads, kTileSize, kTileSize,
+                                          conjugate>
         <<<total_tiles_count, kNumThreads, 0, d.stream()>>>(input, input_dims,
                                                             output);
 
   } else if (narrow_matrix) {
-    SwapDimension1And2InTensor3WithNarrowMatrices<T, conjugate>(d, input, input_dims, output,
-                                                  kMinDimensionToUseTiles);
+    SwapDimension1And2InTensor3WithNarrowMatrices<T, conjugate>(
+        d, input, input_dims, output, kMinDimensionToUseTiles);
   } else {
     int total_element_count = input_dims[0] * input_dims[1] * input_dims[2];
     CudaLaunchConfig config = GetCudaLaunchConfig(total_element_count, d);
diff --git a/tensorflow/core/kernels/conv_ops_using_gemm.cc b/tensorflow/core/kernels/conv_ops_using_gemm.cc
index 20da77c36f64173f2dd40fe8e4a608e39c128447..af0a9fa82ee5778fa9e18cea59cf759fa468224f 100644
--- a/tensorflow/core/kernels/conv_ops_using_gemm.cc
+++ b/tensorflow/core/kernels/conv_ops_using_gemm.cc
@@ -468,18 +468,19 @@ class Conv2DUsingGemmOp : public BinaryOp<T> {
                                         filter.shape().DebugString()));
 
     for (int i = 0; i < 3; i++) {
-      OP_REQUIRES(context, FastBoundsCheck(filter.dim_size(i),
-                                           std::numeric_limits<int>::max()),
-                  errors::InvalidArgument("filter too large"));
+      OP_REQUIRES(
+          context,
+          FastBoundsCheck(filter.dim_size(i), std::numeric_limits<int>::max()),
+          errors::InvalidArgument("filter too large"));
     }
 
     // The last dimension for input is in_depth. It must be the same as the
     // filter's in_depth.
     const int64 in_depth = GetTensorDim(input, data_format_, 'C');
-    OP_REQUIRES(
-        context, in_depth == filter.dim_size(2),
-        errors::InvalidArgument("input and filter must have the same depth: ",
-                                in_depth, " vs ", filter.dim_size(2)));
+    OP_REQUIRES(context, in_depth == filter.dim_size(2),
+                errors::InvalidArgument(
+                    "input and filter must have the same depth: ", in_depth,
+                    " vs ", filter.dim_size(2)));
 
     // The last dimension for filter is out_depth.
     const int out_depth = static_cast<int>(filter.dim_size(3));
@@ -487,18 +488,20 @@ class Conv2DUsingGemmOp : public BinaryOp<T> {
     // The second dimension for input is rows/height.
     // The first dimension for filter is rows/height.
     const int64 input_rows_raw = GetTensorDim(input, data_format_, 'H');
-    OP_REQUIRES(context, FastBoundsCheck(input_rows_raw,
-                                         std::numeric_limits<int>::max()),
-                errors::InvalidArgument("Input rows too large"));
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(input_rows_raw, std::numeric_limits<int>::max()),
+        errors::InvalidArgument("Input rows too large"));
     const int input_rows = static_cast<int>(input_rows_raw);
     const int filter_rows = static_cast<int>(filter.dim_size(0));
 
     // The third dimension for input is columns/width.
     // The second dimension for filter is columns/width.
     const int64 input_cols_raw = GetTensorDim(input, data_format_, 'W');
-    OP_REQUIRES(context, FastBoundsCheck(input_cols_raw,
-                                         std::numeric_limits<int>::max()),
-                errors::InvalidArgument("Input cols too large"));
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(input_cols_raw, std::numeric_limits<int>::max()),
+        errors::InvalidArgument("Input cols too large"));
     const int input_cols = static_cast<int>(input_cols_raw);
     const int filter_cols = static_cast<int>(filter.dim_size(1));
 
diff --git a/tensorflow/core/kernels/cross_op_gpu.cu.cc b/tensorflow/core/kernels/cross_op_gpu.cu.cc
index 7ea0b3be0ca6b8c7df1ba5c311c7949f3672bda1..4a37f6cfbbc4c60e0a2e3cbf280b09acccc0a98c 100644
--- a/tensorflow/core/kernels/cross_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/cross_op_gpu.cu.cc
@@ -17,8 +17,8 @@ limitations under the License.
 
 #define EIGEN_USE_GPU
 
-#include "tensorflow/core/kernels/cross_op.h"
 #include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/kernels/cross_op.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/kernels/ctc_decoder_ops.cc b/tensorflow/core/kernels/ctc_decoder_ops.cc
index 73ee3106048f1435f65d435405282574aa0cffda..96bdb6a241b1d88c7b14f22fc618ea9c95fb7642 100644
--- a/tensorflow/core/kernels/ctc_decoder_ops.cc
+++ b/tensorflow/core/kernels/ctc_decoder_ops.cc
@@ -19,13 +19,13 @@ limitations under the License.
 
 #include <limits>
 
-#include "tensorflow/core/util/ctc/ctc_beam_search.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/util/ctc/ctc_beam_search.h"
 #include "tensorflow/core/util/sparse/sparse_tensor.h"
 
 namespace tensorflow {
@@ -80,16 +80,17 @@ class CTCDecodeHelper {
 
     if (!(batch_size == (*seq_len)->dim_size(0))) {
       return errors::FailedPrecondition(
-          "len(sequence_length) != batch_size.  ", "len(sequence_length):  ",
-          (*seq_len)->dim_size(0), " batch_size: ", batch_size);
+          "len(sequence_length) != batch_size.  ",
+          "len(sequence_length):  ", (*seq_len)->dim_size(0),
+          " batch_size: ", batch_size);
     }
 
     auto seq_len_t = (*seq_len)->vec<int32>();
 
     for (int b = 0; b < batch_size; ++b) {
       if (!(seq_len_t(b) <= max_time)) {
-        return errors::FailedPrecondition("sequence_length(", b, ") <= ",
-                                          max_time);
+        return errors::FailedPrecondition("sequence_length(", b,
+                                          ") <= ", max_time);
       }
     }
 
diff --git a/tensorflow/core/kernels/ctc_loss_op.cc b/tensorflow/core/kernels/ctc_loss_op.cc
index fb03adb7a5336919c85c4685f4cc7e7a8180892d..b38d838bf1ebdabad85ee3c70a936844f96f106a 100644
--- a/tensorflow/core/kernels/ctc_loss_op.cc
+++ b/tensorflow/core/kernels/ctc_loss_op.cc
@@ -113,8 +113,8 @@ class CTCLossOp : public OpKernel {
       const int64 batch_indices = g.group()[0];
       OP_REQUIRES(ctx, FastBoundsCheck(batch_indices, batch_size),
                   errors::InvalidArgument("labels batch index must be between ",
-                                          0, " and ", batch_size, " but saw: ",
-                                          batch_indices));
+                                          0, " and ", batch_size,
+                                          " but saw: ", batch_indices));
 
       auto values = g.values<int32>();
       std::vector<int>* b_values = &labels_t[batch_indices];
diff --git a/tensorflow/core/kernels/cwise_op_abs.cc b/tensorflow/core/kernels/cwise_op_abs.cc
index 5fd38d9dc25c13e20766d1fed86c3f7af9912905..1466f24202fea4200f752985d620f1fbea61d35a 100644
--- a/tensorflow/core/kernels/cwise_op_abs.cc
+++ b/tensorflow/core/kernels/cwise_op_abs.cc
@@ -45,5 +45,5 @@ REGISTER_KERNEL_BUILDER(Name("Abs")
                             .HostMemory("y")
                             .TypeConstraint<int32>("T"),
                         UnaryOp<CPUDevice, functor::abs<int32>>);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_acos.cc b/tensorflow/core/kernels/cwise_op_acos.cc
index 12cc6c8bdd43b64aa1be2860b54e90aaf5e4c05e..4919122607426f719c660b23baf3a8c7cc38e076 100644
--- a/tensorflow/core/kernels/cwise_op_acos.cc
+++ b/tensorflow/core/kernels/cwise_op_acos.cc
@@ -24,5 +24,5 @@ REGISTER2(UnaryOp, GPU, "Acos", functor::acos, float, double);
 
 #if TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "Acos", functor::acos, float, double);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_acosh.cc b/tensorflow/core/kernels/cwise_op_acosh.cc
index 39c8814073382566bc3551fdf6d5afc7f1ef0012..c2b355ab7f4fb11cdc89d8f98a8ca1e293818966 100644
--- a/tensorflow/core/kernels/cwise_op_acosh.cc
+++ b/tensorflow/core/kernels/cwise_op_acosh.cc
@@ -17,12 +17,12 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_gradients.h"
 
 namespace tensorflow {
-REGISTER4(UnaryOp, CPU, "Acosh", functor::acosh, float, double,
-          complex64, complex128);
+REGISTER4(UnaryOp, CPU, "Acosh", functor::acosh, float, double, complex64,
+          complex128);
 
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "Acosh", functor::acosh, float, double);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 #if GOOGLE_CUDA
 REGISTER2(UnaryOp, GPU, "Acosh", functor::acosh, float, double);
diff --git a/tensorflow/core/kernels/cwise_op_add_1.cc b/tensorflow/core/kernels/cwise_op_add_1.cc
index 608a6dce3d223d522776c59a3a1b2ad0d0c14147..bf32c8a54b34586e43d34cf8890ed37fe64b8c34 100644
--- a/tensorflow/core/kernels/cwise_op_add_1.cc
+++ b/tensorflow/core/kernels/cwise_op_add_1.cc
@@ -44,7 +44,6 @@ REGISTER_KERNEL_BUILDER(Name("AddV2")
                         BinaryOp<CPUDevice, functor::add<int32>>);
 #endif
 
-
 #if TENSORFLOW_USE_SYCL
 #define REGISTER_KERNEL(type)                          \
   REGISTER(BinaryOp, SYCL, "Add", functor::add, type); \
@@ -66,5 +65,5 @@ REGISTER_KERNEL_BUILDER(Name("AddV2")
                             .HostMemory("z")
                             .TypeConstraint<int32>("T"),
                         BinaryOp<CPUDevice, functor::add<int32>>);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_add_2.cc b/tensorflow/core/kernels/cwise_op_add_2.cc
index ac21ca06c929662271ad99b3756b8a22fc62a0cf..e8acbac28533ae36a5af8ce527529927f5fe4129 100644
--- a/tensorflow/core/kernels/cwise_op_add_2.cc
+++ b/tensorflow/core/kernels/cwise_op_add_2.cc
@@ -22,8 +22,8 @@ namespace tensorflow {
 // sharded files, only make its register calls when not __ANDROID_TYPES_SLIM__.
 #if !defined(__ANDROID_TYPES_SLIM__)
 
-REGISTER6(BinaryOp, CPU, "Add", functor::add, int8, int16, complex64,
-          uint8, complex128, string);
+REGISTER6(BinaryOp, CPU, "Add", functor::add, int8, int16, complex64, uint8,
+          complex128, string);
 // Notice: String is excluded to allow marking AddV2 is_commutative and
 // is_aggregate.
 REGISTER5(BinaryOp, CPU, "AddV2", functor::add, int8, int16, complex64, uint8,
diff --git a/tensorflow/core/kernels/cwise_op_asin.cc b/tensorflow/core/kernels/cwise_op_asin.cc
index c28e27d95ae661bdc02a905bb6efd5bdd79f23e5..fe8dfea1173ca6ec6727f2fb475c011176cacad4 100644
--- a/tensorflow/core/kernels/cwise_op_asin.cc
+++ b/tensorflow/core/kernels/cwise_op_asin.cc
@@ -24,5 +24,5 @@ REGISTER2(UnaryOp, GPU, "Asin", functor::asin, float, double);
 
 #if TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "Asin", functor::asin, float, double);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_asinh.cc b/tensorflow/core/kernels/cwise_op_asinh.cc
index 0aec6aac3442a98309e352cf1431b920a87f62fe..7cf0405f5244a1a5a7e7e09719da25d0e714a7da 100644
--- a/tensorflow/core/kernels/cwise_op_asinh.cc
+++ b/tensorflow/core/kernels/cwise_op_asinh.cc
@@ -1,10 +1,10 @@
-  /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
 
-  http://www.apache.org/licenses/LICENSE-2.0
+http://www.apache.org/licenses/LICENSE-2.0
 
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
@@ -17,8 +17,8 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_gradients.h"
 
 namespace tensorflow {
-REGISTER4(UnaryOp, CPU, "Asinh", functor::asinh, float, double,
-          complex64, complex128);
+REGISTER4(UnaryOp, CPU, "Asinh", functor::asinh, float, double, complex64,
+          complex128);
 
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "Asinh", functor::asinh, float, double);
diff --git a/tensorflow/core/kernels/cwise_op_atan.cc b/tensorflow/core/kernels/cwise_op_atan.cc
index 7d73de48102189f5c0d92ce811fa639ce6ba2cf4..09f0448874f7dc2bc7140e03cbe38d42246c3087 100644
--- a/tensorflow/core/kernels/cwise_op_atan.cc
+++ b/tensorflow/core/kernels/cwise_op_atan.cc
@@ -24,5 +24,5 @@ REGISTER2(UnaryOp, GPU, "Atan", functor::atan, float, double);
 
 #if TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "Atan", functor::atan, float, double);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_atanh.cc b/tensorflow/core/kernels/cwise_op_atanh.cc
index 7b688db4c585b0f8d92f289cae598a78df7e379c..6170683fa64bdd50c00c8c774d6a1f137e60fa71 100644
--- a/tensorflow/core/kernels/cwise_op_atanh.cc
+++ b/tensorflow/core/kernels/cwise_op_atanh.cc
@@ -17,8 +17,8 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_gradients.h"
 
 namespace tensorflow {
-REGISTER4(UnaryOp, CPU, "Atanh", functor::atanh, float, double,
-          complex64, complex128);
+REGISTER4(UnaryOp, CPU, "Atanh", functor::atanh, float, double, complex64,
+          complex128);
 
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "Atanh", functor::atanh, float, double);
diff --git a/tensorflow/core/kernels/cwise_op_ceil.cc b/tensorflow/core/kernels/cwise_op_ceil.cc
index 0111e9d5fd18f1d94e8d39c5e67d16e04f21e854..816eadc80eb802de46ad4bb22521cbe6a7adf6b2 100644
--- a/tensorflow/core/kernels/cwise_op_ceil.cc
+++ b/tensorflow/core/kernels/cwise_op_ceil.cc
@@ -24,5 +24,5 @@ REGISTER3(UnaryOp, GPU, "Ceil", functor::ceil, float, Eigen::half, double);
 
 #if TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "Ceil", functor::ceil, float, double);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_cos.cc b/tensorflow/core/kernels/cwise_op_cos.cc
index d4b3b0e3935deeded3a0e07bd04056476c4cc29c..71ad0ff0dc2e3031df6177e4d067ad905c23169f 100644
--- a/tensorflow/core/kernels/cwise_op_cos.cc
+++ b/tensorflow/core/kernels/cwise_op_cos.cc
@@ -25,5 +25,5 @@ REGISTER3(UnaryOp, GPU, "Cos", functor::cos, float, Eigen::half, double);
 
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "Cos", functor::cos, float, double);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_cosh.cc b/tensorflow/core/kernels/cwise_op_cosh.cc
index bca99a4f897d1cc601a082cc17ca6725929942a2..31b4bb3cadd9b2df5d0ae35b2c8ea4a155278a32 100644
--- a/tensorflow/core/kernels/cwise_op_cosh.cc
+++ b/tensorflow/core/kernels/cwise_op_cosh.cc
@@ -16,20 +16,18 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_common.h"
 
 namespace tensorflow {
-REGISTER4(UnaryOp, CPU, "Cosh", functor::cosh, float, double,
-          complex64, complex128);
+REGISTER4(UnaryOp, CPU, "Cosh", functor::cosh, float, double, complex64,
+          complex128);
 
 #if TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNEL(TYPE)                                    \
-  REGISTER_KERNEL_BUILDER(                                            \
-                          Name("Cosh")                                \
-                          .Device(DEVICE_SYCL)                        \
-                          .TypeConstraint<TYPE>("T"),                 \
-                          UnaryOp<SYCLDevice, functor::cosh<TYPE>>);
+#define REGISTER_SYCL_KERNEL(TYPE)                                \
+  REGISTER_KERNEL_BUILDER(                                        \
+      Name("Cosh").Device(DEVICE_SYCL).TypeConstraint<TYPE>("T"), \
+      UnaryOp<SYCLDevice, functor::cosh<TYPE>>);
 REGISTER_SYCL_KERNEL(float);
 REGISTER_SYCL_KERNEL(double);
 #undef REGISTER_SYCL_KERNEL
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 #if GOOGLE_CUDA
 REGISTER2(UnaryOp, GPU, "Cosh", functor::cosh, float, double);
diff --git a/tensorflow/core/kernels/cwise_op_div.cc b/tensorflow/core/kernels/cwise_op_div.cc
index d44c1bf473e2e778a7d31890a25359e782e1dc94..c71c756e4461d4ed36628ea8a4f8a0922896302c 100644
--- a/tensorflow/core/kernels/cwise_op_div.cc
+++ b/tensorflow/core/kernels/cwise_op_div.cc
@@ -54,5 +54,5 @@ REGISTER_KERNEL_BUILDER(Name("Div")
                             .HostMemory("z")
                             .TypeConstraint<int32>("T"),
                         BinaryOp<CPUDevice, functor::safe_div<int32>>);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_exp.cc b/tensorflow/core/kernels/cwise_op_exp.cc
index 66d7b7d22ebe63bf42da848aa028fcbafc26864b..8f4ac98016cb252c9c952bbc3c67eb2ea3a92f21 100644
--- a/tensorflow/core/kernels/cwise_op_exp.cc
+++ b/tensorflow/core/kernels/cwise_op_exp.cc
@@ -26,5 +26,5 @@ REGISTER5(UnaryOp, GPU, "Exp", functor::exp, float, Eigen::half, double,
 
 #if TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "Exp", functor::exp, float, double);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_expm1.cc b/tensorflow/core/kernels/cwise_op_expm1.cc
index 4f723080060041f1223dbd86aa95f1cc64f5452c..ce03ad5de6285cfa64b56e3e5357e8c916f8baf3 100644
--- a/tensorflow/core/kernels/cwise_op_expm1.cc
+++ b/tensorflow/core/kernels/cwise_op_expm1.cc
@@ -23,5 +23,5 @@ REGISTER3(UnaryOp, GPU, "Expm1", functor::expm1, float, Eigen::half, double);
 #endif
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "Expm1", functor::expm1, float, double);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_floor.cc b/tensorflow/core/kernels/cwise_op_floor.cc
index 5a142b9ce9f8a32fe0569a78452cf710b2317760..d554d41c412bca4a8415852427190fb16f7f8f82 100644
--- a/tensorflow/core/kernels/cwise_op_floor.cc
+++ b/tensorflow/core/kernels/cwise_op_floor.cc
@@ -23,5 +23,5 @@ REGISTER3(UnaryOp, GPU, "Floor", functor::floor, float, Eigen::half, double);
 #endif
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "Floor", functor::floor, float, double);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_floor_div.cc b/tensorflow/core/kernels/cwise_op_floor_div.cc
index fa81ef0872d4ed6545c312b865e305ee430fdccb..fecbf859897bd1560da00f54756d4a1ffb7660d4 100644
--- a/tensorflow/core/kernels/cwise_op_floor_div.cc
+++ b/tensorflow/core/kernels/cwise_op_floor_div.cc
@@ -49,5 +49,5 @@ REGISTER_KERNEL_BUILDER(Name("FloorDiv")
                             .HostMemory("z")
                             .TypeConstraint<int32>("T"),
                         BinaryOp<CPUDevice, functor::safe_floor_div<int32>>);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_floor_mod.cc b/tensorflow/core/kernels/cwise_op_floor_mod.cc
index 55f8a30461f16ebd52f27792f2d3b4a05fbf6977..29340b88506147eb9535893939cf28842c671cd9 100644
--- a/tensorflow/core/kernels/cwise_op_floor_mod.cc
+++ b/tensorflow/core/kernels/cwise_op_floor_mod.cc
@@ -40,5 +40,5 @@ REGISTER_KERNEL_BUILDER(Name("FloorMod")
                             .HostMemory("z")
                             .TypeConstraint<int32>("T"),
                         BinaryOp<CPUDevice, functor::safe_floor_mod<int32>>);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_gpu_conj.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_conj.cu.cc
index e7dff5d0ac521cbe6d80efd1f591a9f23a0c650d..77723b3169fa137f0059ffd80a27e84115cb94ca 100644
--- a/tensorflow/core/kernels/cwise_op_gpu_conj.cu.cc
+++ b/tensorflow/core/kernels/cwise_op_gpu_conj.cu.cc
@@ -19,8 +19,8 @@ limitations under the License.
 
 namespace tensorflow {
 namespace functor {
-  DEFINE_UNARY1(conj, complex64);
-  DEFINE_UNARY1(conj, complex128);
+DEFINE_UNARY1(conj, complex64);
+DEFINE_UNARY1(conj, complex128);
 }  // namespace functor
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/kernels/cwise_op_gpu_equal_to.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_equal_to.cu.cc
index 3675398126f3ce13722e41b43f382c7fa1eaf111..26748ef0e724903c95f6665a5d7c00bdbd298a28 100644
--- a/tensorflow/core/kernels/cwise_op_gpu_equal_to.cu.cc
+++ b/tensorflow/core/kernels/cwise_op_gpu_equal_to.cu.cc
@@ -20,7 +20,7 @@ limitations under the License.
 namespace tensorflow {
 namespace functor {
 DEFINE_BINARY10(equal_to, float, Eigen::half, double, uint8, int8, int16, int64,
-               complex64, complex128, bool);
+                complex64, complex128, bool);
 DEFINE_APPROXIMATE_EQUAL2(float, double);
 }  // namespace functor
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_gpu_select.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_select.cu.cc
index a54dbdfc247dfcbba370852f525f0ca686b6c1b4..627ecc8c802a2bbd428f9cc2160bec379d7b654b 100644
--- a/tensorflow/core/kernels/cwise_op_gpu_select.cu.cc
+++ b/tensorflow/core/kernels/cwise_op_gpu_select.cu.cc
@@ -15,8 +15,10 @@ limitations under the License.
 
 #if GOOGLE_CUDA
 
-#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+#define EIGEN_USE_GPU
+
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
 
 namespace tensorflow {
 namespace functor {
@@ -38,19 +40,17 @@ struct SelectScalarFunctor<GPUDevice, T> {
                   typename TTypes<bool>::ConstScalar cond,
                   typename TTypes<T>::ConstFlat then_flat,
                   typename TTypes<T>::ConstFlat else_flat) {
-
 #if !defined(EIGEN_HAS_INDEX_LIST)
-  Eigen::array<int, 1> rank1{1};
+    Eigen::array<int, 1> rank1{1};
 #else
-  Eigen::IndexList<Eigen::type2index<1>> rank1;
+    Eigen::IndexList<Eigen::type2index<1> > rank1;
 #endif
-  const int size  = then_flat.dimension(0);
-  Eigen::array<int, 1> broadcast_dims{size};
-
-  To32Bit(out).device(d) = cond.reshape(rank1)
-                               .broadcast(broadcast_dims)
-                               .select(then_flat, else_flat);
+    const int size = then_flat.dimension(0);
+    Eigen::array<int, 1> broadcast_dims{size};
 
+    To32Bit(out).device(d) = cond.reshape(rank1)
+                                 .broadcast(broadcast_dims)
+                                 .select(then_flat, else_flat);
   }
 };
 
@@ -89,8 +89,8 @@ struct BatchSelectFunctor<GPUDevice, T> {
   }
 };
 
-#define SELECT_FUNCTOR(T)                      \
-  template struct SelectFunctor<GPUDevice, T>; \
+#define SELECT_FUNCTOR(T)                            \
+  template struct SelectFunctor<GPUDevice, T>;       \
   template struct SelectScalarFunctor<GPUDevice, T>; \
   template struct BatchSelectFunctor<GPUDevice, T>;
 
diff --git a/tensorflow/core/kernels/cwise_op_greater.cc b/tensorflow/core/kernels/cwise_op_greater.cc
index ba89899fb323c58f0a0045f3ef32a897f5f2680a..a4ea40883694540903ac80683d3a7151fac4a583 100644
--- a/tensorflow/core/kernels/cwise_op_greater.cc
+++ b/tensorflow/core/kernels/cwise_op_greater.cc
@@ -43,5 +43,5 @@ REGISTER_KERNEL_BUILDER(Name("Greater")
                             .HostMemory("z")
                             .TypeConstraint<int32>("T"),
                         BinaryOp<CPUDevice, functor::greater<int32>>);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_greater_equal.cc b/tensorflow/core/kernels/cwise_op_greater_equal.cc
index 8f0c483aecd7f84bbb8ac47e4c8b5877b40335d4..3f34d6269ef4a1ab0da3dae1d08da037c5507bdd 100644
--- a/tensorflow/core/kernels/cwise_op_greater_equal.cc
+++ b/tensorflow/core/kernels/cwise_op_greater_equal.cc
@@ -35,7 +35,8 @@ REGISTER_KERNEL_BUILDER(Name("GreaterEqual")
 #endif
 
 #ifdef TENSORFLOW_USE_SYCL
-REGISTER2(BinaryOp, SYCL, "GreaterEqual", functor::greater_equal, float, double);
+REGISTER2(BinaryOp, SYCL, "GreaterEqual", functor::greater_equal, float,
+          double);
 
 REGISTER_KERNEL_BUILDER(Name("GreaterEqual")
                             .Device(DEVICE_SYCL)
@@ -44,5 +45,5 @@ REGISTER_KERNEL_BUILDER(Name("GreaterEqual")
                             .HostMemory("z")
                             .TypeConstraint<int32>("T"),
                         BinaryOp<CPUDevice, functor::greater_equal<int32>>);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_invert.cc b/tensorflow/core/kernels/cwise_op_invert.cc
index df2c02e42e17f5bbcb74b637adcfb1dbd5cac3c1..f5cafcc7809aba3facc80f33d417abcfe528a607 100644
--- a/tensorflow/core/kernels/cwise_op_invert.cc
+++ b/tensorflow/core/kernels/cwise_op_invert.cc
@@ -21,7 +21,7 @@ REGISTER6(UnaryOp, CPU, "Invert", functor::invert, int8, int16, int32, int64,
 
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER6(UnaryOp, SYCL, "Invert", functor::invert, int8, int16, int32, int64,
-         uint8, uint16);
+          uint8, uint16);
 #endif  // TENSORFLOW_USE_SYCL
 
 #if GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_isfinite.cc b/tensorflow/core/kernels/cwise_op_isfinite.cc
index 53ec1c1c63f17a03218535c974e591b4eec62a72..ae1e590d24290a397096cbdfdf08b7e2d348f362 100644
--- a/tensorflow/core/kernels/cwise_op_isfinite.cc
+++ b/tensorflow/core/kernels/cwise_op_isfinite.cc
@@ -26,5 +26,5 @@ REGISTER3(UnaryOp, GPU, "IsFinite", functor::isfinite, float, Eigen::half,
 
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "IsFinite", functor::isfinite, float, double);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_isinf.cc b/tensorflow/core/kernels/cwise_op_isinf.cc
index 4b34744304f6c856fb98d39fbadc1e1958c84238..f22ca21e1ca425978b23910c27881eed626626e4 100644
--- a/tensorflow/core/kernels/cwise_op_isinf.cc
+++ b/tensorflow/core/kernels/cwise_op_isinf.cc
@@ -24,5 +24,5 @@ REGISTER3(UnaryOp, GPU, "IsInf", functor::isinf, float, Eigen::half, double);
 
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "IsInf", functor::isinf, float, double);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_isnan.cc b/tensorflow/core/kernels/cwise_op_isnan.cc
index ad2dd3f722cebba926dd04748ca146c2ecfc0848..aa180c247e7d01ef0f2898b4a50a71c3c3bc6941 100644
--- a/tensorflow/core/kernels/cwise_op_isnan.cc
+++ b/tensorflow/core/kernels/cwise_op_isnan.cc
@@ -24,5 +24,5 @@ REGISTER3(UnaryOp, GPU, "IsNan", functor::isnan, float, Eigen::half, double);
 
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "IsNan", functor::isnan, float, double);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_less.cc b/tensorflow/core/kernels/cwise_op_less.cc
index 136c3666dfc351fa0485eeff060a6ea3a7d48c08..00cdecdbd184b84b6601eda76dd5dfded5aa1e1b 100644
--- a/tensorflow/core/kernels/cwise_op_less.cc
+++ b/tensorflow/core/kernels/cwise_op_less.cc
@@ -42,5 +42,5 @@ REGISTER_KERNEL_BUILDER(Name("Less")
                             .HostMemory("z")
                             .TypeConstraint<int32>("T"),
                         BinaryOp<CPUDevice, functor::less<int32>>);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_less_equal.cc b/tensorflow/core/kernels/cwise_op_less_equal.cc
index 97a2508d1290c5afe758db9ff54a22a22b6dcac0..11806c5fc774dc3a37abc733127e4b6660f27f9c 100644
--- a/tensorflow/core/kernels/cwise_op_less_equal.cc
+++ b/tensorflow/core/kernels/cwise_op_less_equal.cc
@@ -44,5 +44,5 @@ REGISTER_KERNEL_BUILDER(Name("LessEqual")
                             .HostMemory("z")
                             .TypeConstraint<int32>("T"),
                         BinaryOp<CPUDevice, functor::less_equal<int32>>);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_log.cc b/tensorflow/core/kernels/cwise_op_log.cc
index 7fdfdff0e38ea2bfe18acac86b148a4e1e944117..98936e0f960f1f407c2187746ca80d3db0a93412 100644
--- a/tensorflow/core/kernels/cwise_op_log.cc
+++ b/tensorflow/core/kernels/cwise_op_log.cc
@@ -25,5 +25,5 @@ REGISTER3(UnaryOp, GPU, "Log", functor::log, float, Eigen::half, double);
 
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "Log", functor::log, float, double);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_log1p.cc b/tensorflow/core/kernels/cwise_op_log1p.cc
index 25ad7b24bb1cee3a09c4ea81cccf79b6a4dabeb9..162ca9e07cdc862e04276aca0dce0ad2f4cfc70e 100644
--- a/tensorflow/core/kernels/cwise_op_log1p.cc
+++ b/tensorflow/core/kernels/cwise_op_log1p.cc
@@ -25,5 +25,5 @@ REGISTER3(UnaryOp, GPU, "Log1p", functor::log1p, float, Eigen::half, double);
 
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "Log1p", functor::log1p, float, double);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_maximum.cc b/tensorflow/core/kernels/cwise_op_maximum.cc
index 87d54e380b4b923f72aff1eb33d56dd7d8a0dd11..8c54f22f10887da8020d8f16d21097fcb002483c 100644
--- a/tensorflow/core/kernels/cwise_op_maximum.cc
+++ b/tensorflow/core/kernels/cwise_op_maximum.cc
@@ -43,5 +43,5 @@ REGISTER_KERNEL_BUILDER(Name("Maximum")
                             .HostMemory("z")
                             .TypeConstraint<int32>("T"),
                         BinaryOp<CPUDevice, functor::maximum<int32>>);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_minimum.cc b/tensorflow/core/kernels/cwise_op_minimum.cc
index 442171193bfeb41e8594bf708590fc4d52291685..dff83df828f076a076a8f220d04974344d8ffafc 100644
--- a/tensorflow/core/kernels/cwise_op_minimum.cc
+++ b/tensorflow/core/kernels/cwise_op_minimum.cc
@@ -43,6 +43,6 @@ REGISTER_KERNEL_BUILDER(Name("Minimum")
                             .HostMemory("z")
                             .TypeConstraint<int32>("T"),
                         BinaryOp<CPUDevice, functor::minimum<int32>>);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_mul_1.cc b/tensorflow/core/kernels/cwise_op_mul_1.cc
index 023eb07ca3f52f49c95b5b6450e3417b7cbeabe4..0e8d2e37350dbbb942bd5ed6b16392b6288313fe 100644
--- a/tensorflow/core/kernels/cwise_op_mul_1.cc
+++ b/tensorflow/core/kernels/cwise_op_mul_1.cc
@@ -17,8 +17,8 @@ limitations under the License.
 
 namespace tensorflow {
 
-REGISTER5(BinaryOp, CPU, "Mul", functor::mul, float, Eigen::half, double,
-          uint8, int32);
+REGISTER5(BinaryOp, CPU, "Mul", functor::mul, float, Eigen::half, double, uint8,
+          int32);
 #if defined(__ANDROID_TYPES_SLIM__)
 // We only register the first type when we have multi-argument calls in the
 // case where we're trying to reduce executable size, but it turns out that the
@@ -28,7 +28,7 @@ REGISTER(BinaryOp, CPU, "Mul", functor::mul, int32);
 
 #if GOOGLE_CUDA
 REGISTER4(BinaryOp, GPU, "Mul", functor::mul, float, Eigen::half, double,
-           uint8);
+          uint8);
 // A special GPU kernel for int32.
 // TODO(b/25387198): Also enable int32 in device memory. This kernel
 // registration requires all int32 inputs and outputs to be in host memory.
@@ -50,5 +50,5 @@ REGISTER_KERNEL_BUILDER(Name("Mul")
                             .HostMemory("z")
                             .TypeConstraint<int32>("T"),
                         BinaryOp<CPUDevice, functor::mul<int32>>);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_mul_2.cc b/tensorflow/core/kernels/cwise_op_mul_2.cc
index 7be5857cc06d0f6755d3f4cba2ca67f009740d46..6aa8f8836406ab4f350bc7b6cc1e88bd612ad933 100644
--- a/tensorflow/core/kernels/cwise_op_mul_2.cc
+++ b/tensorflow/core/kernels/cwise_op_mul_2.cc
@@ -22,11 +22,11 @@ namespace tensorflow {
 // sharded files, only make its register calls when not __ANDROID_TYPES_SLIM__.
 #if !defined(__ANDROID_TYPES_SLIM__)
 
-REGISTER6(BinaryOp, CPU, "Mul", functor::mul,
-          int8, uint16, int16, int64, complex64, complex128);
+REGISTER6(BinaryOp, CPU, "Mul", functor::mul, int8, uint16, int16, int64,
+          complex64, complex128);
 #if GOOGLE_CUDA
 REGISTER6(BinaryOp, GPU, "Mul", functor::mul, int8, uint16, int16, int64,
-           complex64, complex128);
+          complex64, complex128);
 
 #endif  // GOOGLE_CUDA
 
diff --git a/tensorflow/core/kernels/cwise_op_neg.cc b/tensorflow/core/kernels/cwise_op_neg.cc
index 536891b548f043cb25726d70bfdd362ed0294512..a136769b912718a5749273050a2226da3fa9e3cf 100644
--- a/tensorflow/core/kernels/cwise_op_neg.cc
+++ b/tensorflow/core/kernels/cwise_op_neg.cc
@@ -27,7 +27,7 @@ REGISTER_KERNEL_BUILDER(Name("Neg")
                             .HostMemory("y")
                             .TypeConstraint<int32>("T"),
                         UnaryOp<CPUDevice, functor::neg<int32>>);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 #if GOOGLE_CUDA
 REGISTER6(UnaryOp, GPU, "Neg", functor::neg, float, Eigen::half, double, int64,
diff --git a/tensorflow/core/kernels/cwise_op_not_equal_to_1.cc b/tensorflow/core/kernels/cwise_op_not_equal_to_1.cc
index 7bd81ee12719618181a75907ce547815b1076b84..02cd298745795294bfb8117a24ba930a7f471788 100644
--- a/tensorflow/core/kernels/cwise_op_not_equal_to_1.cc
+++ b/tensorflow/core/kernels/cwise_op_not_equal_to_1.cc
@@ -17,7 +17,7 @@ limitations under the License.
 
 namespace tensorflow {
 REGISTER6(BinaryOp, CPU, "NotEqual", functor::not_equal_to, float, Eigen::half,
-           double, uint8, int8, int16);
+          double, uint8, int8, int16);
 #if GOOGLE_CUDA
 REGISTER4(BinaryOp, GPU, "NotEqual", functor::not_equal_to, float, Eigen::half,
           double, uint8);
diff --git a/tensorflow/core/kernels/cwise_op_not_equal_to_2.cc b/tensorflow/core/kernels/cwise_op_not_equal_to_2.cc
index 7d4ecec59f1564c90c11bb05d6e96c7e1b52a60d..05bdea66367c6d525469dd9cdc28b56d3e4c2adc 100644
--- a/tensorflow/core/kernels/cwise_op_not_equal_to_2.cc
+++ b/tensorflow/core/kernels/cwise_op_not_equal_to_2.cc
@@ -30,5 +30,5 @@ REGISTER6(BinaryOp, GPU, "NotEqual", functor::not_equal_to, int8, int16, int64,
 
 #endif  // GOOGLE_CUDA
 
-#endif   // !defined(__ANDROID_TYPES_SLIM__)
+#endif  // !defined(__ANDROID_TYPES_SLIM__)
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_reciprocal.cc b/tensorflow/core/kernels/cwise_op_reciprocal.cc
index 8c0e21f9cf3535dd5f62657de165150f9efcae2e..aee25747b866c910a799b76e3b00b699bef41566 100644
--- a/tensorflow/core/kernels/cwise_op_reciprocal.cc
+++ b/tensorflow/core/kernels/cwise_op_reciprocal.cc
@@ -38,7 +38,7 @@ REGISTER4(UnaryOp, GPU, "Reciprocal", functor::inverse, float, Eigen::half,
 #endif
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER(UnaryOp, SYCL, "Reciprocal", functor::inverse, float);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 REGISTER5(SimpleBinaryOp, CPU, "ReciprocalGrad", functor::inverse_grad, float,
           Eigen::half, double, complex64, complex128);
@@ -48,5 +48,5 @@ REGISTER3(SimpleBinaryOp, GPU, "ReciprocalGrad", functor::inverse_grad, float,
 #endif
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER(SimpleBinaryOp, SYCL, "ReciprocalGrad", functor::inverse_grad, float);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_select.cc b/tensorflow/core/kernels/cwise_op_select.cc
index 3dd9de8d897479456c462ea068c5eda6354b199b..e259daaba47e2d0ab434e47b39376f7b723bdc9d 100644
--- a/tensorflow/core/kernels/cwise_op_select.cc
+++ b/tensorflow/core/kernels/cwise_op_select.cc
@@ -30,7 +30,7 @@ typedef Eigen::GpuDevice GPUDevice;
 
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 template <typename Device, typename T>
 class SelectOp : public OpKernel {
@@ -185,7 +185,7 @@ REGISTER_SELECT_SYCL(double);
 REGISTER_SELECT_SYCL(int32);
 REGISTER_SELECT_SYCL(int64);
 #undef REGISTER_SELECT_SYCL
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 namespace functor {
 
@@ -201,13 +201,11 @@ struct SelectFunctorBase {
 };
 
 template <typename T>
-struct SelectFunctor<CPUDevice, T>
-        : SelectFunctorBase<CPUDevice, T> {};
+struct SelectFunctor<CPUDevice, T> : SelectFunctorBase<CPUDevice, T> {};
 #ifdef TENSORFLOW_USE_SYCL
 template <typename T>
-struct SelectFunctor<SYCLDevice, T>
-        : SelectFunctorBase<SYCLDevice, T> {};
-#endif // TENSORFLOW_USE_SYCL
+struct SelectFunctor<SYCLDevice, T> : SelectFunctorBase<SYCLDevice, T> {};
+#endif  // TENSORFLOW_USE_SYCL
 
 template <typename Device, typename T>
 struct SelectScalarFunctorBase {
@@ -222,12 +220,12 @@ struct SelectScalarFunctorBase {
 // CPU Specializations of Select functors with scalar
 template <typename T>
 struct SelectScalarFunctor<CPUDevice, T>
-        : SelectScalarFunctorBase<CPUDevice, T> {};
+    : SelectScalarFunctorBase<CPUDevice, T> {};
 #ifdef TENSORFLOW_USE_SYCL
 template <typename T>
 struct SelectScalarFunctor<SYCLDevice, T>
-        : SelectScalarFunctorBase<SYCLDevice, T> {};
-#endif // TENSORFLOW_USE_SYCL
+    : SelectScalarFunctorBase<SYCLDevice, T> {};
+#endif  // TENSORFLOW_USE_SYCL
 
 template <typename Device, typename T>
 struct BatchSelectFunctorBase {
@@ -240,8 +238,8 @@ struct BatchSelectFunctorBase {
     const Eigen::DenseIndex all_but_batch = then_flat_outer_dims.dimension(1);
 
 #if !defined(EIGEN_HAS_INDEX_LIST)
-    Eigen::array<Eigen::DenseIndex, 2> broadcast_dims{{ 1, all_but_batch }};
-    Eigen::Tensor<Eigen::DenseIndex, 2>::Dimensions reshape_dims{{ batch, 1 }};
+    Eigen::array<Eigen::DenseIndex, 2> broadcast_dims{{1, all_but_batch}};
+    Eigen::Tensor<Eigen::DenseIndex, 2>::Dimensions reshape_dims{{batch, 1}};
 #else
     Eigen::IndexList<Eigen::type2index<1>, Eigen::DenseIndex> broadcast_dims;
     broadcast_dims.set(1, all_but_batch);
@@ -257,13 +255,13 @@ struct BatchSelectFunctorBase {
 };
 
 template <typename T>
-struct BatchSelectFunctor<CPUDevice, T>
-        : BatchSelectFunctorBase<CPUDevice, T> {};
+struct BatchSelectFunctor<CPUDevice, T> : BatchSelectFunctorBase<CPUDevice, T> {
+};
 #ifdef TENSORFLOW_USE_SYCL
 template <typename T>
 struct BatchSelectFunctor<SYCLDevice, T>
-        : BatchSelectFunctorBase<SYCLDevice, T> {};
-#endif // TENSORFLOW_USE_SYCL
+    : BatchSelectFunctorBase<SYCLDevice, T> {};
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace functor
 
diff --git a/tensorflow/core/kernels/cwise_op_sigmoid.cc b/tensorflow/core/kernels/cwise_op_sigmoid.cc
index a76a088ac8f762a1aa980170ba4617b0c66c6e47..c132fdb63f2b8669294de63ec6cb8567002e9bdd 100644
--- a/tensorflow/core/kernels/cwise_op_sigmoid.cc
+++ b/tensorflow/core/kernels/cwise_op_sigmoid.cc
@@ -25,7 +25,7 @@ REGISTER3(UnaryOp, GPU, "Sigmoid", functor::sigmoid, float, Eigen::half,
 #endif
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER(UnaryOp, SYCL, "Sigmoid", functor::sigmoid, float);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 REGISTER5(SimpleBinaryOp, CPU, "SigmoidGrad", functor::sigmoid_grad, float,
           Eigen::half, double, complex64, complex128);
@@ -35,6 +35,6 @@ REGISTER3(SimpleBinaryOp, GPU, "SigmoidGrad", functor::sigmoid_grad, float,
 #endif
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER(SimpleBinaryOp, SYCL, "SigmoidGrad", functor::sigmoid_grad, float);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_sign.cc b/tensorflow/core/kernels/cwise_op_sign.cc
index a4084d5ad1796f5af1ce1a62e76c9dc6b473586d..02915ff4ce4547516e6e12bc250b605135d70521 100644
--- a/tensorflow/core/kernels/cwise_op_sign.cc
+++ b/tensorflow/core/kernels/cwise_op_sign.cc
@@ -41,6 +41,6 @@ REGISTER_KERNEL_BUILDER(Name("Sign")
                             .HostMemory("y")
                             .TypeConstraint<int32>("T"),
                         UnaryOp<CPUDevice, functor::sign<int32>>);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_sin.cc b/tensorflow/core/kernels/cwise_op_sin.cc
index b91ff1ac30ba8e7259223e011aa1e70b0a05f623..16c6057864073596592b62f4463cfd1229d3a415 100644
--- a/tensorflow/core/kernels/cwise_op_sin.cc
+++ b/tensorflow/core/kernels/cwise_op_sin.cc
@@ -25,5 +25,5 @@ REGISTER3(UnaryOp, GPU, "Sin", functor::sin, float, Eigen::half, double);
 
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "Sin", functor::sin, float, double);
-#endif // TENSORFLOW_USE_SYC
+#endif  // TENSORFLOW_USE_SYC
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_sinh.cc b/tensorflow/core/kernels/cwise_op_sinh.cc
index 055f0b12e14b1e1059600b968584a2ff9924237f..26b7a940aa8dd4fd6ce439eac17b6fd44d0fe3fd 100644
--- a/tensorflow/core/kernels/cwise_op_sinh.cc
+++ b/tensorflow/core/kernels/cwise_op_sinh.cc
@@ -16,20 +16,18 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_common.h"
 
 namespace tensorflow {
-REGISTER4(UnaryOp, CPU, "Sinh", functor::sinh, float, double,
-          complex64, complex128);
+REGISTER4(UnaryOp, CPU, "Sinh", functor::sinh, float, double, complex64,
+          complex128);
 
 #if TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNEL(TYPE)                                    \
-  REGISTER_KERNEL_BUILDER(                                            \
-                          Name("Sinh")                                \
-                          .Device(DEVICE_SYCL)                        \
-                          .TypeConstraint<TYPE>("T"),                 \
-                          UnaryOp<SYCLDevice, functor::sinh<TYPE>>);
+#define REGISTER_SYCL_KERNEL(TYPE)                                \
+  REGISTER_KERNEL_BUILDER(                                        \
+      Name("Sinh").Device(DEVICE_SYCL).TypeConstraint<TYPE>("T"), \
+      UnaryOp<SYCLDevice, functor::sinh<TYPE>>);
 REGISTER_SYCL_KERNEL(float);
 REGISTER_SYCL_KERNEL(double);
 #undef REGISTER_SYCL_KERNEL
-#endif // TENSORFLOW_USE_SYC
+#endif  // TENSORFLOW_USE_SYC
 
 #if GOOGLE_CUDA
 REGISTER2(UnaryOp, GPU, "Sinh", functor::sinh, float, double);
diff --git a/tensorflow/core/kernels/cwise_op_sqrt.cc b/tensorflow/core/kernels/cwise_op_sqrt.cc
index 00efbb00f1501669b221682c565b4843c0497128..497756133d05249141823481e6ef43b73a84660b 100644
--- a/tensorflow/core/kernels/cwise_op_sqrt.cc
+++ b/tensorflow/core/kernels/cwise_op_sqrt.cc
@@ -25,7 +25,7 @@ REGISTER3(UnaryOp, GPU, "Sqrt", functor::sqrt, float, Eigen::half, double);
 
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "Sqrt", functor::sqrt, float, double);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 REGISTER5(SimpleBinaryOp, CPU, "SqrtGrad", functor::sqrt_grad, float,
           Eigen::half, double, complex64, complex128);
@@ -36,5 +36,5 @@ REGISTER3(SimpleBinaryOp, GPU, "SqrtGrad", functor::sqrt_grad, float,
 
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER2(SimpleBinaryOp, SYCL, "SqrtGrad", functor::sqrt_grad, float, double);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_square.cc b/tensorflow/core/kernels/cwise_op_square.cc
index 07a4b0b084d804c46a8a4a0bc272f78b22d7e845..7fc2f6bf08b2c825f471123e1ab58bd060f6070a 100644
--- a/tensorflow/core/kernels/cwise_op_square.cc
+++ b/tensorflow/core/kernels/cwise_op_square.cc
@@ -42,5 +42,5 @@ REGISTER_KERNEL_BUILDER(Name("Square")
                             .HostMemory("y")
                             .TypeConstraint<int32>("T"),
                         UnaryOp<CPUDevice, functor::square<int32>>);
-#endif // TENSORFLOW_USE_SYC
+#endif  // TENSORFLOW_USE_SYC
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_sub.cc b/tensorflow/core/kernels/cwise_op_sub.cc
index 6adaecba04bfcf1b42a760d712eece493131ade2..025041946ac71f0e8f4724f9432d5e2901e348cc 100644
--- a/tensorflow/core/kernels/cwise_op_sub.cc
+++ b/tensorflow/core/kernels/cwise_op_sub.cc
@@ -53,5 +53,5 @@ REGISTER_KERNEL_BUILDER(Name("Sub")
                             .HostMemory("z")
                             .TypeConstraint<int32>("T"),
                         BinaryOp<CPUDevice, functor::sub<int32>>);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_tan.cc b/tensorflow/core/kernels/cwise_op_tan.cc
index 7891b1183dd56b9809ef7f5dc76c3f04fe605b02..c1a25767d3146abc43442cc25b48378c74f8e984 100644
--- a/tensorflow/core/kernels/cwise_op_tan.cc
+++ b/tensorflow/core/kernels/cwise_op_tan.cc
@@ -24,5 +24,5 @@ REGISTER2(UnaryOp, GPU, "Tan", functor::tan, float, double);
 
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "Tan", functor::tan, float, double);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_tanh.cc b/tensorflow/core/kernels/cwise_op_tanh.cc
index 8b3900892c300ee266b1a7fb066ef79c88c3d087..c5005f5ea8aa3e0b392bd038983d1658c8c56520 100644
--- a/tensorflow/core/kernels/cwise_op_tanh.cc
+++ b/tensorflow/core/kernels/cwise_op_tanh.cc
@@ -26,7 +26,7 @@ REGISTER3(UnaryOp, GPU, "Tanh", functor::tanh, float, Eigen::half, double);
 
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "Tanh", functor::tanh, float, double);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 REGISTER5(SimpleBinaryOp, CPU, "TanhGrad", functor::tanh_grad, float,
           Eigen::half, double, complex64, complex128);
diff --git a/tensorflow/core/kernels/cwise_ops_common.cc b/tensorflow/core/kernels/cwise_ops_common.cc
index e561e59cf5a23d6d4881c7c5fcf289ccff4c21cb..980edffceb35ee3f3d7f3557093baec1487a9b5a 100644
--- a/tensorflow/core/kernels/cwise_ops_common.cc
+++ b/tensorflow/core/kernels/cwise_ops_common.cc
@@ -57,9 +57,9 @@ BinaryOpShared::BinaryOpState::BinaryOpState(OpKernelContext* ctx)
       in1(ctx->input(1)),
       bcast(BCast::FromShape(in0.shape()), BCast::FromShape(in1.shape())) {
   if (!bcast.IsValid()) {
-    ctx->SetStatus(errors::InvalidArgument("Incompatible shapes: ",
-                                           in0.shape().DebugString(), " vs. ",
-                                           in1.shape().DebugString()));
+    ctx->SetStatus(errors::InvalidArgument(
+        "Incompatible shapes: ", in0.shape().DebugString(), " vs. ",
+        in1.shape().DebugString()));
     return;
   }
   const TensorShape output_shape = BCast::ToShape(bcast.output_shape());
diff --git a/tensorflow/core/kernels/cwise_ops_gpu_gradients.cu.h b/tensorflow/core/kernels/cwise_ops_gpu_gradients.cu.h
index 439477070893d37a9fcb7b662e379cce2955b07a..e81b840a509ada73e62a763b203763d9e4e65363 100644
--- a/tensorflow/core/kernels/cwise_ops_gpu_gradients.cu.h
+++ b/tensorflow/core/kernels/cwise_ops_gpu_gradients.cu.h
@@ -50,16 +50,16 @@ struct SimpleBinaryFunctor<GPUDevice, Functor> {
 
 // Macros to explicitly instantiate kernels on GPU for multiple types
 // (T0, T1, etc.) for SimpleBiaryFunctor (e.g., functor::tanh_grad).
-#define DEFINE_SIMPLE_BINARY1(F, T)                  \
+#define DEFINE_SIMPLE_BINARY1(F, T) \
   template struct SimpleBinaryFunctor<GPUDevice, F<T> >
-#define DEFINE_SIMPLE_BINARY2(F, T0, T1)             \
-  DEFINE_SIMPLE_BINARY1(F, T0);                      \
+#define DEFINE_SIMPLE_BINARY2(F, T0, T1) \
+  DEFINE_SIMPLE_BINARY1(F, T0);          \
   DEFINE_SIMPLE_BINARY1(F, T1)
-#define DEFINE_SIMPLE_BINARY3(F, T0, T1, T2)         \
-  DEFINE_SIMPLE_BINARY2(F, T0, T1);                  \
+#define DEFINE_SIMPLE_BINARY3(F, T0, T1, T2) \
+  DEFINE_SIMPLE_BINARY2(F, T0, T1);          \
   DEFINE_SIMPLE_BINARY1(F, T2)
-#define DEFINE_SIMPLE_BINARY4(F, T0, T1, T2, T3)     \
-  DEFINE_SIMPLE_BINARY2(F, T0, T1);                  \
+#define DEFINE_SIMPLE_BINARY4(F, T0, T1, T2, T3) \
+  DEFINE_SIMPLE_BINARY2(F, T0, T1);              \
   DEFINE_SIMPLE_BINARY2(F, T2, T3)
 #define DEFINE_SIMPLE_BINARY5(F, T0, T1, T2, T3, T4) \
   DEFINE_SIMPLE_BINARY2(F, T0, T1);                  \
diff --git a/tensorflow/core/kernels/cwise_ops_gradients.h b/tensorflow/core/kernels/cwise_ops_gradients.h
index 77b330f5899815d5784659515e43ee497bdca58e..82cdae9a348aaf3625e1e4cf9f80ea7768694062 100644
--- a/tensorflow/core/kernels/cwise_ops_gradients.h
+++ b/tensorflow/core/kernels/cwise_ops_gradients.h
@@ -171,7 +171,6 @@ struct SimpleBinaryFunctor<CPUDevice, Functor> {
   }
 };
 
-
 #ifdef TENSORFLOW_USE_SYCL
 // Partial specialization of BinaryFunctor for SYCL devices
 typedef Eigen::SyclDevice SYCLDevice;
@@ -184,7 +183,7 @@ struct SimpleBinaryFunctor<SYCLDevice, Functor> {
   }
 };
 
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 template <typename T>
 struct tanh_grad : base<T, Eigen::internal::scalar_tanh_gradient_op<T>> {};
diff --git a/tensorflow/core/kernels/cwise_ops_sycl_common.h b/tensorflow/core/kernels/cwise_ops_sycl_common.h
index 3f6ff7303d627ca64abd0f93658bf1b40ce4d71e..3e107cee04c787d71326bbe4799565f8609f6f4e 100644
--- a/tensorflow/core/kernels/cwise_ops_sycl_common.h
+++ b/tensorflow/core/kernels/cwise_ops_sycl_common.h
@@ -51,7 +51,8 @@ struct BinaryFunctor<SYCLDevice, Functor, NDIMS, has_errors> {
   void operator()(const SYCLDevice& d, typename Functor::tout_type out,
                   typename Functor::tin_type in0,
                   typename Functor::tin_type in1, bool* error) {
-    To32Bit(out).device(d) = To32Bit(in0).binaryExpr(To32Bit(in1), typename Functor::func());
+    To32Bit(out).device(d) =
+        To32Bit(in0).binaryExpr(To32Bit(in1), typename Functor::func());
   }
 
   void Left(const SYCLDevice& d, typename Functor::tout_type out,
@@ -61,7 +62,9 @@ struct BinaryFunctor<SYCLDevice, Functor, NDIMS, has_errors> {
     constexpr int NumDims = Functor::tin_type::NumDimensions;
     static_assert(NumDims == 1, "Unexpected size");
     Eigen::Sizes<1> scalar_dim;
-    out.device(d) = scalar.reshape(scalar_dim).broadcast(in.dimensions()).binaryExpr(in, Binary());
+    out.device(d) = scalar.reshape(scalar_dim)
+                        .broadcast(in.dimensions())
+                        .binaryExpr(in, Binary());
   }
 
   void Right(const SYCLDevice& d, typename Functor::tout_type out,
@@ -71,7 +74,8 @@ struct BinaryFunctor<SYCLDevice, Functor, NDIMS, has_errors> {
     constexpr int NumDims = Functor::tin_type::NumDimensions;
     static_assert(NumDims == 1, "Unexpected size");
     Eigen::Sizes<1> scalar_dim;
-    out.device(d) = in.binaryExpr(scalar.reshape(scalar_dim).broadcast(in.dimensions()), Binary());
+    out.device(d) = in.binaryExpr(
+        scalar.reshape(scalar_dim).broadcast(in.dimensions()), Binary());
   }
 
   void BCast(const SYCLDevice& d,
diff --git a/tensorflow/core/kernels/cwise_ops_test.cc b/tensorflow/core/kernels/cwise_ops_test.cc
index bca0f1004d5f41fd3c8fd8b4eebd44c981053520..39f497e71612fc08a085e410edae73669fc9993a 100644
--- a/tensorflow/core/kernels/cwise_ops_test.cc
+++ b/tensorflow/core/kernels/cwise_ops_test.cc
@@ -54,36 +54,36 @@ int ColsFromArg(int arg) { return (arg % kRows); }
 BM_UNARY(cpu, Floor, float, DT_FLOAT);
 #if GOOGLE_CUDA
 BM_UNARY(gpu, Floor, float, DT_FLOAT);
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 #ifdef TENSORFLOW_USE_SYCL
 BM_UNARY(sycl, Floor, float, DT_FLOAT);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 BM_UNARY(cpu, Floor, double, DT_DOUBLE);
 #if GOOGLE_CUDA
 BM_UNARY(gpu, Floor, double, DT_DOUBLE);
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 #ifdef TENSORFLOW_USE_SYCL
 BM_UNARY(sycl, Floor, double, DT_DOUBLE);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 BM_UNARY(cpu, Conj, std::complex<float>, DT_COMPLEX64);
 #if GOOGLE_CUDA
 BM_UNARY(gpu, Conj, std::complex<float>, DT_COMPLEX64);
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 BM_UNARY(cpu, Conj, std::complex<double>, DT_COMPLEX128);
 #if GOOGLE_CUDA
 BM_UNARY(gpu, Conj, std::complex<double>, DT_COMPLEX128);
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 
 BM_UNARY(cpu, Rint, double, DT_DOUBLE);
 #if GOOGLE_CUDA
 BM_UNARY(gpu, Rint, double, DT_DOUBLE);
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 BM_UNARY(cpu, Rint, float, DT_FLOAT);
 #if GOOGLE_CUDA
 BM_UNARY(gpu, Rint, float, DT_FLOAT);
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 
 // data func scalar.
 Graph* BinaryScalar(int num, const string& func) {
@@ -113,18 +113,18 @@ Graph* BinaryScalar(int num, const string& func) {
 BM_BINARY_SCALAR(cpu, Less);
 #if GOOGLE_CUDA
 BM_BINARY_SCALAR(gpu, Less);
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 #ifdef TENSORFLOW_USE_SYCL
 BM_BINARY_SCALAR(sycl, Less);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 BM_BINARY_SCALAR(cpu, Add);
 #if GOOGLE_CUDA
 BM_BINARY_SCALAR(gpu, Add);
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 #ifdef TENSORFLOW_USE_SYCL
 BM_BINARY_SCALAR(sycl, Add);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 #undef BM_BINARY_SCALAR
 
 template <class T>
@@ -163,11 +163,11 @@ using Eigen::half;
 BM_BIAS_ADD_ALL(cpu, float, DT_FLOAT);
 #if GOOGLE_CUDA
 BM_BIAS_ADD_ALL(gpu, float, DT_FLOAT);
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 BM_BIAS_ADD_ALL(cpu, half, DT_HALF);
 #if GOOGLE_CUDA
 BM_BIAS_ADD_ALL(gpu, half, DT_HALF);
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 #undef BM_BIAS_ADD_ALL
 #undef BM_BIAS_ADD
 
@@ -217,15 +217,15 @@ using Eigen::half;
 #if GOOGLE_CUDA
 BM_BIAS_ADD_GRAD_ALL(gpu, NCHW, float, DT_FLOAT);
 BM_BIAS_ADD_GRAD_ALL(gpu, NCHW, half, DT_HALF);
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 BM_BIAS_ADD_GRAD_ALL(cpu, NHWC, float, DT_FLOAT);
 #if GOOGLE_CUDA
 BM_BIAS_ADD_GRAD_ALL(gpu, NHWC, float, DT_FLOAT);
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 BM_BIAS_ADD_GRAD_ALL(cpu, NHWC, half, DT_HALF);
 #if GOOGLE_CUDA
 BM_BIAS_ADD_GRAD_ALL(gpu, NHWC, half, DT_HALF);
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 #undef BM_BIAS_ADD_GRAD_ALL
 #undef BM_BIAS_ADD_GRAD
 
@@ -265,10 +265,10 @@ Graph* BcastAdd(int rows, int cols, int dim) {
 BM_BCAST_ADD_ROW_ALL(cpu);
 #if GOOGLE_CUDA
 BM_BCAST_ADD_ROW_ALL(gpu);
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 #ifdef TENSORFLOW_USE_SYCL
 BM_BCAST_ADD_ROW_ALL(sycl);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 #undef BM_BCAST_ADD_ROW_ALL
 #undef BM_BCAST_ADD_ROW
 
@@ -291,10 +291,10 @@ BM_BCAST_ADD_ROW_ALL(sycl);
 BM_BCAST_ADD_COL_ALL(cpu);
 #if GOOGLE_CUDA
 BM_BCAST_ADD_COL_ALL(gpu);
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 #ifdef TENSORFLOW_USE_SYCL
 BM_BCAST_ADD_COL_ALL(sycl);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 #undef BM_BCAST_ADD_COL_ALL
 #undef BM_BCAST_ADD_COL
 
diff --git a/tensorflow/core/kernels/data/BUILD b/tensorflow/core/kernels/data/BUILD
index 500ee7b43f2fbd730ae38c3820ed28ec67b9036c..c4e21257ffc4c14cac2cadd6dcae14f0900183e1 100644
--- a/tensorflow/core/kernels/data/BUILD
+++ b/tensorflow/core/kernels/data/BUILD
@@ -49,6 +49,7 @@ cc_library(
     srcs = ["dataset.cc"],
     hdrs = ["dataset.h"],
     deps = [
+        "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
         "//tensorflow/core:graph",
         "//tensorflow/core:lib",
@@ -81,9 +82,7 @@ cc_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core:proto_text",
         "//tensorflow/core:protos_all_cc",
-        "//tensorflow/core:session_options",
         "//tensorflow/core/kernels:variable_ops",
     ],
 )
@@ -122,6 +121,7 @@ tf_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "//tensorflow/core/kernels:batch_util",
     ],
 )
 
@@ -402,6 +402,19 @@ tf_kernel_library(
     ],
 )
 
+tf_kernel_library(
+    name = "tensor_queue_dataset_op",
+    srcs = ["tensor_queue_dataset_op.cc"],
+    deps = [
+        ":dataset",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core/kernels:batch_util",
+    ],
+)
+
 tf_kernel_library(
     name = "tensor_slice_dataset_op",
     srcs = ["tensor_slice_dataset_op.cc"],
@@ -540,6 +553,7 @@ tf_kernel_library(
         ":stats_dataset_ops",
         ":take_dataset_op",
         ":tensor_dataset_op",
+        ":tensor_queue_dataset_op",
         ":tensor_slice_dataset_op",
         ":unique_dataset_op",
         ":zip_dataset_op",
diff --git a/tensorflow/core/kernels/data/batch_dataset_op.cc b/tensorflow/core/kernels/data/batch_dataset_op.cc
index 2d6e06398f66c0b07ae17d4fd25d7ba6b5cfef03..7fa67efb9e22e6877b97524150b9024521619dbc 100644
--- a/tensorflow/core/kernels/data/batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/batch_dataset_op.cc
@@ -92,7 +92,6 @@ class BatchDatasetOp : public UnaryDatasetOpKernel {
     }
 
    private:
-
     class Iterator : public DatasetIterator<Dataset> {
      public:
       explicit Iterator(const Params& params)
@@ -145,7 +144,7 @@ class BatchDatasetOp : public UnaryDatasetOpKernel {
           const Tensor& first_element = batch_elements[0][component_index];
           TensorShape batch_component_shape({num_batch_elements});
           batch_component_shape.AppendShape(first_element.shape());
-          Tensor batch_component(cpu_allocator(), first_element.dtype(),
+          Tensor batch_component(ctx->allocator({}), first_element.dtype(),
                                  batch_component_shape);
           // Build the output tuple component by copying one slice
           // from each input element in the batch.
diff --git a/tensorflow/core/kernels/data/captured_function.cc b/tensorflow/core/kernels/data/captured_function.cc
index 1f6d32f8df39948a4529bdf53091ff742ba88edb..f3e4f1cd3fd27c79eec4379dcd79472bde7ab5ea 100644
--- a/tensorflow/core/kernels/data/captured_function.cc
+++ b/tensorflow/core/kernels/data/captured_function.cc
@@ -22,7 +22,6 @@ limitations under the License.
 #include "tensorflow/core/lib/random/random.h"
 #include "tensorflow/core/platform/notification.h"
 
-
 namespace tensorflow {
 
 /* static */
@@ -185,8 +184,7 @@ Status CapturedFunction::MaybeInstantiate(
   return Status::OK();
 }
 
-Status CapturedFunction::Run(IteratorContext* ctx,
-                             std::vector<Tensor>&& args,
+Status CapturedFunction::Run(IteratorContext* ctx, std::vector<Tensor>&& args,
                              std::vector<Tensor>* rets) {
   FunctionLibraryRuntime::Handle handle;
   TF_RETURN_IF_ERROR(MaybeInstantiate(ctx, &handle));
diff --git a/tensorflow/core/kernels/data/dataset.cc b/tensorflow/core/kernels/data/dataset.cc
index 2ea6875567604e4e5bf7c990ad6a42ed8c5dafaa..d18cb160189e832592b2bfdf7769396010859cc6 100644
--- a/tensorflow/core/kernels/data/dataset.cc
+++ b/tensorflow/core/kernels/data/dataset.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include "tensorflow/core/kernels/data/dataset.h"
+#include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/graph/graph_def_builder.h"
 #include "tensorflow/core/graph/node_builder.h"
 
@@ -264,6 +265,10 @@ void BinaryDatasetOpKernel::MakeDataset(OpKernelContext* ctx,
   MakeDataset(ctx, input, another_input, output);
 }
 
+Allocator* IteratorContext::allocator(AllocatorAttributes attrs) {
+  return params_.lib->device()->GetAllocator(attrs);
+}
+
 const char GraphDatasetBase::kDatasetGraphKey[] = "_DATASET_GRAPH";
 const char GraphDatasetBase::kDatasetGraphOutputNodeKey[] =
     "_DATASET_GRAPH_OUTPUT_NODE";
diff --git a/tensorflow/core/kernels/data/dataset.h b/tensorflow/core/kernels/data/dataset.h
index 2ef31ddfaaa2fd1bd6a4898726d788d1ceece82e..08c3ca82eab5c79ced6e8de7fc02f88782d0564a 100644
--- a/tensorflow/core/kernels/data/dataset.h
+++ b/tensorflow/core/kernels/data/dataset.h
@@ -272,6 +272,9 @@ class IteratorContext {
     // The FunctionLibraryRuntime object to be used to make function calls.
     FunctionLibraryRuntime* lib = nullptr;
     std::shared_ptr<const FunctionLibraryDefinition> function_library = nullptr;
+
+    // The Allocator to be used to allocate the output of an iterator.
+    Allocator* allocator = nullptr;
   };
 
   explicit IteratorContext(Params params) : params_(std::move(params)) {}
@@ -298,6 +301,8 @@ class IteratorContext {
 
   void set_lib(FunctionLibraryRuntime* lib) { params_.lib = lib; }
 
+  Allocator* allocator(AllocatorAttributes attrs);
+
  private:
   Params params_;
 };
diff --git a/tensorflow/core/kernels/data/dense_to_sparse_batch_dataset_op.cc b/tensorflow/core/kernels/data/dense_to_sparse_batch_dataset_op.cc
index e7224bb547f60f943c7c91c37edfbbf561f5351a..132808a5f140a31fc3c1852cb83e5cd8579b6d95 100644
--- a/tensorflow/core/kernels/data/dense_to_sparse_batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/dense_to_sparse_batch_dataset_op.cc
@@ -155,7 +155,7 @@ class DenseToSparseBatchDatasetOp : public UnaryDatasetOpKernel {
 
         // Determine the size of the output tensors:
         // * dense_shape will be [`row_shape + 1`].
-        Tensor dense_shape(cpu_allocator(), DT_INT64, {row_ndims + 1});
+        Tensor dense_shape(ctx->allocator({}), DT_INT64, {row_ndims + 1});
         auto dense_shape_vec = dense_shape.vec<int64>();
         for (size_t i = 0; i < row_ndims; ++i) {
           if (row_shape.dim_size(i) == -1) {
@@ -215,10 +215,10 @@ class DenseToSparseBatchDatasetOp : public UnaryDatasetOpKernel {
 
         // * indices will be [`total_elements`, `row_shape + 1`].
         // * values will be [`total_elements`].
-        Tensor indices(cpu_allocator(), DT_INT64,
+        Tensor indices(ctx->allocator({}), DT_INT64,
                        {total_elements, row_ndims + 1});
         Tensor values(
-            cpu_allocator(),
+            ctx->allocator({}),
             DatasetIterator<Dataset<T>>::dataset()->input_->output_dtypes()[0],
             {total_elements});
         auto indices_matrix = indices.matrix<int64>();
diff --git a/tensorflow/core/kernels/data/group_by_window_dataset_op.cc b/tensorflow/core/kernels/data/group_by_window_dataset_op.cc
index eb047e10ecf738c90c18b9fea25f1b49fdf441c4..834c06bb930d1c723c5b3f880dcc13a892bb44f7 100644
--- a/tensorflow/core/kernels/data/group_by_window_dataset_op.cc
+++ b/tensorflow/core/kernels/data/group_by_window_dataset_op.cc
@@ -23,7 +23,6 @@ limitations under the License.
 #include "tensorflow/core/lib/random/random.h"
 
 namespace tensorflow {
-
 namespace {
 
 // See documentation in ../ops/dataset_ops.cc for a high-level
@@ -510,10 +509,6 @@ class GroupByWindowDatasetOp : public UnaryDatasetOpKernel {
       return Status::OK();
     }
 
-    // A resource name for the temporary window dataset that is
-    // created as the input to the reduce function.
-    static constexpr const char* kWindowResourceName = "__window_dataset";
-
     const DatasetBase* const input_;
     const NameAttrList key_func_;
     const NameAttrList reduce_func_;
@@ -537,5 +532,4 @@ REGISTER_KERNEL_BUILDER(Name("GroupByWindowDataset").Device(DEVICE_CPU),
                         GroupByWindowDatasetOp);
 
 }  // namespace
-
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/iterator_ops.cc b/tensorflow/core/kernels/data/iterator_ops.cc
index 56044a3d41a9f8f2af3c3a72344845e3a59151af..dd5f4a4554b619fb6c3c4851163d93f1fad632ea 100644
--- a/tensorflow/core/kernels/data/iterator_ops.cc
+++ b/tensorflow/core/kernels/data/iterator_ops.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/common_runtime/graph_runner.h"
+#include "tensorflow/core/common_runtime/renamed_device.h"
 #include "tensorflow/core/common_runtime/threadpool_device.h"
 #include "tensorflow/core/framework/iterator.pb.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
@@ -82,7 +83,7 @@ class IteratorResource : public ResourceBase {
  public:
   IteratorResource(const DataTypeVector& output_dtypes,
                    const std::vector<PartialTensorShape>& output_shapes,
-                   const int graph_def_version,
+                   const int /*unused: graph_def_version*/,
                    std::unique_ptr<DeviceMgr> device_mgr,
                    std::unique_ptr<FunctionLibraryDefinition> flib_def,
                    std::unique_ptr<ProcessFunctionLibraryRuntime> pflr,
@@ -93,8 +94,7 @@ class IteratorResource : public ResourceBase {
         lib_(lib),
         iterator_(nullptr),
         output_dtypes_(output_dtypes),
-        output_shapes_(output_shapes),
-        graph_def_version_(graph_def_version) {}
+        output_shapes_(output_shapes) {}
 
   Status GetNext(IteratorContext* ctx, std::vector<Tensor>* out_tensors,
                  bool* end_of_sequence) {
@@ -223,7 +223,6 @@ class IteratorResource : public ResourceBase {
   std::shared_ptr<const FunctionLibraryDefinition> lib_def_ GUARDED_BY(mu_);
   const DataTypeVector output_dtypes_;
   const std::vector<PartialTensorShape> output_shapes_;
-  const int graph_def_version_;
 };
 
 // Helper class for reading data from a VariantTensorData object.
@@ -430,13 +429,10 @@ class IteratorStateVariant {
 REGISTER_UNARY_VARIANT_DECODE_FUNCTION(IteratorStateVariant,
                                        kIteratorVariantTypeName);
 
-// TODO(mrry): Can we simply use the template kernel here?
 class IteratorHandleOp : public OpKernel {
  public:
   explicit IteratorHandleOp(OpKernelConstruction* ctx)
       : OpKernel(ctx), graph_def_version_(ctx->graph_def_version()) {
-    OP_REQUIRES_OK(ctx, ctx->allocate_persistent(DT_STRING, TensorShape({2}),
-                                                 &handle_, nullptr));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_dtypes_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("shared_name", &name_));
@@ -460,56 +456,51 @@ class IteratorHandleOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* context) override LOCKS_EXCLUDED(mu_) {
-    mutex_lock l(mu_);
-    FunctionLibraryRuntime* lib = context->function_library();
-    std::unique_ptr<DeviceMgr> device_mgr(nullptr);
-    std::unique_ptr<FunctionLibraryDefinition> flib_def(nullptr);
-    std::unique_ptr<ProcessFunctionLibraryRuntime> pflr(nullptr);
-    // If the iterator is shared then we construct a new FLR, and pass that in.
-    // NOTE(mrry,rohanj): In this case it is not possible to call remote
-    // functions from the iterator. We may add this functionality if there
-    // is sufficient demand, but it will require a significant refactoring.
-    if (!name_.empty()) {
-      lib = CreateFLR(context, &device_mgr, &flib_def, &pflr);
-    }
+    {
+      mutex_lock l(mu_);
+      if (resource_ == nullptr) {
+        FunctionLibraryRuntime* lib = context->function_library();
+        std::unique_ptr<DeviceMgr> device_mgr(nullptr);
+        std::unique_ptr<FunctionLibraryDefinition> flib_def(nullptr);
+        std::unique_ptr<ProcessFunctionLibraryRuntime> pflr(nullptr);
+        // If the iterator is shared then we construct a new FLR, and pass that
+        // in. NOTE(mrry,rohanj): In this case it is not possible to call remote
+        // functions from the iterator. We may add this functionality if there
+        // is sufficient demand, but it will require a significant refactoring.
+        if (!name_.empty()) {
+          lib = CreatePrivateFLR(context, &device_mgr, &flib_def, &pflr);
+        }
 
-    if (resource_ == nullptr) {
-      ResourceMgr* mgr = context->resource_manager();
-      OP_REQUIRES_OK(context, cinfo_.Init(mgr, def()));
+        ResourceMgr* mgr = context->resource_manager();
+        OP_REQUIRES_OK(context, cinfo_.Init(mgr, def()));
+
+        IteratorResource* resource;
+        OP_REQUIRES_OK(
+            context,
+            mgr->LookupOrCreate<IteratorResource>(
+                cinfo_.container(), cinfo_.name(), &resource,
+                [lib, &device_mgr, &flib_def, &pflr,
+                 this](IteratorResource** ret) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+                  *ret = new IteratorResource(
+                      output_dtypes_, output_shapes_, graph_def_version_,
+                      std::move(device_mgr), std::move(flib_def),
+                      std::move(pflr), lib);
+                  return Status::OK();
+                }));
+
+        Status s = VerifyResource(resource);
+        if (TF_PREDICT_FALSE(!s.ok())) {
+          resource->Unref();
+          context->SetStatus(s);
+          return;
+        }
 
-      IteratorResource* resource;
-      OP_REQUIRES_OK(
-          context,
-          mgr->LookupOrCreate<IteratorResource>(
-              cinfo_.container(), cinfo_.name(), &resource,
-              [lib, &device_mgr, &flib_def, &pflr, this](IteratorResource** ret)
-                  EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-                    *ret = new IteratorResource(
-                        output_dtypes_, output_shapes_, graph_def_version_,
-                        std::move(device_mgr), std::move(flib_def),
-                        std::move(pflr), lib);
-                    return Status::OK();
-                  }));
-
-      Status s = VerifyResource(resource);
-      if (TF_PREDICT_FALSE(!s.ok())) {
-        resource->Unref();
-        context->SetStatus(s);
-        return;
+        resource_ = resource;
       }
-
-      auto h = handle_.AccessTensor(context)->template flat<string>();
-      h(0) = cinfo_.container();
-      h(1) = cinfo_.name();
-      resource_ = resource;
-    }
-    if (context->expected_output_dtype(0) == DT_RESOURCE) {
-      OP_REQUIRES_OK(context, MakeResourceHandleToOutput(
-                                  context, 0, cinfo_.container(), cinfo_.name(),
-                                  MakeTypeIndex<IteratorResource>()));
-    } else {
-      context->set_output_ref(0, &mu_, handle_.AccessTensor(context));
     }
+    OP_REQUIRES_OK(context, MakeResourceHandleToOutput(
+                                context, 0, cinfo_.container(), cinfo_.name(),
+                                MakeTypeIndex<IteratorResource>()));
   }
 
  private:
@@ -526,15 +517,32 @@ class IteratorHandleOp : public OpKernel {
     return Status::OK();
   }
 
-  FunctionLibraryRuntime* CreateFLR(
+  template <typename To, typename From>  // use like this: down_cast<T*>(foo);
+  static inline To down_cast(From* f) {  // so we only accept pointers
+    static_assert(
+        (std::is_base_of<From, typename std::remove_pointer<To>::type>::value),
+        "target type not derived from source type");
+
+    // We skip the assert and hence the dynamic_cast if RTTI is disabled.
+#if !defined(__GNUC__) || defined(__GXX_RTTI)
+    // Uses RTTI in dbg and fastbuild. asserts are disabled in opt builds.
+    assert(f == nullptr || dynamic_cast<To>(f) != nullptr);
+#endif  // !defined(__GNUC__) || defined(__GXX_RTTI)
+    return static_cast<To>(f);
+  }
+
+  FunctionLibraryRuntime* CreatePrivateFLR(
       OpKernelContext* ctx, std::unique_ptr<DeviceMgr>* device_mgr,
       std::unique_ptr<FunctionLibraryDefinition>* flib_def,
       std::unique_ptr<ProcessFunctionLibraryRuntime>* pflr) {
-    Device* device = new ThreadPoolDevice(
-        SessionOptions(), ctx->device()->attributes().name(), Bytes(256 << 20),
-        DeviceLocality(), cpu_allocator());
-
-    device_mgr->reset(new DeviceMgr({device}));
+    // Wrap the existing device in order to see any captured resources
+    // in its resource manager. The existing device will outlive the
+    // IteratorResource, because we are storing the IteratorResource
+    // in that device's resourc manager.
+    Device* wrapped_device = RenamedDevice::NewRenamedDevice(
+        ctx->device()->name(), down_cast<Device*>(ctx->device()),
+        false /* owns_underlying */, false /* isolate_session_state */);
+    device_mgr->reset(new DeviceMgr({wrapped_device}));
     flib_def->reset(new FunctionLibraryDefinition(
         *ctx->function_library()->GetFunctionLibraryDefinition()));
     pflr->reset(new ProcessFunctionLibraryRuntime(
@@ -542,13 +550,12 @@ class IteratorHandleOp : public OpKernel {
         {} /* TODO(mrry): OptimizerOptions? */,
         nullptr /* TODO(mrry): ClusterFLR */));
 
-    return (*pflr)->GetFLR(device->name());
+    return (*pflr)->GetFLR(ctx->device()->name());
   }
 
   mutex mu_;
-  ContainerInfo cinfo_ GUARDED_BY(mu_);
+  ContainerInfo cinfo_;  // Written once under mu_ then constant afterwards.
   IteratorResource* resource_ GUARDED_BY(mu_) = nullptr;
-  PersistentTensor handle_ GUARDED_BY(mu_);
   DataTypeVector output_dtypes_;
   std::vector<PartialTensorShape> output_shapes_;
   const int graph_def_version_;
diff --git a/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc b/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc
index c529f671f2bb7fd3eb5277c23867e25ba70fd046..9ce263732f6e6c907dfdc89692455daa5dca86d1 100644
--- a/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc
@@ -183,7 +183,7 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
               TensorShape component_shape(
                   batch_results_[current_batch_index_].output[i].shape());
               component_shape.set_dim(0, num_elements);
-              Tensor component(cpu_allocator(), output[i].dtype(),
+              Tensor component(ctx->allocator({}), output[i].dtype(),
                                component_shape);
               TF_RETURN_IF_ERROR(
                   CopyPartialBatch(&component, output[i], num_elements));
@@ -244,7 +244,8 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
         return Status::OK();
       }
 
-      void EnsureOutputAllocated(BatchResult* batch_result,
+      void EnsureOutputAllocated(IteratorContext* ctx,
+                                 BatchResult* batch_result,
                                  const std::vector<Tensor>& return_values) {
         mutex_lock l(batch_result->mu);
         if (batch_result->output_allocated) {
@@ -254,7 +255,7 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
         for (size_t i = 0; i < num_components; ++i) {
           TensorShape component_shape({dataset()->batch_size_});
           component_shape.AppendShape(return_values[i].shape());
-          Tensor component(cpu_allocator(), return_values[i].dtype(),
+          Tensor component(ctx->allocator({}), return_values[i].dtype(),
                            component_shape);
           batch_result->output.emplace_back(std::move(component));
         }
@@ -285,10 +286,9 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
               dataset()->captured_func_->RunAsync(
                   ctx, std::move(input_element), &result->return_values,
                   [this, ctx, result, batch_result, offset](Status ret_status) {
-                    delete ctx;
                     result->status.Update(ret_status);
                     if (ret_status.ok()) {
-                      EnsureOutputAllocated(batch_result,
+                      EnsureOutputAllocated(ctx, batch_result,
                                             result->return_values);
                       const size_t num_components =
                           result->return_values.size();
@@ -318,6 +318,7 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
                         }
                       }
                     }
+                    delete ctx;
                     // NOTE(mrry): We clear the return values here to release
                     // any memory associated with them and to paralellize the
                     // destruction of the tensors (which can be surprisingly
diff --git a/tensorflow/core/kernels/data/padded_batch_dataset_op.cc b/tensorflow/core/kernels/data/padded_batch_dataset_op.cc
index 346eca0bb2ab1c7a82ddba98063c0ccb71b4e58f..cfb4efda9a56fde04994201f509cf3d9fb45ea82 100644
--- a/tensorflow/core/kernels/data/padded_batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/padded_batch_dataset_op.cc
@@ -15,6 +15,7 @@ limitations under the License.
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_util.h"
+#include "tensorflow/core/kernels/batch_util.h"
 #include "tensorflow/core/kernels/data/dataset.h"
 
 namespace tensorflow {
@@ -24,102 +25,6 @@ namespace {
 // See documentation in ../ops/dataset_ops.cc for a high-level
 // description of the following op.
 
-// The following five functions are copied from padding_fifo_queue.cc.
-// TODO(mrry): Reconcile these functions with the similar methods in the
-// queue implementation.
-Status ValidateElementToLargerSlice(const Tensor& element, Tensor* parent) {
-  DCHECK_NE(parent->dim_size(0), 0);
-  if (element.NumElements() > (parent->NumElements() / parent->dim_size(0))) {
-    TensorShape chip_shape = parent->shape();
-    chip_shape.RemoveDim(0);
-    return errors::Internal(
-        "HandleElementToLargerSlice Cannot copy slice: number of entries in "
-        "element is greater than number of elements in parent slice.  ",
-        "Shapes are: [element]: ", element.shape().DebugString(),
-        ", [parent slice]: ", chip_shape.DebugString());
-  }
-  return Status::OK();
-}
-
-template <typename T, int NDIMS>
-Status HandleElementToLargerSlice(const Tensor& element, Tensor* parent,
-                                  int index) {
-  TF_RETURN_IF_ERROR(ValidateElementToLargerSlice(element, parent));
-  if (element.NumElements() == 0) {
-    return Status::OK();
-  }
-  auto element_t = element.tensor<T, NDIMS>();
-  auto parent_t = parent->tensor<T, NDIMS + 1>();
-  Eigen::DSizes<Eigen::DenseIndex, NDIMS + 1> slice_indices;
-  slice_indices[0] = index;
-  Eigen::DSizes<Eigen::DenseIndex, NDIMS + 1> slice_size;
-  slice_size[0] = 1;
-  for (size_t i = 1; i < slice_size.size(); ++i) {
-    slice_size[i] = element_t.dimension(i - 1);
-  }
-  parent_t.slice(slice_indices, slice_size) = element_t.reshape(slice_size);
-  return Status::OK();
-}
-
-template <int NDIMS>
-Status HandleElementToLargerSliceWithRank(const Tensor& element, Tensor* parent,
-                                          int index) {
-#define HANDLE_TYPE(T)                                                   \
-  case DataTypeToEnum<T>::value: {                                       \
-    return HandleElementToLargerSlice<T, NDIMS>(element, parent, index); \
-  }
-
-  switch (element.dtype()) {
-    TF_CALL_DATASET_TYPES(HANDLE_TYPE);
-#undef HANDLE_TYPE
-    default:
-      return errors::Unimplemented(
-          "HandleElementToLargerSliceWithRank Unhandled data type: ",
-          element.dtype());
-  }
-}
-
-Status CopyElementToLargerSlice(const Tensor& element, Tensor* parent,
-                                int index) {
-  if (parent->dims() != element.dims() + 1) {
-    return errors::Internal(
-        "Mismatched ranks.  Element's rank is: ", element.dims(),
-        " but element is meant to be a slice in output Tensor having rank: ",
-        parent->dims(), " (should be: ", element.dims() + 1, ")");
-  }
-
-#define HANDLE_DIMS(NDIMS)                                                  \
-  case NDIMS: {                                                             \
-    TF_RETURN_IF_ERROR(                                                     \
-        HandleElementToLargerSliceWithRank<NDIMS>(element, parent, index)); \
-    return Status::OK();                                                    \
-  }
-
-  switch (element.dims()) {
-    HANDLE_DIMS(0);
-    HANDLE_DIMS(1);
-    HANDLE_DIMS(2);
-    HANDLE_DIMS(3);
-    HANDLE_DIMS(4);
-#undef HANDLE_DIMS
-    default:
-      return errors::Unimplemented("CopyElementToLargerSlice Unhandled rank: ",
-                                   element.dims());
-  }
-}
-
-Status SetElementZero(Tensor* element, const Tensor& padding) {
-#define HANDLE_TYPE(T)                                     \
-  if (element->dtype() == DataTypeToEnum<T>::value) {      \
-    element->flat<T>().setConstant(padding.scalar<T>()()); \
-    return Status::OK();                                   \
-  }
-  TF_CALL_DATASET_TYPES(HANDLE_TYPE);
-#undef HANDLE_TYPE
-  return errors::Unimplemented("SetElementZero Unhandled data type: ",
-                               element->dtype());
-}
-
 class PaddedBatchDatasetOp : public UnaryDatasetOpKernel {
  public:
   explicit PaddedBatchDatasetOp(OpKernelConstruction* ctx)
@@ -376,20 +281,27 @@ class PaddedBatchDatasetOp : public UnaryDatasetOpKernel {
 
           // 2. Copy each batch element to the appropriate location in
           // the output component tensor.
-          Tensor batch_component(cpu_allocator(),
+          Tensor batch_component(ctx->allocator({}),
                                  output_dtypes()[component_index],
                                  batch_component_shape);
-          TF_RETURN_IF_ERROR(SetElementZero(
+          TF_RETURN_IF_ERROR(batch_util::SetElementZero(
               &batch_component, dataset()->padding_values_[component_index]));
 
           // Build the output tuple component by copying one slice
           // from each input element in the batch.
+          TensorShape component_shape({});
+          for (int i = 1; i < batch_component_shape.dims(); ++i) {
+            component_shape.AddDim(batch_component_shape.dim_size(i));
+          }
           for (int64 i = 0; i < num_batch_elements; ++i) {
-            TF_RETURN_IF_ERROR(ValidateElementToLargerSlice(
-                batch_elements[i][component_index], &batch_component));
-
-            TF_RETURN_IF_ERROR(CopyElementToLargerSlice(
-                batch_elements[i][component_index], &batch_component, i));
+            // Take the fast path if possible.
+            if (batch_elements[i][component_index].shape() == component_shape) {
+              TF_RETURN_IF_ERROR(batch_util::CopyElementToSlice(
+                  batch_elements[i][component_index], &batch_component, i));
+            } else {
+              TF_RETURN_IF_ERROR(batch_util::CopyElementToLargerSlice(
+                  batch_elements[i][component_index], &batch_component, i));
+            }
           }
           out_tensors->push_back(std::move(batch_component));
         }
diff --git a/tensorflow/core/kernels/data/random_dataset_op.cc b/tensorflow/core/kernels/data/random_dataset_op.cc
index bc638864b0147f4d71b3382ea320453e972ba8d7..210b9ad1b84eeb0c106b0ee538b4957aba7ce1b2 100644
--- a/tensorflow/core/kernels/data/random_dataset_op.cc
+++ b/tensorflow/core/kernels/data/random_dataset_op.cc
@@ -99,7 +99,7 @@ class RandomDatasetOp : public DatasetOpKernel {
                              std::vector<Tensor>* out_tensors,
                              bool* end_of_sequence) override {
         mutex_lock l(mu_);
-        Tensor value_tensor(cpu_allocator(), DT_INT64, {});
+        Tensor value_tensor(ctx->allocator({}), DT_INT64, {});
         value_tensor.scalar<int64>()() = Random();
         out_tensors->emplace_back(std::move(value_tensor));
         *end_of_sequence = false;
diff --git a/tensorflow/core/kernels/data/range_dataset_op.cc b/tensorflow/core/kernels/data/range_dataset_op.cc
index d0bc61acd99afae14ddc8a3e678acb4197fcea71..b57518e678ed185a183e0413d6e90f2a9f85e9fc 100644
--- a/tensorflow/core/kernels/data/range_dataset_op.cc
+++ b/tensorflow/core/kernels/data/range_dataset_op.cc
@@ -100,7 +100,7 @@ class RangeDatasetOp : public DatasetOpKernel {
           *end_of_sequence = true;
           return Status::OK();
         }
-        Tensor value_tensor(cpu_allocator(), DT_INT64, {});
+        Tensor value_tensor(ctx->allocator({}), DT_INT64, {});
         value_tensor.scalar<int64>()() = next_;
         out_tensors->emplace_back(std::move(value_tensor));
         *end_of_sequence = false;
diff --git a/tensorflow/core/kernels/data/reader_dataset_ops.cc b/tensorflow/core/kernels/data/reader_dataset_ops.cc
index aa39fffc2e344db8143b700cbba4c29bdb134964..34d7d9f914d7a726135febabb1fbe35b0146977c 100644
--- a/tensorflow/core/kernels/data/reader_dataset_ops.cc
+++ b/tensorflow/core/kernels/data/reader_dataset_ops.cc
@@ -141,7 +141,7 @@ class TextLineDatasetOp : public DatasetOpKernel {
 
             if (s.ok()) {
               // Produce the line as output.
-              Tensor line_tensor(cpu_allocator(), DT_STRING, {});
+              Tensor line_tensor(ctx->allocator({}), DT_STRING, {});
               line_tensor.scalar<string>()() = line_contents;
               out_tensors->emplace_back(std::move(line_tensor));
               *end_of_sequence = false;
@@ -384,7 +384,7 @@ class FixedLengthRecordDatasetOp : public DatasetOpKernel {
               TF_RETURN_IF_ERROR(
                   input_buffer_->ReadNBytes(dataset()->record_bytes_, &record));
               // Produce the record as output.
-              Tensor record_tensor(cpu_allocator(), DT_STRING, {});
+              Tensor record_tensor(ctx->allocator({}), DT_STRING, {});
               record_tensor.scalar<string>()() = record;
               out_tensors->emplace_back(std::move(record_tensor));
               *end_of_sequence = false;
@@ -589,7 +589,7 @@ class TFRecordDatasetOp : public DatasetOpKernel {
         do {
           // We are currently processing a file, so try to read the next record.
           if (reader_) {
-            Tensor result_tensor(cpu_allocator(), DT_STRING, {});
+            Tensor result_tensor(ctx->allocator({}), DT_STRING, {});
             Status s = reader_->ReadRecord(&result_tensor.scalar<string>()());
             if (s.ok()) {
               out_tensors->emplace_back(std::move(result_tensor));
diff --git a/tensorflow/core/kernels/data/skip_dataset_op.cc b/tensorflow/core/kernels/data/skip_dataset_op.cc
index 13c2501bbbd43bdb6c3c521db4c3830934ee91db..d636c37afe2aa0566df7d4a38a8d393c34fd0195 100644
--- a/tensorflow/core/kernels/data/skip_dataset_op.cc
+++ b/tensorflow/core/kernels/data/skip_dataset_op.cc
@@ -128,8 +128,8 @@ class SkipDatasetOp : public UnaryDatasetOpKernel {
         while (i_ < dataset()->count_) {
           // Fetch and throw away Tensors.
           std::vector<Tensor> dummy_out_tensors;
-          TF_RETURN_IF_ERROR(input_impl_->GetNext(ctx, &dummy_out_tensors,
-                                                  end_of_sequence));
+          TF_RETURN_IF_ERROR(
+              input_impl_->GetNext(ctx, &dummy_out_tensors, end_of_sequence));
           if (*end_of_sequence) {
             // We reached the end before the count was reached.
             input_impl_.reset();
@@ -140,8 +140,8 @@ class SkipDatasetOp : public UnaryDatasetOpKernel {
         }
 
         // Return GetNext() on the underlying iterator.
-        TF_RETURN_IF_ERROR(input_impl_->GetNext(ctx, out_tensors,
-                                                end_of_sequence));
+        TF_RETURN_IF_ERROR(
+            input_impl_->GetNext(ctx, out_tensors, end_of_sequence));
         if (*end_of_sequence) {
           input_impl_.reset();
         }
@@ -184,8 +184,7 @@ class SkipDatasetOp : public UnaryDatasetOpKernel {
   };
 };
 
-REGISTER_KERNEL_BUILDER(Name("SkipDataset").Device(DEVICE_CPU),
-                        SkipDatasetOp);
+REGISTER_KERNEL_BUILDER(Name("SkipDataset").Device(DEVICE_CPU), SkipDatasetOp);
 
 }  // namespace
 
diff --git a/tensorflow/core/kernels/data/sql/BUILD b/tensorflow/core/kernels/data/sql/BUILD
index 0286825af3ef7c04fff6911ddf7daec76479a715..f4698bdaf7ae9767e068e49dad61d2a3d9f739a8 100644
--- a/tensorflow/core/kernels/data/sql/BUILD
+++ b/tensorflow/core/kernels/data/sql/BUILD
@@ -33,6 +33,7 @@ cc_library(
     deps = [
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
+        "//tensorflow/core/kernels/data:dataset",
         "//tensorflow/core/lib/db:sqlite",
     ],
 )
diff --git a/tensorflow/core/kernels/data/sql/query_connection.h b/tensorflow/core/kernels/data/sql/query_connection.h
index f31017bd1981c3809d9b7daaa2dc56256d19d914..e9ffca202ff32f0c0130427c2699ce0449a0903a 100644
--- a/tensorflow/core/kernels/data/sql/query_connection.h
+++ b/tensorflow/core/kernels/data/sql/query_connection.h
@@ -19,6 +19,8 @@ limitations under the License.
 
 namespace tensorflow {
 
+class IteratorContext;
+
 namespace sql {
 // This interface allows a user to connect to a database, execute a query, and
 // iterate over the result set, putting the results into an output tensor.
@@ -56,7 +58,7 @@ class QueryConnection {
   // If there are no more rows in the result set, then instead `true` will be
   // stored in `*end_of_sequence`, and the content of `*out_tensors` will be
   // undefined.
-  virtual Status GetNext(std::vector<Tensor>* out_tensors,
+  virtual Status GetNext(IteratorContext* ctx, std::vector<Tensor>* out_tensors,
                          bool* end_of_sequence) = 0;
 };
 
diff --git a/tensorflow/core/kernels/data/sql/sqlite_query_connection.cc b/tensorflow/core/kernels/data/sql/sqlite_query_connection.cc
index 029a0aab97290e30783e415274323a1e43f9740b..7cd07bd8eca160bfc62e15adc568742c84711779 100644
--- a/tensorflow/core/kernels/data/sql/sqlite_query_connection.cc
+++ b/tensorflow/core/kernels/data/sql/sqlite_query_connection.cc
@@ -15,6 +15,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/data/sql/sqlite_query_connection.h"
 
 #include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/kernels/data/dataset.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 
 namespace tensorflow {
@@ -48,14 +49,16 @@ Status SqliteQueryConnection::Close() {
   return Status::OK();
 }
 
-Status SqliteQueryConnection::GetNext(std::vector<Tensor>* out_tensors,
+Status SqliteQueryConnection::GetNext(IteratorContext* ctx,
+                                      std::vector<Tensor>* out_tensors,
                                       bool* end_of_sequence) {
   if (!stmt_) TF_RETURN_IF_ERROR(PrepareQuery());
   TF_RETURN_IF_ERROR(stmt_.Step(end_of_sequence));
   if (!*end_of_sequence) {
     for (int i = 0; i < column_count_; i++) {
       DataType dt = output_types_[i];
-      Tensor tensor(cpu_allocator(), dt, {});
+      // TODO(mrry): Pass in the `IteratorContext::allocator()`.
+      Tensor tensor(ctx->allocator({}), dt, {});
       FillTensorWithResultSetEntry(dt, i, &tensor);
       out_tensors->emplace_back(std::move(tensor));
     }
diff --git a/tensorflow/core/kernels/data/sql/sqlite_query_connection.h b/tensorflow/core/kernels/data/sql/sqlite_query_connection.h
index 787c17d6c00d99afad3d7814c3c2daaf4295b1b3..81b19530b7d5964e17bde996de9fa7766af318b7 100644
--- a/tensorflow/core/kernels/data/sql/sqlite_query_connection.h
+++ b/tensorflow/core/kernels/data/sql/sqlite_query_connection.h
@@ -32,7 +32,7 @@ class SqliteQueryConnection : public QueryConnection {
   Status Open(const string& data_source_name, const string& query,
               const DataTypeVector& output_types) override;
   Status Close() override;
-  Status GetNext(std::vector<Tensor>* out_tensors,
+  Status GetNext(IteratorContext* ctx, std::vector<Tensor>* out_tensors,
                  bool* end_of_sequence) override;
 
  private:
diff --git a/tensorflow/core/kernels/data/sql_dataset_ops.cc b/tensorflow/core/kernels/data/sql_dataset_ops.cc
index 72302190802d17f2cb1ed5471017180238aedff3..d50e9c9cf9739044379c7bbe753fc4acc2de311e 100644
--- a/tensorflow/core/kernels/data/sql_dataset_ops.cc
+++ b/tensorflow/core/kernels/data/sql_dataset_ops.cc
@@ -116,7 +116,7 @@ class SqlDatasetOp : public DatasetOpKernel {
         }
       }
 
-      Status GetNextInternal(IteratorContext* /*ctx*/,
+      Status GetNextInternal(IteratorContext* ctx,
                              std::vector<Tensor>* out_tensors,
                              bool* end_of_sequence) override {
         mutex_lock l(mu_);
@@ -132,7 +132,7 @@ class SqlDatasetOp : public DatasetOpKernel {
             return s;
           }
         }
-        return query_connection_->GetNext(out_tensors, end_of_sequence);
+        return query_connection_->GetNext(ctx, out_tensors, end_of_sequence);
       }
 
      private:
diff --git a/tensorflow/core/kernels/data/tensor_queue_dataset_op.cc b/tensorflow/core/kernels/data/tensor_queue_dataset_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ff412a4671bd0307e4975027ebd1e098353de238
--- /dev/null
+++ b/tensorflow/core/kernels/data/tensor_queue_dataset_op.cc
@@ -0,0 +1,646 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <deque>
+
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/variant.h"
+#include "tensorflow/core/framework/variant_encode_decode.h"
+#include "tensorflow/core/kernels/batch_util.h"
+#include "tensorflow/core/kernels/data/dataset.h"
+
+namespace tensorflow {
+
+namespace {
+
+bool IsGreaterEqualToOrCompatibleWith(const PartialTensorShape& a,
+                                      const PartialTensorShape& b) {
+  // Returns true if dims[a] >= dims[b], or are compatible.
+  if (a.unknown_rank()) return true;
+  if (a.dims() != b.dims()) return false;
+  for (int d = 0; d < a.dims(); ++d) {
+    if (a.dim_size(d) == -1 || b.dim_size(d) == -1) continue;
+    if (a.dim_size(d) < b.dim_size(d)) return false;
+  }
+  return true;
+}
+
+DataTypeVector PrependQueueType(const DataTypeVector& dtypes) {
+  DataTypeVector out;
+  out.reserve(dtypes.size() + 1);
+  out.push_back(DT_VARIANT);  // The queue component.
+  for (const DataType& d : dtypes) out.push_back(d);
+  return out;
+}
+
+std::vector<PartialTensorShape> PrependQueueShapeWithBatch(
+    const std::vector<PartialTensorShape>& shapes) {
+  std::vector<PartialTensorShape> out;
+  out.reserve(shapes.size() + 1);
+  out.emplace_back(PartialTensorShape({-1}));  // The queue component.
+  for (PartialTensorShape s : shapes) {
+    s.InsertDim(0, -1);  // Unknown batch size.
+    out.push_back(std::move(s));
+  }
+  return out;
+}
+
+class EnqueueInQueueDatasetOp;
+
+class PrependFromQueueAndPaddedBatchDataset : public GraphDatasetBase {
+ public:
+  PrependFromQueueAndPaddedBatchDataset(
+      OpKernelContext* ctx, const int64 batch_size, const DatasetBase* input,
+      const DataTypeVector& dtypes,
+      const std::vector<PartialTensorShape>& shapes,
+      std::vector<Tensor> padding_values)
+      : GraphDatasetBase(ctx),
+        batch_size_(batch_size),
+        input_(input),
+        dtypes_(dtypes),
+        shapes_(shapes),
+        padding_values_(std::move(padding_values)),
+        dtypes_with_queue_(PrependQueueType(dtypes)),
+        batched_shapes_with_queue_(PrependQueueShapeWithBatch(shapes)) {
+    input_->Ref();
+  }
+
+  ~PrependFromQueueAndPaddedBatchDataset() override { input_->Unref(); }
+
+  std::unique_ptr<IteratorBase> MakeIterator(
+      const string& prefix) const override {
+    return std::unique_ptr<IteratorBase>(new Iterator(
+        {this, strings::StrCat(prefix, "::PrependFromQueueAndPaddedBatch")}));
+  }
+
+  const DataTypeVector& output_dtypes() const override {
+    return dtypes_with_queue_;
+  }
+  const std::vector<PartialTensorShape>& output_shapes() const override {
+    return batched_shapes_with_queue_;
+  }
+
+  string DebugString() override {
+    return "PrependFromQueueAndPaddedBatchDatasetOp::Dataset";
+  }
+
+ protected:
+  Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+                            Node** output) const override {
+    Node* input_graph = nullptr;
+    TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_graph));
+    Node* batch_size = nullptr;
+    TF_RETURN_IF_ERROR(b->AddScalar(batch_size_, &batch_size));
+
+    std::vector<Node*> padded_shapes;
+    padded_shapes.reserve(shapes_.size());
+    for (int i = 0; i < shapes_.size(); i++) {
+      Node* node;
+      Tensor t(DT_INT64, TensorShape({shapes_[i].dims()}));
+      for (int j = 0; j < shapes_[i].dims(); j++) {
+        t.vec<int64>()(j) = shapes_[i].dim_size(j);
+      }
+      TF_RETURN_IF_ERROR(b->AddTensor(t, &node));
+      padded_shapes.emplace_back(node);
+    }
+
+    std::vector<Node*> padding_values;
+    padding_values.reserve(padding_values_.size());
+    for (const Tensor& t : padding_values_) {
+      Node* node;
+      TF_RETURN_IF_ERROR(b->AddTensor(t, &node));
+      padding_values.emplace_back(node);
+    }
+
+    AttrValue output_types;
+    b->BuildAttrValue(dtypes_, &output_types);
+
+    AttrValue output_shapes;
+    b->BuildAttrValue(batched_shapes_with_queue_, &output_shapes);
+
+    AttrValue N;
+    b->BuildAttrValue<int64>(shapes_.size(), &N);
+
+    TF_RETURN_IF_ERROR(b->AddDataset(this, {{0, input_graph}, {1, batch_size}},
+                                     {{2, padded_shapes}, {3, padding_values}},
+                                     {{"Toutput_types", output_types},
+                                      {"output_shapes", output_shapes},
+                                      {"N", N}},
+                                     output));
+
+    return Status::OK();
+  }
+
+ private:
+  friend class EnqueueInQueueDatasetOp;
+
+  class Iterator
+      : public DatasetIterator<PrependFromQueueAndPaddedBatchDataset> {
+   public:
+    explicit Iterator(const Params& params)
+        : DatasetIterator<PrependFromQueueAndPaddedBatchDataset>(params),
+          queue_(new TensorQueue(/*input_impl*/
+                                 params.dataset->input_->MakeIterator(
+                                     params.prefix),
+                                 params.dataset->dtypes_,
+                                 params.dataset->shapes_)) {}
+
+    ~Iterator() override { queue_->Unref(); }
+
+    Status GetNextInternal(IteratorContext* ctx,
+                           std::vector<Tensor>* out_tensors,
+                           bool* end_of_sequence) override {
+      std::vector<std::vector<Tensor>> batch;
+      TF_RETURN_IF_ERROR(queue_->GetNext(ctx, dataset()->batch_size_, &batch,
+                                         end_of_sequence));
+      const auto& dtypes = dataset()->dtypes_;
+      const auto& shapes = dataset()->shapes_;
+      const auto& input_shapes = dataset()->input_->output_shapes();
+      const auto& padding_values = dataset()->padding_values_;
+      const int64 batch_size = batch.size();
+      out_tensors->reserve(dtypes.size());
+
+      std::vector<TensorShape> max_shapes;  // Of non-queue components.
+      for (int i = 0; i < dtypes.size(); ++i) {
+        const PartialTensorShape& shape = shapes[i];
+        TensorShape out_shape({batch_size});
+        for (int r = 0; r < shape.dims(); ++r) {
+          if (shape.dim_size(r) >= 0) {
+            // padded_shape[r] is known.
+            out_shape.AddDim(shape.dim_size(r));
+          } else {
+            // padded_shape[r] is unknown, find the maximum across
+            // the batch.
+            int64 dim = 0;
+            for (int b = 0; b < batch.size(); ++b) {
+              dim = std::max(dim, batch[b][i].dim_size(r));
+            }
+            out_shape.AddDim(dim);
+          }
+        }
+        max_shapes.push_back(std::move(out_shape));
+      }
+
+      Tensor queues_t(cpu_allocator(), DT_VARIANT, TensorShape({batch_size}));
+      if (!batch.empty()) {
+        auto queues = queues_t.flat<Variant>();
+        Variant& queue_inserter = queues(0);
+        queue_inserter = TensorQueueInserter();
+        queue_inserter.get<TensorQueueInserter>()->set_queue(queue_);
+        for (int b = 1; b < batch.size(); ++b) {
+          // Copy the TensorQueueInserter.  Each copy increments the
+          // Ref on the queue_.
+          queues(b) = queues(0);
+        }
+      }
+      out_tensors->push_back(std::move(queues_t));
+
+      for (int i = 0; i < max_shapes.size(); ++i) {
+        Tensor component(cpu_allocator(), dtypes[i], max_shapes[i]);
+        // Try hard to take the fast path.
+        if (shapes[i].IsFullyDefined() &&
+            shapes[i].IsIdenticalTo(input_shapes[i])) {
+          // Take the fast path if we know all the shapes statically.
+          for (int64 b = 0; b < batch.size(); ++b) {
+            TF_RETURN_IF_ERROR(
+                batch_util::CopyElementToSlice(batch[b][i], &component, b));
+          }
+        } else {
+          TF_RETURN_IF_ERROR(
+              batch_util::SetElementZero(&component, padding_values[i]));
+          for (int64 b = 0; b < batch.size(); ++b) {
+            if (batch[b][i].shape() == max_shapes[i]) {
+              TF_RETURN_IF_ERROR(
+                  batch_util::CopyElementToSlice(batch[b][i], &component, b));
+            } else {
+              TF_RETURN_IF_ERROR(batch_util::CopyElementToLargerSlice(
+                  batch[b][i], &component, b));
+            }
+          }
+        }
+        out_tensors->push_back(std::move(component));
+      }
+
+      // end_of_sequence was set before we populated out_tensors, so
+      // it's ok to return now.
+      return Status::OK();
+    }
+
+   protected:
+    // Work around bug in MSVC that disallows access to protected
+    // members of Iterator from within TensorQueue.
+    class TensorQueue;
+    friend class TensorQueue;
+
+    class TensorQueue : public core::RefCounted {
+     public:
+      TensorQueue(std::unique_ptr<IteratorBase> input_impl,
+                  const DataTypeVector& dtypes,
+                  const std::vector<PartialTensorShape>& shapes)
+          : dtypes_(dtypes),
+            shapes_(shapes),
+            input_impl_(std::move(input_impl)) {}
+
+      void MaybeWaitForNotificationLocked(mutex_lock* lock)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        // This essentially just releases the lock and immediately relocks.
+        cv_.wait_for(*lock, std::chrono::milliseconds(0));
+      }
+
+      void NotifyLocked() EXCLUSIVE_LOCKS_REQUIRED(mu_) { cv_.notify_all(); }
+
+      Status GetNext(IteratorContext* ctx, const int64 batch_size,
+                     std::vector<std::vector<Tensor>>* batch,
+                     bool* end_of_sequence) {
+        mutex_lock lock(mu_);
+
+        *end_of_sequence = false;
+
+        for (int64 b = 0; b < batch_size;) {
+          if (!entries_.empty()) {
+            batch->push_back(std::move(entries_.front()));
+            entries_.pop_front();
+            ++b;
+            continue;
+          } else {
+            if (input_impl_) {
+              // There's still input coming in.
+              std::vector<Tensor> tensors;
+              bool input_end;
+              TF_RETURN_IF_ERROR(
+                  input_impl_->GetNext(ctx, &tensors, &input_end));
+              if (!input_end) {
+                batch->push_back(std::move(tensors));
+                ++b;
+                continue;
+              } else {
+                input_impl_.reset();
+              }
+            }
+            if (!input_impl_) {
+              // There's no more input coming in.
+              if (RefCountIsOne()) {
+                // No TensorQueueInserters in the wild.
+                if (batch->empty()) {
+                  *end_of_sequence = true;
+                }
+                break;
+              } else {
+                MaybeWaitForNotificationLocked(&lock);
+                // If there's data available, try to add entries again.
+                // Otherwise return a smaller batch and hope the next
+                // iterator request has a non-empty or unused queue_.
+                if (entries_.empty()) {
+                  break;
+                }
+              }
+            }
+          }
+        }  // for (int64 b = ... batch_size)
+        return Status::OK();
+      }
+
+      Status Insert(const std::vector<Tensor>& tensors) {
+        if (tensors.size() != dtypes_.size()) {
+          return errors::InvalidArgument(
+              "TensorQueue::Insert: mismatched number of tensors.  Queue "
+              "expects ",
+              dtypes_.size(), " tensors but tried to insert ", tensors.size());
+        }
+        for (int i = 0; i < tensors.size(); ++i) {
+          if (tensors[i].dtype() != dtypes_[i]) {
+            return errors::InvalidArgument(
+                "TensorQueue::Insert: mismatched dtypes at component ", i,
+                ".  Attempted "
+                "to insert tensor of type ",
+                DataTypeString(tensors[i].dtype()),
+                " but queue expected type: ", DataTypeString(dtypes_[i]));
+          }
+          if (!shapes_[i].IsCompatibleWith(tensors[i].shape())) {
+            return errors::InvalidArgument(
+                "TensorQueue::Insert: mismatched shapes at component ", i,
+                ".  Attempted "
+                "to insert tensor with shape ",
+                tensors[i].shape().DebugString(),
+                " but queue expected shape: ", shapes_[i].DebugString());
+          }
+        }
+        mutex_lock lock(mu_);
+        entries_.push_back(tensors);
+        NotifyLocked();
+        return Status::OK();
+      }
+
+      Status Save(Iterator* iter, IteratorStateWriter* writer) {
+        mutex_lock lock(mu_);
+        if (input_impl_) {
+          TF_RETURN_IF_ERROR(iter->SaveParent(writer, input_impl_));
+        } else {
+          TF_RETURN_IF_ERROR(
+              writer->WriteScalar(iter->full_name("input_exhausted"), ""));
+        }
+        TF_RETURN_IF_ERROR(writer->WriteScalar(iter->full_name("entries_size"),
+                                               entries_.size()));
+        for (int64 b = 0; b < entries_.size(); ++b) {
+          for (int i = 0; i < dtypes_.size(); ++i) {
+            TF_RETURN_IF_ERROR(
+                writer->WriteTensor(strings::StrCat(iter->full_name("entries"),
+                                                    "[", b, "][", i, "]"),
+                                    entries_[b][i]));
+          }
+        }
+        return Status::OK();
+      }
+
+      Status Restore(Iterator* iter, IteratorContext* ctx,
+                     IteratorStateReader* reader) {
+        mutex_lock l(mu_);
+        if (reader->Contains(iter->full_name("input_exhausted"))) {
+          input_impl_.reset();
+        } else {
+          input_impl_ = iter->dataset_input()->MakeIterator(iter->prefix());
+          TF_RETURN_IF_ERROR(iter->RestoreParent(ctx, reader, input_impl_));
+        }
+        entries_.clear();
+        int64 entries_size = -1;
+        TF_RETURN_IF_ERROR(
+            reader->ReadScalar(iter->full_name("entries_size"), &entries_size));
+        if (entries_size < 0) {
+          return errors::DataLoss(
+              "Expected entries_size key '", iter->full_name("entries_size"),
+              "' to have nonnegative value, but saw: ", entries_size);
+        }
+        for (int64 b = 0; b < entries_size; ++b) {
+          std::vector<Tensor> entry;
+          for (int i = 0; i < dtypes_.size(); ++i) {
+            Tensor value;
+            TF_RETURN_IF_ERROR(
+                reader->ReadTensor(strings::StrCat(iter->full_name("entries"),
+                                                   "[", b, "][", i, "]"),
+                                   &value));
+            entry.push_back(std::move(value));
+          }
+          entries_.push_back(std::move(entry));
+        }
+        return Status::OK();
+      }
+
+      mutex* mu() { return &mu_; }
+
+     private:
+      DataTypeVector dtypes_;
+      std::vector<PartialTensorShape> shapes_;
+
+      mutex mu_;
+      std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
+      std::deque<std::vector<Tensor>> entries_ GUARDED_BY(mu_);
+      condition_variable cv_ GUARDED_BY(mu_);
+    };
+
+    const DatasetBase* dataset_input() const { return dataset()->input_; }
+
+    Status SaveInternal(IteratorStateWriter* writer) override {
+      return queue_->Save(this, writer);
+    }
+
+    Status RestoreInternal(IteratorContext* ctx,
+                           IteratorStateReader* reader) override {
+      return queue_->Restore(this, ctx, reader);
+    }
+
+   public:
+    class TensorQueueInserter {
+     public:
+      TensorQueueInserter() : queue_(nullptr) {}
+
+      void set_queue(TensorQueue* queue) {
+        queue_ = queue;
+        queue_->Ref();
+      }
+
+      TensorQueueInserter(const TensorQueueInserter& rhs) {
+        queue_ = rhs.queue_;
+        queue_->Ref();
+      };
+
+      TensorQueueInserter(TensorQueueInserter&& rhs) {
+        queue_ = rhs.queue_;
+        rhs.queue_ = nullptr;
+      }
+
+      TensorQueueInserter& operator=(const TensorQueueInserter& rhs) = delete;
+
+      string TypeName() const { return "tensorflow::TensorQueueInserter"; }
+      string DebugString() const { return TypeName(); }
+
+      void Encode(VariantTensorData*) const {}
+      bool Decode(const VariantTensorData&) { return false; }
+
+      ~TensorQueueInserter() {
+        if (queue_) {
+          mutex_lock lock(*queue_->mu());
+          queue_->Unref();
+          queue_->NotifyLocked();
+          queue_ = nullptr;
+        }
+      }
+
+      Status Insert(const std::vector<Tensor>& tensors) const {
+        CHECK(queue_);
+        return queue_->Insert(tensors);
+      }
+
+     private:
+      mutable TensorQueue* queue_;
+    };
+
+   private:
+    TensorQueue* const queue_;
+  };
+
+ private:
+  const int64 batch_size_;
+  const DatasetBase* input_;
+  const DataTypeVector dtypes_;
+  const std::vector<PartialTensorShape> shapes_;
+  const std::vector<Tensor> padding_values_;
+  const DataTypeVector dtypes_with_queue_;
+  const std::vector<PartialTensorShape> batched_shapes_with_queue_;
+};
+
+class PrependFromQueueAndPaddedBatchDatasetOp : public UnaryDatasetOpKernel {
+ public:
+  explicit PrependFromQueueAndPaddedBatchDatasetOp(OpKernelConstruction* ctx)
+      : UnaryDatasetOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("Toutput_types", &output_types_));
+  }
+
+  void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
+                   DatasetBase** output) override {
+    int64 batch_size = 0;
+    OP_REQUIRES_OK(ctx,
+                   ParseScalarArgument<int64>(ctx, "batch_size", &batch_size));
+    OP_REQUIRES(
+        ctx, batch_size > 0,
+        errors::InvalidArgument("Batch size must be greater than zero."));
+
+    OpInputList padded_shape_tensors;
+    OP_REQUIRES_OK(ctx,
+                   ctx->input_list("padded_shapes", &padded_shape_tensors));
+    std::vector<PartialTensorShape> padded_shapes;
+    padded_shapes.reserve(padded_shape_tensors.size());
+    OP_REQUIRES(ctx,
+                padded_shape_tensors.size() == input->output_shapes().size(),
+                errors::InvalidArgument("Number of padded shapes (",
+                                        padded_shape_tensors.size(),
+                                        ") must match the number of components "
+                                        "in the input dataset's elements (",
+                                        input->output_shapes().size(), ")"));
+    for (const Tensor& padded_shape_t : padded_shape_tensors) {
+      OP_REQUIRES(ctx, TensorShapeUtils::IsVector(padded_shape_t.shape()),
+                  errors::InvalidArgument("All padded shapes must be vectors"));
+      PartialTensorShape padded_shape;
+      OP_REQUIRES_OK(ctx, PartialTensorShape::MakePartialShape(
+                              padded_shape_t.vec<int64>().data(),
+                              padded_shape_t.NumElements(), &padded_shape));
+      padded_shapes.push_back(std::move(padded_shape));
+    }
+
+    OP_REQUIRES(
+        ctx, input->output_dtypes() == output_types_,
+        errors::InvalidArgument("Input dataset and this dataset "
+                                "have different output_types: ",
+                                DataTypeVectorString(input->output_dtypes()),
+                                " and ", DataTypeVectorString(output_types_)));
+
+    for (int i = 0; i < input->output_shapes().size(); ++i) {
+      // Exclude the queue from the tensor_shapes calculation.
+      const PartialTensorShape& tensor_shape = padded_shapes[i];
+      OP_REQUIRES(
+          ctx,
+          IsGreaterEqualToOrCompatibleWith(tensor_shape,
+                                           input->output_shapes()[i]),
+          errors::InvalidArgument("Incompatible input shapes at component ", i,
+                                  " between input dataset this dataset: ",
+                                  input->output_shapes()[i].DebugString(),
+                                  " vs. ", tensor_shape.DebugString()));
+    }
+
+    OpInputList padding_values_list;
+    OP_REQUIRES_OK(ctx,
+                   ctx->input_list("padding_values", &padding_values_list));
+    std::vector<Tensor> padding_values;
+    OP_REQUIRES(ctx,
+                padding_values_list.size() == input->output_shapes().size(),
+                errors::InvalidArgument(
+                    "Number of padding values (", padding_values_list.size(),
+                    ") must match the number of components in the input "
+                    "dataset's elements (",
+                    input->output_shapes().size(), ")"));
+    for (int i = 0; i < padding_values_list.size(); ++i) {
+      const Tensor& padding_value_t = padding_values_list[i];
+      OP_REQUIRES(
+          ctx, TensorShapeUtils::IsScalar(padding_value_t.shape()),
+          errors::InvalidArgument(
+              "All padding values must be scalars; but at component ", i,
+              " saw shape: ", padding_value_t.shape().DebugString()));
+      OP_REQUIRES(ctx, padding_value_t.dtype() == input->output_dtypes()[i],
+                  errors::InvalidArgument(
+                      "Mismatched type between padding value ", i,
+                      " and input dataset's component ", i, ": ",
+                      DataTypeString(padding_value_t.dtype()), " vs. ",
+                      DataTypeString(input->output_dtypes()[i])));
+      padding_values.push_back(padding_value_t);
+    }
+
+    *output = new PrependFromQueueAndPaddedBatchDataset(
+        ctx, batch_size, input, output_types_, padded_shapes,
+        std::move(padding_values));
+  }
+
+ private:
+  DataTypeVector output_types_;
+};
+
+REGISTER_KERNEL_BUILDER(
+    Name("PrependFromQueueAndPaddedBatchDataset").Device(DEVICE_CPU),
+    PrependFromQueueAndPaddedBatchDatasetOp);
+
+class EnqueueInQueueDatasetOp : public OpKernel {
+ public:
+  explicit EnqueueInQueueDatasetOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+  void Compute(OpKernelContext* ctx) override {
+    using TensorQueueInserter =
+        PrependFromQueueAndPaddedBatchDataset::Iterator::TensorQueueInserter;
+
+    // TODO(ebrevdo): accept list of sequence lengths to do proper
+    // sub-slicing of tensors for placement into the queue?
+    const Tensor& tensor_queue_t = ctx->input(0);
+    OP_REQUIRES(ctx, TensorShapeUtils::IsVector(tensor_queue_t.shape()),
+                errors::InvalidArgument("queue must be a vector, saw shape: ",
+                                        tensor_queue_t.shape().DebugString()));
+    std::vector<const TensorQueueInserter*> inserters;
+    const int64 batch_size = tensor_queue_t.NumElements();
+    inserters.reserve(batch_size);
+    const Variant* variants = tensor_queue_t.flat<Variant>().data();
+    for (int i = 0; i < batch_size; ++i) {
+      const auto* inserter = variants[i].get<TensorQueueInserter>();
+      OP_REQUIRES(ctx, inserter != nullptr,
+                  errors::InvalidArgument(
+                      "Could not access TensorQueueInserter from queue[", i,
+                      "].  Received variant: ", variants[i].DebugString()));
+      inserters.push_back(inserter);
+    }
+
+    OpInputList components;
+    OP_REQUIRES_OK(ctx, ctx->input_list("components", &components));
+    for (int i = 0; i < components.size(); ++i) {
+      OP_REQUIRES(
+          ctx,
+          components[i].dims() > 0 && components[i].dim_size(0) == batch_size,
+          errors::InvalidArgument(
+              "Expected component ", i, " to have batched shape [", batch_size,
+              ",...], but saw shape: ", components[i].shape().DebugString()));
+    }
+    std::vector<TensorShape> element_shapes;
+    for (int i = 0; i < components.size(); ++i) {
+      TensorShape element_shape = components[i].shape();
+      element_shape.RemoveDim(0);
+      element_shapes.push_back(std::move(element_shape));
+    }
+    for (int64 b = 0; b < batch_size; ++b) {
+      std::vector<Tensor> tensors;
+      tensors.reserve(components.size());
+      for (int i = 0; i < components.size(); ++i) {
+        Tensor t(components[i].dtype(), element_shapes[i]);
+        OP_REQUIRES_OK(ctx,
+                       batch_util::CopySliceToElement(components[i], &t, b));
+        tensors.push_back(std::move(t));
+      }
+      // TODO(ebrevdo): Acquire the lock once for all inserters with
+      // the same underlying queue?  Add InsertLocked?
+      OP_REQUIRES_OK(ctx, inserters[b]->Insert(tensors));
+    }
+  }
+};
+
+REGISTER_KERNEL_BUILDER(Name("EnqueueInQueueDataset").Device(DEVICE_CPU),
+                        EnqueueInQueueDatasetOp);
+
+}  // namespace
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/tensor_slice_dataset_op.cc b/tensorflow/core/kernels/data/tensor_slice_dataset_op.cc
index 18adae1ea32316ffd995a95fb25198309fda3361..d5be4c778074e406122dc3a1a9c23681fca491d0 100644
--- a/tensorflow/core/kernels/data/tensor_slice_dataset_op.cc
+++ b/tensorflow/core/kernels/data/tensor_slice_dataset_op.cc
@@ -117,7 +117,7 @@ class TensorSliceDatasetOp : public DatasetOpKernel {
           out_tensors->reserve(dataset()->tensors_.size());
           for (int i = 0; i < dataset()->tensors_.size(); ++i) {
             const Tensor& t = dataset()->tensors_[i];
-            Tensor t_slice(cpu_allocator(), t.dtype(),
+            Tensor t_slice(ctx->allocator({}), t.dtype(),
                            TensorShape(dataset()->shapes_[i].dim_sizes()));
             TF_RETURN_IF_ERROR(batch_util::CopySliceToElement(t, &t_slice, i_));
             out_tensors->emplace_back(std::move(t_slice));
diff --git a/tensorflow/core/kernels/debug_ops.cc b/tensorflow/core/kernels/debug_ops.cc
index 965a60c7e05297d7aa7125bfcb7eed062af7a058..1b94ea05440516ff458c1785edd27589d18ffe61 100644
--- a/tensorflow/core/kernels/debug_ops.cc
+++ b/tensorflow/core/kernels/debug_ops.cc
@@ -46,7 +46,7 @@ REGISTER_KERNEL_BUILDER(Name("CopyHost")
                             .HostMemory("input")
                             .HostMemory("output"),
                         CopyOp);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 // Register debug identity (non-ref and ref) ops.
 REGISTER_KERNEL_BUILDER(Name("DebugIdentity").Device(DEVICE_CPU),
@@ -66,7 +66,7 @@ REGISTER_KERNEL_BUILDER(Name("DebugIdentity")
                             .HostMemory("input")
                             .HostMemory("output"),
                         DebugIdentityOp);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 // Register debug NaN-counter (non-ref and ref) ops.
 #define REGISTER_DEBUG_NAN_COUNT(type)                                    \
@@ -98,7 +98,7 @@ REGISTER_GPU_DEBUG_NAN_COUNT(double);
                           DebugNanCountOp<type>);
 REGISTER_GPU_DEBUG_NAN_COUNT(float);
 REGISTER_GPU_DEBUG_NAN_COUNT(double);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 // Register debug numeric summary ops.
 #define REGISTER_DEBUG_NUMERIC_SUMMARY_COUNT(type)        \
diff --git a/tensorflow/core/kernels/debug_ops.h b/tensorflow/core/kernels/debug_ops.h
index 381add3fb3bd57ebf068212cdd32a640bf60dd9b..53a23b130609f8b1f4d2dd9f7665d02154f47364 100644
--- a/tensorflow/core/kernels/debug_ops.h
+++ b/tensorflow/core/kernels/debug_ops.h
@@ -21,7 +21,7 @@ limitations under the License.
 #endif
 #ifdef TENSORFLOW_USE_SYCL
 #include "tensorflow/core/common_runtime/sycl/sycl_util.h"
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 #include "tensorflow/core/debug/debug_io_utils.h"
 #include "tensorflow/core/framework/device_base.h"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -91,7 +91,7 @@ class CopyOp : public OpKernel {
       Device* device = static_cast<Device*>(context->device());
       // Determine if the input tensor is not on CPU (e.g., on GPU).
       const bool off_host_input = device->device_type() == DEVICE_SYCL &&
-                            !context->input_alloc_attr(0).on_host();
+                                  !context->input_alloc_attr(0).on_host();
 
       if (off_host_input) {
         SYCLmemcpy(context->eigen_sycl_device(), src_tensor, copied_tensor);
diff --git a/tensorflow/core/kernels/decode_bmp_op.cc b/tensorflow/core/kernels/decode_bmp_op.cc
index c778278e8fbbec67a0255ea7d257c19da4f3612f..b7d120a617849b2c1a48b38b959f9941eb8503ac 100644
--- a/tensorflow/core/kernels/decode_bmp_op.cc
+++ b/tensorflow/core/kernels/decode_bmp_op.cc
@@ -39,6 +39,13 @@ class DecodeBmpOp : public OpKernel {
         errors::InvalidArgument("channels must be 0, 1, 3 or 4, got ",
                                 channels_));
   }
+  inline int32 ByteSwapInt32ForBigEndian(int32 x) {
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    return le32toh(x);
+#else
+    return x;
+#endif
+  }
 
   void Compute(OpKernelContext* context) override {
     const Tensor& contents = context->input(0);
@@ -56,14 +63,18 @@ class DecodeBmpOp : public OpKernel {
                                         input.size(), " bytes"));
 
     const uint8* img_bytes = reinterpret_cast<const uint8*>(input.data());
-    const int32 header_size = internal::SubtleMustCopy(
+    int32 header_size_ = internal::SubtleMustCopy(
         *(reinterpret_cast<const int32*>(img_bytes + 10)));
-    const int32 width = internal::SubtleMustCopy(
+    const int32 header_size = ByteSwapInt32ForBigEndian(header_size_);
+    int32 width_ = internal::SubtleMustCopy(
         *(reinterpret_cast<const int32*>(img_bytes + 18)));
-    const int32 height = internal::SubtleMustCopy(
+    const int32 width = ByteSwapInt32ForBigEndian(width_);
+    int32 height_ = internal::SubtleMustCopy(
         *(reinterpret_cast<const int32*>(img_bytes + 22)));
-    const int32 bpp = internal::SubtleMustCopy(
+    const int32 height = ByteSwapInt32ForBigEndian(height_);
+    int32 bpp_ = internal::SubtleMustCopy(
         *(reinterpret_cast<const int32*>(img_bytes + 28)));
+    const int32 bpp = ByteSwapInt32ForBigEndian(bpp_);
 
     if (channels_) {
       OP_REQUIRES(context, (channels_ == bpp / 8),
diff --git a/tensorflow/core/kernels/decode_csv_op.cc b/tensorflow/core/kernels/decode_csv_op.cc
index c4555db453ba1549601cbf9a4bbf096fc3db22b2..0c42f632521dd86760e791626c8978c0b1e82709 100644
--- a/tensorflow/core/kernels/decode_csv_op.cc
+++ b/tensorflow/core/kernels/decode_csv_op.cc
@@ -91,9 +91,9 @@ class DecodeCSVOp : public OpKernel {
             } else {
               int32 value;
               OP_REQUIRES(ctx, strings::safe_strto32(fields[f], &value),
-                          errors::InvalidArgument("Field ", f, " in record ", i,
-                                                  " is not a valid int32: ",
-                                                  fields[f]));
+                          errors::InvalidArgument(
+                              "Field ", f, " in record ", i,
+                              " is not a valid int32: ", fields[f]));
               output[f]->flat<int32>()(i) = value;
             }
             break;
@@ -111,9 +111,9 @@ class DecodeCSVOp : public OpKernel {
             } else {
               int64 value;
               OP_REQUIRES(ctx, strings::safe_strto64(fields[f], &value),
-                          errors::InvalidArgument("Field ", f, " in record ", i,
-                                                  " is not a valid int64: ",
-                                                  fields[f]));
+                          errors::InvalidArgument(
+                              "Field ", f, " in record ", i,
+                              " is not a valid int64: ", fields[f]));
               output[f]->flat<int64>()(i) = value;
             }
             break;
@@ -130,9 +130,9 @@ class DecodeCSVOp : public OpKernel {
             } else {
               float value;
               OP_REQUIRES(ctx, strings::safe_strtof(fields[f].c_str(), &value),
-                          errors::InvalidArgument("Field ", f, " in record ", i,
-                                                  " is not a valid float: ",
-                                                  fields[f]));
+                          errors::InvalidArgument(
+                              "Field ", f, " in record ", i,
+                              " is not a valid float: ", fields[f]));
               output[f]->flat<float>()(i) = value;
             }
             break;
@@ -150,9 +150,9 @@ class DecodeCSVOp : public OpKernel {
             } else {
               double value;
               OP_REQUIRES(ctx, strings::safe_strtod(fields[f].c_str(), &value),
-                          errors::InvalidArgument("Field ", f, " in record ", i,
-                                                  " is not a valid double: ",
-                                                  fields[f]));
+                          errors::InvalidArgument(
+                              "Field ", f, " in record ", i,
+                              " is not a valid double: ", fields[f]));
               output[f]->flat<double>()(i) = value;
             }
             break;
@@ -208,9 +208,10 @@ class DecodeCSVOp : public OpKernel {
         if (!quoted) {
           while (static_cast<size_t>(current_idx) < input.size() &&
                  input[current_idx] != delim_) {
-            OP_REQUIRES(ctx, (!use_quote_delim_ || input[current_idx] != '"') &&
-                                 input[current_idx] != '\n' &&
-                                 input[current_idx] != '\r',
+            OP_REQUIRES(ctx,
+                        (!use_quote_delim_ || input[current_idx] != '"') &&
+                            input[current_idx] != '\n' &&
+                            input[current_idx] != '\r',
                         errors::InvalidArgument(
                             "Unquoted fields cannot have quotes/CRLFs inside"));
             field += input[current_idx];
@@ -238,10 +239,11 @@ class DecodeCSVOp : public OpKernel {
           }
 
           OP_REQUIRES(
-              ctx, (static_cast<size_t>(current_idx) < input.size() &&
-                    input[current_idx] == '"' &&
-                    (static_cast<size_t>(current_idx) == input.size() - 1 ||
-                     input[current_idx + 1] == delim_)),
+              ctx,
+              (static_cast<size_t>(current_idx) < input.size() &&
+               input[current_idx] == '"' &&
+               (static_cast<size_t>(current_idx) == input.size() - 1 ||
+                input[current_idx + 1] == delim_)),
               errors::InvalidArgument("Quoted field has to end with quote "
                                       "followed by delim or end"));
 
diff --git a/tensorflow/core/kernels/decode_image_op.cc b/tensorflow/core/kernels/decode_image_op.cc
index 44dcbf834ce838e3b25957f88bfcded645104957..912d04c1536600348e8263f03709f2305607d11f 100644
--- a/tensorflow/core/kernels/decode_image_op.cc
+++ b/tensorflow/core/kernels/decode_image_op.cc
@@ -87,10 +87,11 @@ class DecodeImageOp : public OpKernel {
       channels_ = 3;
     } else {
       OP_REQUIRES_OK(context, context->GetAttr("channels", &channels_));
-      OP_REQUIRES(context, channels_ == 0 || channels_ == 1 || channels_ == 3 ||
-                               channels_ == 4,
-                  errors::InvalidArgument(
-                      "channels must be 0, 1, 3, or 4, got ", channels_));
+      OP_REQUIRES(
+          context,
+          channels_ == 0 || channels_ == 1 || channels_ == 3 || channels_ == 4,
+          errors::InvalidArgument("channels must be 0, 1, 3, or 4, got ",
+                                  channels_));
     }
     flags_.components = channels_;
 
@@ -114,8 +115,9 @@ class DecodeImageOp : public OpKernel {
 
     if (format_ == kJpgFormat) {
       OP_REQUIRES_OK(context, context->GetAttr("ratio", &flags_.ratio));
-      OP_REQUIRES(context, flags_.ratio == 1 || flags_.ratio == 2 ||
-                               flags_.ratio == 4 || flags_.ratio == 8,
+      OP_REQUIRES(context,
+                  flags_.ratio == 1 || flags_.ratio == 2 || flags_.ratio == 4 ||
+                      flags_.ratio == 8,
                   errors::InvalidArgument("ratio must be 1, 2, 4, or 8, got ",
                                           flags_.ratio));
       OP_REQUIRES_OK(context, context->GetAttr("fancy_upscaling",
@@ -130,8 +132,9 @@ class DecodeImageOp : public OpKernel {
       string dct_method;
       OP_REQUIRES_OK(context, context->GetAttr("dct_method", &dct_method));
       OP_REQUIRES(
-          context, (dct_method.empty() || dct_method == "INTEGER_FAST" ||
-                    dct_method == "INTEGER_ACCURATE"),
+          context,
+          (dct_method.empty() || dct_method == "INTEGER_FAST" ||
+           dct_method == "INTEGER_ACCURATE"),
           errors::InvalidArgument("dct_method must be one of "
                                   "{'', 'INTEGER_FAST', 'INTEGER_ACCURATE'}"));
       if (dct_method == "INTEGER_FAST") {
@@ -157,9 +160,9 @@ class DecodeImageOp : public OpKernel {
         errors::InvalidArgument("Expected image (JPEG, PNG, or GIF), got ",
                                 FileFormatString(magic, input)));
     OP_REQUIRES(context, input.size() <= std::numeric_limits<int>::max(),
-                errors::InvalidArgument(FileFormatString(magic, input),
-                                        " contents are too large for int: ",
-                                        input.size()));
+                errors::InvalidArgument(
+                    FileFormatString(magic, input),
+                    " contents are too large for int: ", input.size()));
     OP_REQUIRES(context, magic == kPngFormat || channel_bits_ == 8,
                 errors::InvalidArgument(FileFormatString(magic, input),
                                         " does not support uint16 output"));
@@ -212,9 +215,10 @@ class DecodeImageOp : public OpKernel {
             input.data(), input.size(), flags, nullptr /* nwarn */,
             [=, &output](int width, int height, int channels) -> uint8* {
               Status status(context->allocate_output(
-                  0, format_ == kGifFormat
-                         ? TensorShape({1, height, width, channels})
-                         : TensorShape({height, width, channels}),
+                  0,
+                  format_ == kGifFormat
+                      ? TensorShape({1, height, width, channels})
+                      : TensorShape({height, width, channels}),
                   &output));
               if (!status.ok()) {
                 VLOG(1) << status;
diff --git a/tensorflow/core/kernels/deep_conv2d.cc b/tensorflow/core/kernels/deep_conv2d.cc
index 8e9b8a7e2e7be8e55deeacd4de3f77033499387f..829155fb313bd354d28432be6212af0760630c44 100644
--- a/tensorflow/core/kernels/deep_conv2d.cc
+++ b/tensorflow/core/kernels/deep_conv2d.cc
@@ -120,9 +120,9 @@ bool CanUseDeepConv2D(int stride_rows, int stride_cols, int filter_rows,
 
   VLOG(2) << "CanUseDeepConv2D"
           << " deep_conv_cost: " << deep_conv_cost
-          << " direct_conv_cost: " << direct_conv_cost
-          << " deep_direct_ratio: " << (static_cast<float>(deep_conv_cost) /
-                                        static_cast<float>(direct_conv_cost))
+          << " direct_conv_cost: " << direct_conv_cost << " deep_direct_ratio: "
+          << (static_cast<float>(deep_conv_cost) /
+              static_cast<float>(direct_conv_cost))
           << " use_deep_conv: " << (deep_conv_cost < direct_conv_cost);
   return deep_conv_cost < direct_conv_cost;
 }
diff --git a/tensorflow/core/kernels/dense_update_ops.cc b/tensorflow/core/kernels/dense_update_ops.cc
index 6d44a92fa3c2d22ade6293d30b4f008a62eb8e0f..6497c8f3719737ede2d261decd16f01c9854a7eb 100644
--- a/tensorflow/core/kernels/dense_update_ops.cc
+++ b/tensorflow/core/kernels/dense_update_ops.cc
@@ -89,7 +89,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 #define REGISTER_KERNELS(type)                                     \
   REGISTER_KERNEL_BUILDER(                                         \
@@ -113,14 +113,14 @@ TF_CALL_GPU_ALL_TYPES(REGISTER_GPU_KERNELS);
 #endif  // GOOGLE_CUDA
 
 #ifdef TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNELS(type)                                \
-REGISTER_KERNEL_BUILDER(                                           \
-    Name("Assign").Device(DEVICE_SYCL).TypeConstraint<type>("T"),  \
-    AssignOpT<SYCLDevice, type>);
+#define REGISTER_SYCL_KERNELS(type)                                 \
+  REGISTER_KERNEL_BUILDER(                                          \
+      Name("Assign").Device(DEVICE_SYCL).TypeConstraint<type>("T"), \
+      AssignOpT<SYCLDevice, type>);
 
 TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL_KERNELS);
 #undef REGISTER_SYCL_KERNELS
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 #define REGISTER_KERNELS(type)                                        \
   REGISTER_KERNEL_BUILDER(                                            \
@@ -146,7 +146,7 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS);
 #endif  // end GOOGLE_CUDA
 
 #ifdef TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNELS(type)                                         \
+#define REGISTER_SYCL_KERNELS(type)                                    \
   REGISTER_KERNEL_BUILDER(                                             \
       Name("AssignAdd").Device(DEVICE_SYCL).TypeConstraint<type>("T"), \
       DenseUpdateOp<SYCLDevice, type, DenseUpdateType::ADD>);          \
@@ -156,5 +156,5 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS);
 
 TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL_KERNELS);
 #undef REGISTER_SYCL_KERNELS
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/depthwise_conv_grad_op.cc b/tensorflow/core/kernels/depthwise_conv_grad_op.cc
index 9347978d515b9244dde2b50b2fcfaa3c91ab9c94..91a9587174be4c047f8a21ea9222219def42d5f1 100644
--- a/tensorflow/core/kernels/depthwise_conv_grad_op.cc
+++ b/tensorflow/core/kernels/depthwise_conv_grad_op.cc
@@ -400,7 +400,7 @@ struct LaunchDepthwiseConvBackpropInputOp<CPUDevice, T> {
 
     // Computes one shard of depthwise conv2d backprop input.
     auto shard = [&ctx, &args, &out_backprop, &filter_data, &in_backprop](
-        int64 start, int64 limit) {
+                     int64 start, int64 limit) {
       static const int64 kPacketSize = (sizeof(Packet) / sizeof(T));
 
       const int64 input_image_size =
@@ -750,7 +750,7 @@ struct LaunchDepthwiseConvBackpropFilterOp<CPUDevice, T> {
 
     // Computes one shard of depthwise conv2d backprop filter.
     auto shard = [&ctx, &args, &out_backprop, &input, &output_buffer_data](
-        int64 start, int64 limit) {
+                     int64 start, int64 limit) {
       static const int64 kPacketSize = (sizeof(Packet) / sizeof(T));
       const int64 filter_spatial_size = args.filter_rows * args.filter_cols;
       const int64 padded_out_depth_size =
diff --git a/tensorflow/core/kernels/depthwise_conv_op.cc b/tensorflow/core/kernels/depthwise_conv_op.cc
index a5fd07fbe177f2206ef9b6b3252556211b9e3905..c060b2e14d2f03f990af5267260bd88fa01a2c81 100644
--- a/tensorflow/core/kernels/depthwise_conv_op.cc
+++ b/tensorflow/core/kernels/depthwise_conv_op.cc
@@ -308,10 +308,10 @@ class DepthwiseConv2dNativeOp : public BinaryOp<T> {
 
     // in_depth for input and filter must match.
     const int64 in_depth = GetTensorDim(input, data_format_, 'C');
-    OP_REQUIRES(
-        context, in_depth == filter.dim_size(2),
-        errors::InvalidArgument("input and filter must have the same depth: ",
-                                in_depth, " vs ", filter.dim_size(2)));
+    OP_REQUIRES(context, in_depth == filter.dim_size(2),
+                errors::InvalidArgument(
+                    "input and filter must have the same depth: ", in_depth,
+                    " vs ", filter.dim_size(2)));
 
     // The last dimension for filter is depth multiplier.
     const int32 depth_multiplier = filter.dim_size(3);
@@ -430,9 +430,10 @@ TF_CALL_double(REGISTER_CPU_KERNEL);
 #endif
 
 #if GOOGLE_CUDA
-REGISTER_KERNEL_BUILDER(
-    Name("DepthwiseConv2dNative").Device(DEVICE_GPU).TypeConstraint<Eigen::half>("T"),
-    DepthwiseConv2dNativeOp<GPUDevice, Eigen::half>);
+REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNative")
+                            .Device(DEVICE_GPU)
+                            .TypeConstraint<Eigen::half>("T"),
+                        DepthwiseConv2dNativeOp<GPUDevice, Eigen::half>);
 
 REGISTER_KERNEL_BUILDER(
     Name("DepthwiseConv2dNative").Device(DEVICE_GPU).TypeConstraint<float>("T"),
diff --git a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc b/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
index 903aac5d68baeb8c37b009a54863a084dcb75147..126b64f73dff81f9abded0d3041ce4ce788d1e9c 100644
--- a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
@@ -17,12 +17,12 @@ limitations under the License.
 #define EIGEN_USE_GPU
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "external/cub_archive/cub/util_ptx.cuh"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/kernels/depthwise_conv_op.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/util/cuda_kernel_helper.h"
 #include "tensorflow/core/util/tensor_format.h"
-#include "external/cub_archive/cub/util_ptx.cuh"
 
 #if !defined(_MSC_VER)
 #define UNROLL _Pragma("unroll")
@@ -34,6 +34,7 @@ limitations under the License.
 
 namespace tensorflow {
 
+typedef Eigen::GpuDevice GPUDevice;
 using Eigen::GpuDevice;
 
 // Returns whether depthwise convolution forward or backward input pass can be
@@ -1020,7 +1021,7 @@ __global__ void __launch_bounds__(640, 2)
 
 // Device function to compute sub-warp sum reduction for a power-of-two group of
 // neighboring threads.
-template<int kWidth, typename T>
+template <int kWidth, typename T>
 __device__ __forceinline__ T WarpSumReduce(T val) {
   // support only power-of-two widths.
   assert(__popc(kWidth) == 1);
@@ -1028,7 +1029,7 @@ __device__ __forceinline__ T WarpSumReduce(T val) {
   int zeros = sub_warp * kWidth;
   unsigned mask = ((1UL << kWidth) - 1) << zeros;
   for (int delta = kWidth / 2; delta > 0; delta /= 2) {
-    val += CudaShuffleXor(mask, val, delta);
+    val += CudaShuffleXorSync(mask, val, delta);
   }
   return val;
 }
@@ -1145,7 +1146,7 @@ __launch_bounds__(1024, 2) void DepthwiseConv2dBackpropFilterGPUKernelNHWCSmall(
 
     // Note: the condition to reach this is uniform across the entire block.
     __syncthreads();
-    unsigned active_threads = CudaBallot(CUDA_WARP_ALL, depth_in_range);
+    unsigned active_threads = CudaBallotSync(kCudaWarpAll, depth_in_range);
 
     if (depth_in_range) {
       const T* const out_ptr = inout_offset + output;
@@ -1159,7 +1160,7 @@ __launch_bounds__(1024, 2) void DepthwiseConv2dBackpropFilterGPUKernelNHWCSmall(
           T val = out1 * tile_ptr[0] + out2 * tile_ptr[tile_offset];
           // Warp-accumulate pixels of the same depth and write to accumulator.
           for (int delta = 16; delta >= kBlockSlices; delta /= 2) {
-            val += CudaShuffleDown(active_threads, val, delta);
+            val += CudaShuffleXorSync(active_threads, val, delta);
           }
           if (!(thread_idx & 32 - kBlockSlices) /* lane_idx < kBlockSlices */) {
             *accum_ptr = val;
@@ -1399,7 +1400,7 @@ __launch_bounds__(1024, 2) void DepthwiseConv2dBackpropFilterGPUKernelNCHWSmall(
 
     // Note: the condition to reach this is uniform across the entire block.
     __syncthreads();
-    unsigned active_threads = CudaBallot(CUDA_WARP_ALL, slice_in_range);
+    unsigned active_threads = CudaBallotSync(kCudaWarpAll, slice_in_range);
 
     if (slice_in_range) {
       const T* const out_ptr = inout_offset + output;
@@ -1413,10 +1414,10 @@ __launch_bounds__(1024, 2) void DepthwiseConv2dBackpropFilterGPUKernelNCHWSmall(
           T val = out1 * tile_ptr[0] + out2 * tile_ptr[tile_offset];
           // Warp-accumulate pixels of the same depth and write to accumulator.
           for (int delta = 16 / kBlockSlices; delta > 0; delta /= 2) {
-            val += CudaShuffleDown(active_threads, val, delta);
+            val += CudaShuffleXorSync(active_threads, val, delta);
           }
           if (!(thread_idx & 32 / kBlockSlices - 1)) {
-            *accum_ptr = val;
+            *accum_ptr = val;  // kBlockSlices threads per warp.
           }
           ++shared_offset;
           accum_ptr += accum_increment;
diff --git a/tensorflow/core/kernels/diag_op.cc b/tensorflow/core/kernels/diag_op.cc
index 86fa7dce36afff121dc6ff0642f45c809bc63a3d..d228153d4c76dedd74a4b1db1059bc25ff0a6f77 100644
--- a/tensorflow/core/kernels/diag_op.cc
+++ b/tensorflow/core/kernels/diag_op.cc
@@ -29,8 +29,8 @@ limitations under the License.
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_types.h"
-#include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/util/work_sharder.h"
 
 namespace tensorflow {
@@ -47,8 +47,9 @@ class DiagOp : public OpKernel {
   void Compute(OpKernelContext* context) override {
     const Tensor& diagonal = context->input(0);
     const int num_dims = diagonal.dims();
-    OP_REQUIRES(context, 0 != num_dims, errors::InvalidArgument(
-        "Input must be at least rank 1, got 0"));
+    OP_REQUIRES(
+        context, 0 != num_dims,
+        errors::InvalidArgument("Input must be at least rank 1, got 0"));
     TensorShape out_shape;
     for (int i = 0; i < num_dims; ++i) {
       out_shape.AddDim(diagonal.dim_size(i));
@@ -60,10 +61,9 @@ class DiagOp : public OpKernel {
     OP_REQUIRES_OK(context,
                    context->allocate_output(0, out_shape, &output_tensor));
     functor::DiagFunctor<Device, T> diagFunc;
-    Status s = diagFunc(context,
-                        diagonal.NumElements(),
-                        diagonal.flat<T>().data(),
-                        output_tensor->flat<T>().data());
+    Status s =
+        diagFunc(context, diagonal.NumElements(), diagonal.flat<T>().data(),
+                 output_tensor->flat<T>().data());
     OP_REQUIRES_OK(context, s);
   }
 };
@@ -82,12 +82,12 @@ class DiagPartOp : public OpKernel {
                 errors::InvalidArgument("The rank of the tensor should be \
                                          even and positive, got shape ",
                                         tensor.shape().DebugString()));
-    for (int i = 0; i < out_dims; i++){
-      OP_REQUIRES(context, tensor.dim_size(i) == tensor.dim_size(i + out_dims),
-                  errors::InvalidArgument(
-                    "Invalid shape ", tensor.shape().DebugString(),
-                    ": dimensions ", i, " and ", i + out_dims, " do not match.")
-                  );
+    for (int i = 0; i < out_dims; i++) {
+      OP_REQUIRES(
+          context, tensor.dim_size(i) == tensor.dim_size(i + out_dims),
+          errors::InvalidArgument("Invalid shape ",
+                                  tensor.shape().DebugString(), ": dimensions ",
+                                  i, " and ", i + out_dims, " do not match."));
     }
 
     TensorShape out_shape;
@@ -96,13 +96,10 @@ class DiagPartOp : public OpKernel {
     }
 
     Tensor* output = nullptr;
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(0, out_shape, &output));
+    OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));
     functor::DiagPartFunctor<Device, T> diagPartFunc;
-    Status s = diagPartFunc(context,
-                            out_shape.num_elements(),
-                            tensor.flat<T>().data(),
-                            output->flat<T>().data());
+    Status s = diagPartFunc(context, out_shape.num_elements(),
+                            tensor.flat<T>().data(), output->flat<T>().data());
     OP_REQUIRES_OK(context, s);
   }
 };
@@ -129,9 +126,8 @@ class DiagPartOp : public OpKernel {
 namespace functor {
 template <typename T>
 struct DiagFunctor<CPUDevice, T> {
-  EIGEN_ALWAYS_INLINE Status
-  operator() (OpKernelContext* context, const int64 size,
-              const T* in, T* out) {
+  EIGEN_ALWAYS_INLINE Status operator()(OpKernelContext* context,
+                                        const int64 size, const T* in, T* out) {
     // This subprocess is responsible for writing values in index range
     // [start*size, limit*size)
     auto subDiag = [in, out, size](int64 start, int64 limit) {
@@ -143,17 +139,16 @@ struct DiagFunctor<CPUDevice, T> {
 
     // Here, 5 is a empirical factor of cost_per_unit.
     auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads());
-    Shard(worker_threads.num_threads, worker_threads.workers, size,
-        5 * size, subDiag);
+    Shard(worker_threads.num_threads, worker_threads.workers, size, 5 * size,
+          subDiag);
     return Status::OK();
   }
 };
 
 template <typename T>
 struct DiagPartFunctor<CPUDevice, T> {
-  EIGEN_ALWAYS_INLINE Status
-  operator() (OpKernelContext* context, const int64 size,
-              const T* in, T* out) {
+  EIGEN_ALWAYS_INLINE Status operator()(OpKernelContext* context,
+                                        const int64 size, const T* in, T* out) {
     // This subprocess is responsible for extracting values in index range
     // [start, limit)
     auto subDiagPart = [in, out, size](int64 start, int64 limit) {
@@ -164,14 +159,13 @@ struct DiagPartFunctor<CPUDevice, T> {
 
     // Here, 5 is a empirical factor of cost_per_unit.
     auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads());
-    Shard(worker_threads.num_threads, worker_threads.workers, size,
-        5, subDiagPart);
+    Shard(worker_threads.num_threads, worker_threads.workers, size, 5,
+          subDiagPart);
     return Status::OK();
   }
 };
 }  // namespace functor
 
-
 // Register the CPU kernels.
 #define REGISTER_DIAGOP(T)                                    \
   REGISTER_KERNEL_BUILDER(                                    \
@@ -250,6 +244,4 @@ TF_CALL_complex128(REGISTER_DIAGPARTOP_GPU);
 
 #endif  // GOOGLE_CUDA
 
-
 }  // namespace tensorflow
-
diff --git a/tensorflow/core/kernels/diag_op.h b/tensorflow/core/kernels/diag_op.h
index c6ca6a2047455649b5197da27a58cb068476e928..baf16ddb4b987fa09de113c0316ec0014c884980 100644
--- a/tensorflow/core/kernels/diag_op.h
+++ b/tensorflow/core/kernels/diag_op.h
@@ -26,14 +26,14 @@ namespace functor {
 
 template <typename Device, typename T>
 struct DiagFunctor {
-  Status operator() (OpKernelContext* context, const int64 size,
-                     const T* in, T* out);
+  Status operator()(OpKernelContext* context, const int64 size, const T* in,
+                    T* out);
 };
 
 template <typename Device, typename T>
 struct DiagPartFunctor {
-  Status operator() (OpKernelContext* context, const int64 size,
-                     const T* in, T* out);
+  Status operator()(OpKernelContext* context, const int64 size, const T* in,
+                    T* out);
 };
 
 }  // namespace functor
diff --git a/tensorflow/core/kernels/diag_op_gpu.cu.cc b/tensorflow/core/kernels/diag_op_gpu.cu.cc
index d3c529d784e3a9ba4a793cd98cff9eb5e74d6090..910f3093b2307526e36bdfad9ac6746dd861d4fd 100644
--- a/tensorflow/core/kernels/diag_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/diag_op_gpu.cu.cc
@@ -19,8 +19,8 @@ limitations under the License.
 
 #include <complex>
 #include "tensorflow/core/framework/register_types.h"
-#include "tensorflow/core/util/cuda_kernel_helper.h"
 #include "tensorflow/core/kernels/diag_op.h"
+#include "tensorflow/core/util/cuda_kernel_helper.h"
 
 namespace tensorflow {
 namespace functor {
@@ -28,10 +28,8 @@ namespace functor {
 typedef Eigen::GpuDevice GPUDevice;
 
 template <typename T>
-__global__ void DiagCudaKernel(const int num_threads,
-                               const int64 size,
-                               const T* in,
-                               T* out) {
+__global__ void DiagCudaKernel(const int num_threads, const int64 size,
+                               const T* in, T* out) {
   CUDA_1D_KERNEL_LOOP(index, num_threads) {
     // Fill the diagonal elements or set to zero in other place.
     if (index % (1 + size) == 0) {
@@ -44,9 +42,8 @@ __global__ void DiagCudaKernel(const int num_threads,
 
 template <typename T>
 struct DiagFunctor<GPUDevice, T> {
-  EIGEN_ALWAYS_INLINE Status
-  operator() (OpKernelContext* context, const int64 size,
-              const T* in, T* out) {
+  EIGEN_ALWAYS_INLINE Status operator()(OpKernelContext* context,
+                                        const int64 size, const T* in, T* out) {
     // Empty tensor couldn't launch the kernel.
     if (size == 0) {
       return Status::OK();
@@ -56,25 +53,22 @@ struct DiagFunctor<GPUDevice, T> {
     // so this may overflow for `size*size` in extreme cases,
     // here is checking the multiplication overflow for integer.
     if (size && (int(size * size) / size) != size) {
-      return errors::Internal(
-          "DiagOp got input size too large.");
+      return errors::Internal("DiagOp got input size too large.");
     }
     int virtual_thread_count = int(size * size);
 
     // Launch the GPU kernel.
     const GPUDevice& device = context->eigen_device<GPUDevice>();
-    CudaLaunchConfig diag_config = GetCudaLaunchConfig(
-        virtual_thread_count, device);
-    DiagCudaKernel<<<diag_config.block_count,
-                     diag_config.thread_per_block,
-                     0, device.stream()>>>(
-        diag_config.virtual_thread_count, size, in, out);
+    CudaLaunchConfig diag_config =
+        GetCudaLaunchConfig(virtual_thread_count, device);
+    DiagCudaKernel<<<diag_config.block_count, diag_config.thread_per_block, 0,
+                     device.stream()>>>(diag_config.virtual_thread_count, size,
+                                        in, out);
 
     auto err = cudaGetLastError();
     if (err != cudaSuccess) {
       return errors::Internal(
-          "Could not launch DiagOp kernel: ",
-          cudaGetErrorString(err), ".");
+          "Could not launch DiagOp kernel: ", cudaGetErrorString(err), ".");
     }
     return Status::OK();
   }
@@ -87,12 +81,9 @@ template struct DiagFunctor<GPUDevice, int64>;
 template struct DiagFunctor<GPUDevice, complex64>;
 template struct DiagFunctor<GPUDevice, complex128>;
 
-
 template <typename T>
-__global__ void DiagPartCudaKernel(const int num_threads,
-                                   const int64 size,
-                                   const T* in,
-                                   T* out) {
+__global__ void DiagPartCudaKernel(const int num_threads, const int64 size,
+                                   const T* in, T* out) {
   CUDA_1D_KERNEL_LOOP(index, num_threads) {
     out[index] = in[(1 + size) * index];
   }
@@ -100,9 +91,8 @@ __global__ void DiagPartCudaKernel(const int num_threads,
 
 template <typename T>
 struct DiagPartFunctor<GPUDevice, T> {
-  EIGEN_ALWAYS_INLINE Status
-  operator() (OpKernelContext* context, const int64 size,
-              const T* in, T* out) {
+  EIGEN_ALWAYS_INLINE Status operator()(OpKernelContext* context,
+                                        const int64 size, const T* in, T* out) {
     // Empty tensor couldn't launch the kernel.
     if (size == 0) {
       return Status::OK();
@@ -111,16 +101,14 @@ struct DiagPartFunctor<GPUDevice, T> {
 
     // Extract the diagonal elements.
     CudaLaunchConfig diag_config = GetCudaLaunchConfig(size, device);
-    DiagPartCudaKernel<<<diag_config.block_count,
-                     diag_config.thread_per_block,
-                     0, device.stream()>>>(
-        diag_config.virtual_thread_count, size, in, out);
+    DiagPartCudaKernel<<<diag_config.block_count, diag_config.thread_per_block,
+                         0, device.stream()>>>(diag_config.virtual_thread_count,
+                                               size, in, out);
 
     auto err = cudaGetLastError();
     if (err != cudaSuccess) {
       return errors::Internal(
-          "Could not launch DiagPartOp kernel: ",
-          cudaGetErrorString(err), ".");
+          "Could not launch DiagPartOp kernel: ", cudaGetErrorString(err), ".");
     }
     return Status::OK();
   }
diff --git a/tensorflow/core/kernels/diag_op_test.cc b/tensorflow/core/kernels/diag_op_test.cc
index 2d1417854cc06a138a803169495196ac70e70e5d..a708e53dd016d9a004a0cd2ddcdc285b0e6ad6fd 100644
--- a/tensorflow/core/kernels/diag_op_test.cc
+++ b/tensorflow/core/kernels/diag_op_test.cc
@@ -30,8 +30,8 @@ static Graph* Diag(int n, DataType type) {
   return g;
 }
 
-#define BM_DiagDev(N, T, TFTYPE, DEVICE)                           \
-  static void BM_Diag##_##N##_##TFTYPE##_##DEVICE(int iters) {   \
+#define BM_DiagDev(N, T, TFTYPE, DEVICE)                        \
+  static void BM_Diag##_##N##_##TFTYPE##_##DEVICE(int iters) {  \
     testing::UseRealTime();                                     \
     testing::ItemsProcessed(static_cast<int64>(iters) * N * N); \
     test::Benchmark(#DEVICE, Diag<T>(N, TFTYPE)).Run(iters);    \
@@ -51,4 +51,3 @@ BM_Diag(128);
 BM_Diag(512);
 
 }  // end namespace tensorflow
-
diff --git a/tensorflow/core/kernels/dilation_ops.cc b/tensorflow/core/kernels/dilation_ops.cc
index 6f5c0e91569eb5d44069a452632ad108e5df7d0d..441a63465c8246e09a8e70535f4b95a94d7acdb3 100644
--- a/tensorflow/core/kernels/dilation_ops.cc
+++ b/tensorflow/core/kernels/dilation_ops.cc
@@ -91,10 +91,10 @@ void ParseSizes(OpKernelContext* context, const std::vector<int32>& strides,
                                       filter.shape().DebugString()));
   const int filter_rows = filter.dim_size(0);
   const int filter_cols = filter.dim_size(1);
-  OP_REQUIRES(
-      context, depth == filter.dim_size(2),
-      errors::InvalidArgument("input and filter must have the same depth: ",
-                              depth, " vs ", filter.dim_size(2)));
+  OP_REQUIRES(context, depth == filter.dim_size(2),
+              errors::InvalidArgument(
+                  "input and filter must have the same depth: ", depth, " vs ",
+                  filter.dim_size(2)));
 
   // Effective filter size, after introducing rate - 1 zeros between each
   // non-zero filter element.
@@ -234,10 +234,11 @@ class DilationBackpropInputOp : public OpKernel {
     // [ batch, out_rows, out_cols, depth ]
     const int batch = input.dim_size(0);
     const int depth = input.dim_size(3);
-    OP_REQUIRES(context, batch == out_backprop.dim_size(0) &&
-                             out_rows == out_backprop.dim_size(1) &&
-                             out_cols == out_backprop.dim_size(2) &&
-                             depth == out_backprop.dim_size(3),
+    OP_REQUIRES(context,
+                batch == out_backprop.dim_size(0) &&
+                    out_rows == out_backprop.dim_size(1) &&
+                    out_cols == out_backprop.dim_size(2) &&
+                    depth == out_backprop.dim_size(3),
                 errors::InvalidArgument("out_backprop has incompatible size."));
 
     // The computed in_backprop has the same dimensions as the input:
@@ -353,10 +354,11 @@ class DilationBackpropFilterOp : public OpKernel {
     // [ batch, out_rows, out_cols, depth ]
     const int batch = input.dim_size(0);
     const int depth = input.dim_size(3);
-    OP_REQUIRES(context, batch == out_backprop.dim_size(0) &&
-                             out_rows == out_backprop.dim_size(1) &&
-                             out_cols == out_backprop.dim_size(2) &&
-                             depth == out_backprop.dim_size(3),
+    OP_REQUIRES(context,
+                batch == out_backprop.dim_size(0) &&
+                    out_rows == out_backprop.dim_size(1) &&
+                    out_cols == out_backprop.dim_size(2) &&
+                    depth == out_backprop.dim_size(3),
                 errors::InvalidArgument("out_backprop has incompatible size."));
 
     // The computed filter_backprop has the same dimensions as the filter:
diff --git a/tensorflow/core/kernels/dilation_ops_gpu.cu.cc b/tensorflow/core/kernels/dilation_ops_gpu.cu.cc
index ac0775fbefe601e53aaa6c67529cf9a67a0562c2..c63806a7f68c6981dd0e83373c6bfd598788e338 100644
--- a/tensorflow/core/kernels/dilation_ops_gpu.cu.cc
+++ b/tensorflow/core/kernels/dilation_ops_gpu.cu.cc
@@ -61,9 +61,8 @@ __global__ void DilationKernel(const int32 nthreads, const T* input_ptr,
           const int w_in = w_beg + w * rate_cols;
           if (w_in >= 0 && w_in < input_cols) {
             const T val =
-                input_ptr[d +
-                          depth *
-                              (w_in + input_cols * (h_in + input_rows * b))] +
+                input_ptr[d + depth * (w_in +
+                                       input_cols * (h_in + input_rows * b))] +
                 filter_ptr[d + depth * (w + filter_cols * h)];
             if (val > cur_val) {
               cur_val = val;
@@ -106,9 +105,8 @@ __global__ void DilationBackpropInputKernel(
           const int w_in = w_beg + w * rate_cols;
           if (w_in >= 0 && w_in < input_cols) {
             const T val =
-                input_ptr[d +
-                          depth *
-                              (w_in + input_cols * (h_in + input_rows * b))] +
+                input_ptr[d + depth * (w_in +
+                                       input_cols * (h_in + input_rows * b))] +
                 filter_ptr[d + depth * (w + filter_cols * h)];
             if (val > cur_val) {
               cur_val = val;
@@ -156,9 +154,8 @@ __global__ void DilationBackpropFilterKernel(
           const int w_in = w_beg + w * rate_cols;
           if (w_in >= 0 && w_in < input_cols) {
             const T val =
-                input_ptr[d +
-                          depth *
-                              (w_in + input_cols * (h_in + input_rows * b))] +
+                input_ptr[d + depth * (w_in +
+                                       input_cols * (h_in + input_rows * b))] +
                 filter_ptr[d + depth * (w + filter_cols * h)];
             if (val > cur_val) {
               cur_val = val;
diff --git a/tensorflow/core/kernels/draw_bounding_box_op.cc b/tensorflow/core/kernels/draw_bounding_box_op.cc
index a8818b7385d9d5253588ec40f425b85180c79006..b5d5b880bbbacab07c51fc395b86b4fbbb343d36 100644
--- a/tensorflow/core/kernels/draw_bounding_box_op.cc
+++ b/tensorflow/core/kernels/draw_bounding_box_op.cc
@@ -29,8 +29,7 @@ template <class T>
 class DrawBoundingBoxesOp : public OpKernel {
  public:
   explicit DrawBoundingBoxesOp(OpKernelConstruction* context)
-      : OpKernel(context) {
-  }
+      : OpKernel(context) {}
 
   void Compute(OpKernelContext* context) override {
     const Tensor& images = context->input(0);
@@ -94,35 +93,28 @@ class DrawBoundingBoxesOp : public OpKernel {
         int64 color_index = bb % color_table_length;
         const int64 min_box_row =
             static_cast<float>(tboxes(b, bb, 0)) * (height - 1);
-        const int64 min_box_row_clamp =
-            std::max<int64>(min_box_row, 0);
+        const int64 min_box_row_clamp = std::max<int64>(min_box_row, 0);
         const int64 max_box_row =
             static_cast<float>(tboxes(b, bb, 2)) * (height - 1);
         const int64 max_box_row_clamp =
             std::min<int64>(max_box_row, height - 1);
         const int64 min_box_col =
             static_cast<float>(tboxes(b, bb, 1)) * (width - 1);
-        const int64 min_box_col_clamp =
-            std::max<int64>(min_box_col, 0);
+        const int64 min_box_col_clamp = std::max<int64>(min_box_col, 0);
         const int64 max_box_col =
             static_cast<float>(tboxes(b, bb, 3)) * (width - 1);
-        const int64 max_box_col_clamp =
-            std::min<int64>(max_box_col, width - 1);
+        const int64 max_box_col_clamp = std::min<int64>(max_box_col, width - 1);
 
         if (min_box_row > max_box_row || min_box_col > max_box_col) {
-          LOG(WARNING) << "Bounding box (" << min_box_row
-                       << "," << min_box_col
-                       << "," << max_box_row
-                       << "," << max_box_col
+          LOG(WARNING) << "Bounding box (" << min_box_row << "," << min_box_col
+                       << "," << max_box_row << "," << max_box_col
                        << ") is inverted and will not be drawn.";
           continue;
         }
-        if (min_box_row >= height || max_box_row < 0 ||
-            min_box_col >= width || max_box_col < 0) {
-          LOG(WARNING) << "Bounding box (" << min_box_row
-                       << "," << min_box_col
-                       << "," << max_box_row
-                       << "," << max_box_col
+        if (min_box_row >= height || max_box_row < 0 || min_box_col >= width ||
+            max_box_col < 0) {
+          LOG(WARNING) << "Bounding box (" << min_box_row << "," << min_box_col
+                       << "," << max_box_row << "," << max_box_col
                        << ") is completely outside the image"
                        << " and will not be drawn.";
           continue;
diff --git a/tensorflow/core/kernels/dynamic_partition_op.cc b/tensorflow/core/kernels/dynamic_partition_op.cc
index 861e16b2fd02001e913f548a5b48ca6b7497a8f2..3c988db5e618b976b5b2d45a9bfc386485249826 100644
--- a/tensorflow/core/kernels/dynamic_partition_op.cc
+++ b/tensorflow/core/kernels/dynamic_partition_op.cc
@@ -103,7 +103,8 @@ class DynamicPartitionOp : public DynamicPartitionOp_Shared {
       // Walk through data and copy the data to the appropriate output tensor
       const auto data_flat = data->flat<T>();
       std::vector<Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor>,
-                                   Eigen::Aligned> > out_vec;
+                                   Eigen::Aligned> >
+          out_vec;
       out_vec.reserve(num_partitions_);
       for (int p = 0; p < num_partitions_; p++) {
         out_vec.push_back(outputs[p]->vec<T>());
@@ -124,7 +125,8 @@ class DynamicPartitionOp : public DynamicPartitionOp_Shared {
     } else {
       // If data has extra dimensions, use Eigen slices
       std::vector<Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor>,
-                                   Eigen::Aligned> > out_flat;
+                                   Eigen::Aligned> >
+          out_flat;
       out_flat.reserve(num_partitions_);
       for (int p = 0; p < num_partitions_; p++) {
         out_flat.push_back(outputs[p]->flat_outer_dims<T>());
diff --git a/tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc b/tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc
index 9bb58b13f382970c60b551f448243a2b75e30df3..9dfeccff0e8d2488fec5a1dc7b93f83d2cfedca5 100644
--- a/tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc
@@ -79,9 +79,9 @@ template <typename T>
 void RangeInit(const GPUDevice& d, const T start, const T delta,
                const int32 size, typename TTypes<T>::Flat out) {
   CudaLaunchConfig config = GetCudaLaunchConfig(size, d);
-  RangeInitKernel<
-      T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-      start, delta, size, out.data());
+  RangeInitKernel<T>
+      <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+          start, delta, size, out.data());
 }
 
 // Given *num_runs pairs (key, value), this function moves the value
@@ -103,11 +103,10 @@ void CallGatherKernel(const GPUDevice& d, const T* params, const int32* indices,
                       T* out, int64 gather_dim_size, int64 indices_size,
                       int64 slice_size, int64 out_size) {
   CudaLaunchConfig config = GetCudaLaunchConfig(out_size, d);
-  GatherOpKernel<
-      T, int32,
-      true><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-      params, indices, out, gather_dim_size, indices_size, slice_size,
-      out_size);
+  GatherOpKernel<T, int32, true>
+      <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+          params, indices, out, gather_dim_size, indices_size, slice_size,
+          out_size);
 }
 
 struct IdentityOp {
@@ -231,10 +230,10 @@ class DynamicPartitionOpGPU : public AsyncOpKernel {
 
     OP_REQUIRES_ASYNC(
         c, TensorShapeUtils::StartsWith(data.shape(), partitions.shape()),
-        errors::InvalidArgument("data.shape must start with partitions.shape, ",
-                                "got data.shape = ", data.shape().DebugString(),
-                                ", partitions.shape = ",
-                                partitions.shape().DebugString()),
+        errors::InvalidArgument(
+            "data.shape must start with partitions.shape, ",
+            "got data.shape = ", data.shape().DebugString(),
+            ", partitions.shape = ", partitions.shape().DebugString()),
         done);
 
     Tensor partition_count;
@@ -245,8 +244,9 @@ class DynamicPartitionOpGPU : public AsyncOpKernel {
       AllocatorAttributes alloc_attr;
       alloc_attr.set_on_host(true);
       OP_REQUIRES_OK_ASYNC(
-          c, c->allocate_temp(DT_INT32, TensorShape({num_partitions_}),
-                              &partition_count, alloc_attr),
+          c,
+          c->allocate_temp(DT_INT32, TensorShape({num_partitions_}),
+                           &partition_count, alloc_attr),
           done);
       auto e_part_count = partition_count.flat<int32>();
       for (int i = 0; i < num_partitions_; i++) e_part_count(i) = 0;
@@ -259,8 +259,9 @@ class DynamicPartitionOpGPU : public AsyncOpKernel {
 
     // Prepare for counting.
     OP_REQUIRES_OK_ASYNC(
-        c, c->allocate_temp(DT_INT32, TensorShape({num_partitions_}),
-                            &partition_count),
+        c,
+        c->allocate_temp(DT_INT32, TensorShape({num_partitions_}),
+                         &partition_count),
         done);
     Tensor indices_out;
     // Count how many times each partition index occurs.
@@ -280,8 +281,9 @@ class DynamicPartitionOpGPU : public AsyncOpKernel {
     alloc_attr.set_on_host(true);
     alloc_attr.set_gpu_compatible(true);
     OP_REQUIRES_OK_ASYNC(
-        c, c->allocate_temp(partition_count.dtype(), partition_count.shape(),
-                            &cpu_tensor, alloc_attr),
+        c,
+        c->allocate_temp(partition_count.dtype(), partition_count.shape(),
+                         &cpu_tensor, alloc_attr),
         done);
     perftools::gputools::DeviceMemoryBase wrapped(
         partition_count.flat<int32>().data(), num_partitions_ * sizeof(int32));
@@ -340,9 +342,10 @@ class DynamicPartitionOpGPU : public AsyncOpKernel {
         indices_in_ptr, indices_out_ptr, N, 0, sizeof(int32) * 8, cu_stream);
     // Allocate temporary storage.
     OP_REQUIRES_OK_ASYNC(
-        c, c->allocate_temp(
-               DT_INT8, TensorShape({static_cast<int64>(temp_storage_bytes)}),
-               &cub_temp_storage),
+        c,
+        c->allocate_temp(DT_INT8,
+                         TensorShape({static_cast<int64>(temp_storage_bytes)}),
+                         &cub_temp_storage),
         done);
     // Radix-sort the partition information.
     cub::DeviceRadixSort::SortPairs(
@@ -376,8 +379,9 @@ class DynamicPartitionOpGPU : public AsyncOpKernel {
     zero_functor(device, partition_count->flat<int32>());
     // Allocate memory for aggregates_out.
     OP_REQUIRES_OK_ASYNC(
-        c, c->allocate_temp(DT_INT32, TensorShape({num_partitions_}),
-                            &aggregates_out),
+        c,
+        c->allocate_temp(DT_INT32, TensorShape({num_partitions_}),
+                         &aggregates_out),
         done);
     // Obtain the pointers to inner buffers.
     int32* keys_in_ptr = partitions_out.flat<int32>().data();
@@ -408,9 +412,10 @@ class DynamicPartitionOpGPU : public AsyncOpKernel {
                                    num_runs_ptr, reduction_op, N, cu_stream);
     // Allocate temporary storage.
     OP_REQUIRES_OK_ASYNC(
-        c, c->allocate_temp(
-               DT_INT8, TensorShape({static_cast<int64>(temp_storage_bytes)}),
-               &cub_temp_storage),
+        c,
+        c->allocate_temp(DT_INT8,
+                         TensorShape({static_cast<int64>(temp_storage_bytes)}),
+                         &cub_temp_storage),
         done);
     // Run reduce-by-key. The effect is that we count how many times
     // each index appears in partitions. The distinct indices are stored
diff --git a/tensorflow/core/kernels/eigen_activations.h b/tensorflow/core/kernels/eigen_activations.h
index 99b4b2abe66d9f372f99af1ef6164774e7ebfabc..302033e47c59db2d87483a8e2f1e70d0572b21f9 100644
--- a/tensorflow/core/kernels/eigen_activations.h
+++ b/tensorflow/core/kernels/eigen_activations.h
@@ -21,13 +21,13 @@ limitations under the License.
 namespace Eigen {
 
 /** scalar_sigmoid_fast_derivative_op
-  * \ingroup CXX11_NeuralNetworks_Module
-  * \brief Template functor to compute the fast derivative of a sigmoid
-  *
-  * Input should be the backpropagated gradient.
-  *
-  * \sa class CwiseUnaryOp, Cwise::sigmoid_fast_derivative()
-  */
+ * \ingroup CXX11_NeuralNetworks_Module
+ * \brief Template functor to compute the fast derivative of a sigmoid
+ *
+ * Input should be the backpropagated gradient.
+ *
+ * \sa class CwiseUnaryOp, Cwise::sigmoid_fast_derivative()
+ */
 template <typename T>
 struct scalar_sigmoid_fast_derivative_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_sigmoid_fast_derivative_op)
@@ -55,13 +55,13 @@ struct functor_traits<scalar_sigmoid_fast_derivative_op<T> > {
 }  // namespace internal
 
 /** scalar_tanh_fast_derivative_op
-  * \ingroup CXX11_NeuralNetworks_Module
-  * \brief Template functor to compute the fast derivative of a tanh
-  *
-  * Input should be the backpropagated gradient.
-  *
-  * \sa class CwiseUnaryOp, Cwise::tanh_fast_derivative()
-  */
+ * \ingroup CXX11_NeuralNetworks_Module
+ * \brief Template functor to compute the fast derivative of a tanh
+ *
+ * Input should be the backpropagated gradient.
+ *
+ * \sa class CwiseUnaryOp, Cwise::tanh_fast_derivative()
+ */
 template <typename T>
 struct scalar_tanh_fast_derivative_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_tanh_fast_derivative_op)
@@ -89,11 +89,11 @@ struct functor_traits<scalar_tanh_fast_derivative_op<T> > {
 }  // namespace internal
 
 /**
-  * \ingroup CXX11_NeuralNetworks_Module
-  * \brief Template functor to clip the magnitude of the first scalar.
-  *
-  * \sa class CwiseBinaryOp, MatrixBase::Clip
-  */
+ * \ingroup CXX11_NeuralNetworks_Module
+ * \brief Template functor to clip the magnitude of the first scalar.
+ *
+ * \sa class CwiseBinaryOp, MatrixBase::Clip
+ */
 template <typename Scalar>
 struct scalar_clip_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_clip_op)
diff --git a/tensorflow/core/kernels/eigen_activations_test.cc b/tensorflow/core/kernels/eigen_activations_test.cc
index 907233103d8244749410c3198f0ca92ad44769b8..34952f5abb8526f0317ba8a674948fada4dc0ce7 100644
--- a/tensorflow/core/kernels/eigen_activations_test.cc
+++ b/tensorflow/core/kernels/eigen_activations_test.cc
@@ -23,7 +23,7 @@ namespace {
 void EigenApprox(float a, float b) {
   ASSERT_TRUE(std::abs(a - b) <= std::min(std::abs(a), std::abs(b)) * 1e-3);
 }
-}
+}  // namespace
 
 TEST(EigenBackwardSpatialConvolutionsTest, SigmoidFastDerivative) {
   const ptrdiff_t depth = 3;
diff --git a/tensorflow/core/kernels/eigen_attention.h b/tensorflow/core/kernels/eigen_attention.h
index 3a94b8c9933ddbf262552044c73206e1deb9828d..4d86f9deb9902a64764e29ca0371bb68ad4f3370 100644
--- a/tensorflow/core/kernels/eigen_attention.h
+++ b/tensorflow/core/kernels/eigen_attention.h
@@ -21,35 +21,47 @@ limitations under the License.
 namespace Eigen {
 
 /** ExtractGlimpses
-  * \ingroup CXX11_NeuralNetworks_Module
-  *
-  * \brief Extract glimpses from an input tensor.
-  *
-  * The input parameter is expected to be a col-major tensor with a rank of 4 (depth, x, y, and batch).
-  * The width and height parameters specify the extension of the returned glimpses.
-  * The offsets parameter specifies the x, y locations of the center of the glimpses relative to the center of the input image. The vector is expected to contain one IndexPair for each image in the batch dimension.
-  * The normalized boolean indicates if incoming coordinates are normalized so that 0.0 and 1.0 correspond to the minimum and maximum of each height and width dimension.
-  * The centered boolean indicates if incoming coordinates are centered relative to the image, in which case -1.0 and 1.0 correspond to minimum and maximum of each dimension while 0.0 corresponds to the center.
-  *
-  * The result can be assigned to a tensor of rank equal to that of the input. The result will be laid out in col-major order (depth, x, y, batch).
-  * The dimensions of the result will be equal to the dimensions of the input except for width and height which will be equal to the requested glimpse size.
-  */
+ * \ingroup CXX11_NeuralNetworks_Module
+ *
+ * \brief Extract glimpses from an input tensor.
+ *
+ * The input parameter is expected to be a col-major tensor with a rank of 4
+ * (depth, x, y, and batch). The width and height parameters specify the
+ * extension of the returned glimpses. The offsets parameter specifies the x, y
+ * locations of the center of the glimpses relative to the center of the input
+ * image. The vector is expected to contain one IndexPair for each image in the
+ * batch dimension. The normalized boolean indicates if incoming coordinates are
+ * normalized so that 0.0 and 1.0 correspond to the minimum and maximum of each
+ * height and width dimension. The centered boolean indicates if incoming
+ * coordinates are centered relative to the image, in which case -1.0 and 1.0
+ * correspond to minimum and maximum of each dimension while 0.0 corresponds to
+ * the center.
+ *
+ * The result can be assigned to a tensor of rank equal to that of the input.
+ * The result will be laid out in col-major order (depth, x, y, batch). The
+ * dimensions of the result will be equal to the dimensions of the input except
+ * for width and height which will be equal to the requested glimpse size.
+ */
 namespace {
 template <typename Index>
 struct GlimpseExtractionOp {
   GlimpseExtractionOp(const Index width, const Index height,
                       const std::vector<IndexPair<float> >& offsets,
-                      const bool normalized,
-                      const bool centered,
-                      const bool uniform_noise) :
-      width_(width), height_(height), offsets_(offsets),
-      normalized_(normalized), centered_(centered), uniform_noise_(uniform_noise) { }
+                      const bool normalized, const bool centered,
+                      const bool uniform_noise)
+      : width_(width),
+        height_(height),
+        offsets_(offsets),
+        normalized_(normalized),
+        centered_(centered),
+        uniform_noise_(uniform_noise) {}
 
   template <typename Input>
   DSizes<Index, 4> dimensions(const Input& input) const {
     typedef typename internal::traits<Input>::Index IndexType;
     typedef TensorRef<Tensor<typename internal::traits<Input>::Scalar, 4,
-                             internal::traits<Input>::Layout, IndexType> > Ref;
+                             internal::traits<Input>::Layout, IndexType> >
+        Ref;
     Ref in(input);
 
     DSizes<Index, 4> dims = in.dimensions();
@@ -62,12 +74,12 @@ struct GlimpseExtractionOp {
   }
 
   template <typename Input, typename Output, typename Device>
-  EIGEN_DEVICE_FUNC
-  void eval(const Input& input, Output& output, const Device& device) const
-  {
+  EIGEN_DEVICE_FUNC void eval(const Input& input, Output& output,
+                              const Device& device) const {
     typedef typename internal::traits<Input>::Index IndexType;
     typedef TensorRef<Tensor<typename internal::traits<Input>::Scalar, 4,
-                             internal::traits<Input>::Layout, IndexType> > Ref;
+                             internal::traits<Input>::Layout, IndexType> >
+        Ref;
     Ref in(input);
     const Index num_channels = in.dimension(0);
     const Index input_width = in.dimension(1);
@@ -97,8 +109,8 @@ struct GlimpseExtractionOp {
       x -= width_ / 2.0f;
       y -= height_ / 2.0f;
 
-      const Index offset_x = (Index) x;
-      const Index offset_y = (Index) y;
+      const Index offset_x = (Index)x;
+      const Index offset_y = (Index)y;
       Index glimpse_width = width_;
       Index glimpse_height = height_;
       bool partial_overlap = false;
@@ -135,7 +147,7 @@ struct GlimpseExtractionOp {
         if (uniform_noise_) {
           // Initialize the glimpse with uniform noise.
           typedef typename internal::remove_const<
-            typename internal::traits<Input>::Scalar>::type Scalar;
+              typename internal::traits<Input>::Scalar>::type Scalar;
           TensorFixedSize<Scalar, Sizes<> > mini;
           mini.device(device) = input.template chip<3>(i).minimum();
           TensorFixedSize<float, Sizes<> > range;
@@ -215,21 +227,22 @@ struct GlimpseExtractionOp {
   const bool centered_;
   const bool uniform_noise_;
 };
-}
-
+}  // namespace
 
 template <typename Input>
-EIGEN_ALWAYS_INLINE
-static const TensorCustomUnaryOp<const GlimpseExtractionOp<typename internal::traits<Input>::Index>, const Input>
+EIGEN_ALWAYS_INLINE static const TensorCustomUnaryOp<
+    const GlimpseExtractionOp<typename internal::traits<Input>::Index>,
+    const Input>
 ExtractGlimpses(const Input& input,
                 const typename internal::traits<Input>::Index width,
                 const typename internal::traits<Input>::Index height,
                 const std::vector<IndexPair<float> >& offsets,
                 const bool normalized = true, const bool centered = true,
-                const bool uniform_noise = true)
-{
-  EIGEN_STATIC_ASSERT(internal::traits<Input>::Layout == ColMajor, YOU_MADE_A_PROGRAMMING_MISTAKE);
-  EIGEN_STATIC_ASSERT(internal::traits<Input>::NumDimensions == 4, YOU_MADE_A_PROGRAMMING_MISTAKE);
+                const bool uniform_noise = true) {
+  EIGEN_STATIC_ASSERT(internal::traits<Input>::Layout == ColMajor,
+                      YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT(internal::traits<Input>::NumDimensions == 4,
+                      YOU_MADE_A_PROGRAMMING_MISTAKE);
 
   typedef typename internal::traits<Input>::Index Index;
   const GlimpseExtractionOp<Index> op(width, height, offsets, normalized,
@@ -237,6 +250,6 @@ ExtractGlimpses(const Input& input,
   return input.customOp(op);
 }
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
 #endif  // TENSORFLOW_CORE_KERNELS_EIGEN_ATTENTION_H_
diff --git a/tensorflow/core/kernels/eigen_attention_test.cc b/tensorflow/core/kernels/eigen_attention_test.cc
index 3a2eeb05959e8844903eb3b910a893760bb02e74..08f61877182cce36316752b7dd17dee3bd2efaac 100644
--- a/tensorflow/core/kernels/eigen_attention_test.cc
+++ b/tensorflow/core/kernels/eigen_attention_test.cc
@@ -23,7 +23,7 @@ namespace {
 void EigenApprox(float a, float b) {
   ASSERT_TRUE(std::abs(a - b) <= std::min(std::abs(a), std::abs(b)) * 1e-3);
 }
-}
+}  // namespace
 
 TEST(EigenAttentionTest, Simple) {
   const ptrdiff_t depth = 3;
diff --git a/tensorflow/core/kernels/eigen_backward_spatial_convolutions.h b/tensorflow/core/kernels/eigen_backward_spatial_convolutions.h
index aec76978102ed4d5e8d0cca18f1ae4422acc1515..099696105b61c19b7fcc9694fe1d7a3021cb97dc 100644
--- a/tensorflow/core/kernels/eigen_backward_spatial_convolutions.h
+++ b/tensorflow/core/kernels/eigen_backward_spatial_convolutions.h
@@ -21,29 +21,29 @@ limitations under the License.
 namespace Eigen {
 
 /** SpatialConvolutionBackwardInput
-  * \ingroup CXX11_NeuralNetworks_Module
-  *
-  * \brief Computes the backprop for the input of a 2D convolution.
-  *
-  * The output_backward parameter is expected to be a tensor with a rank of 3 or
+ * \ingroup CXX11_NeuralNetworks_Module
+ *
+ * \brief Computes the backprop for the input of a 2D convolution.
+ *
+ * The output_backward parameter is expected to be a tensor with a rank of 3 or
  * more (channels, height, width, and optionally others)
-  * The kernel parameter is expected to be a 4D tensor (filters, channels,
+ * The kernel parameter is expected to be a 4D tensor (filters, channels,
  * kernel_height, kernel_width)
-  * The output_backward and the kernel must both be in col-major layout. The
+ * The output_backward and the kernel must both be in col-major layout. The
  * result will also be in col-major layout.
-  *
-  * If row_in_stride, col_in_stride > 1, then applies convolution with holes
+ *
+ * If row_in_stride, col_in_stride > 1, then applies convolution with holes
  * (aka atrous convolution), sampling every row_in_stride, col_in_stride input
  * pixels.
-  *
-  * The result can be assigned to a tensor of rank equal to the rank of the
+ *
+ * The result can be assigned to a tensor of rank equal to the rank of the
  * output_backward. The dimensions of the result will be filters, height, width
  * (and others if applicable).
-  *
-  * It is possible to swap the order of the width and height dimensions provided
+ *
+ * It is possible to swap the order of the width and height dimensions provided
  * that the same order is used in the input, the kernel, and the output.
-  *
-  */
+ *
+ */
 #ifdef EIGEN_HAS_INDEX_LIST
 typedef IndexList<type2index<0>, type2index<0>, type2index<1>, type2index<1> >
     ReverseColMajor;
@@ -293,29 +293,29 @@ SpatialConvolutionBackwardInput(
 }
 
 /** SpatialConvolutionBackwardKernel
-  * \ingroup CXX11_NeuralNetworks_Module
-  *
-  * \brief Computes the backprop for the filter of a 2D convolution.
-  *
-  * The output_backward parameter is expected to be a tensor with a rank of 3 or
+ * \ingroup CXX11_NeuralNetworks_Module
+ *
+ * \brief Computes the backprop for the filter of a 2D convolution.
+ *
+ * The output_backward parameter is expected to be a tensor with a rank of 3 or
  * more (channels, height, width, and optionally others)
-  * The kernel parameter is expected to be a 4D tensor (filters, channels,
+ * The kernel parameter is expected to be a 4D tensor (filters, channels,
  * kernel_height, kernel_width)
-  * The output_backward and the kernel must both be in col-major layout. The
+ * The output_backward and the kernel must both be in col-major layout. The
  * result will also be in col-major layout.
-  *
-  * If row_in_stride, col_stride > 1, then applies convolution with holes (aka
+ *
+ * If row_in_stride, col_stride > 1, then applies convolution with holes (aka
  * atrous convolution), sampling every row_in_stride, col_in_stride input
  * pixels.
-  *
-  * The result can be assigned to a tensor of rank equal to the rank of the
+ *
+ * The result can be assigned to a tensor of rank equal to the rank of the
  * output_backward. The dimensions of the result will be filters, height, width
  * (and others if applicable).
-  *
-  * It is possible to swap the order of the width and height dimensions provided
+ *
+ * It is possible to swap the order of the width and height dimensions provided
  * that the same order is used in the input, the kernel, and the output.
-  *
-  */
+ *
+ */
 
 template <typename OutputBackward, typename Input>
 EIGEN_ALWAYS_INLINE static const typename internal::conditional<
diff --git a/tensorflow/core/kernels/eigen_backward_spatial_convolutions_test.cc b/tensorflow/core/kernels/eigen_backward_spatial_convolutions_test.cc
index 1758067829e5b577477c1d86f9cdb4396b46e047..2229ec9659472daee3158c593252907f288d829f 100644
--- a/tensorflow/core/kernels/eigen_backward_spatial_convolutions_test.cc
+++ b/tensorflow/core/kernels/eigen_backward_spatial_convolutions_test.cc
@@ -25,7 +25,7 @@ void EigenApprox(float a, float b) {
   ASSERT_TRUE(std::abs(a - b) <= std::min(std::abs(a), std::abs(b)) * 1e-3);
 }
 static int ceil_div(int a, int b) { return (a + b - 1) / b; }
-}
+}  // namespace
 
 TEST(EigenBackwardSpatialConvolutionsTest,
      test_simple_spatial_convolution_backward_input_valid) {
diff --git a/tensorflow/core/kernels/eigen_pooling.h b/tensorflow/core/kernels/eigen_pooling.h
index 972036833fff6753031e97216d524a014bb81cbb..896c9957616037da4ead2dbda8cb2393eaea226f 100644
--- a/tensorflow/core/kernels/eigen_pooling.h
+++ b/tensorflow/core/kernels/eigen_pooling.h
@@ -309,10 +309,10 @@ struct AvgPoolMeanReducer {
   _mm512_castsi512_ps( \
       _mm512_maskz_set1_epi32(_mm512_cmp_ps_mask(a, b, _CMP_EQ_UQ), -1))
 
-// The ternarylogic function immediate determines the values in the result
-// In the case below, 0xd8 implies (false_mask) ? (b) : (a)
-// For details, refer to the vpternlogd instruction table at
-// http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-vol-2c-manual.pdf
+  // The ternarylogic function immediate determines the values in the result
+  // In the case below, 0xd8 implies (false_mask) ? (b) : (a)
+  // For details, refer to the vpternlogd instruction table at
+  // http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-vol-2c-manual.pdf
 
 #define psel(a, b, false_mask)                        \
   _mm512_castsi512_ps(_mm512_ternarylogic_epi32(      \
diff --git a/tensorflow/core/kernels/eigen_pooling_test.cc b/tensorflow/core/kernels/eigen_pooling_test.cc
index 9383972b9fff39deb130d5cecac6f0c7abec5566..47b6665e680268793df18d50395d0b6c6aca0ad0 100644
--- a/tensorflow/core/kernels/eigen_pooling_test.cc
+++ b/tensorflow/core/kernels/eigen_pooling_test.cc
@@ -23,7 +23,7 @@ namespace {
 void EigenApprox(float a, float b) {
   ASSERT_TRUE(std::abs(a - b) <= std::min(std::abs(a), std::abs(b)) * 1e-3);
 }
-}
+}  // namespace
 
 TEST(EigenPoolingTest, Simple) {
   const int depth = 10;
diff --git a/tensorflow/core/kernels/eigen_softmax.h b/tensorflow/core/kernels/eigen_softmax.h
index a2930a726f908ac4862a47104e379e6d30e88477..12148c54b364bbc5ef1dff9b9645303534e7ea12 100644
--- a/tensorflow/core/kernels/eigen_softmax.h
+++ b/tensorflow/core/kernels/eigen_softmax.h
@@ -21,19 +21,21 @@ limitations under the License.
 namespace Eigen {
 
 /** SoftMax
-  * \ingroup CXX11_NeuralNetworks_Module
-  *
-  * \brief Applies a softmax
-  *
-  * The input parameter is expected to be a col-major tensor with a rank of 2 (depth and other).
-  *
-  * The result can be assigned to a tensor of rank and dimensions equal to that of the input. The result will be laid out in col-major order.
-  *
-*/
+ * \ingroup CXX11_NeuralNetworks_Module
+ *
+ * \brief Applies a softmax
+ *
+ * The input parameter is expected to be a col-major tensor with a rank of 2
+ * (depth and other).
+ *
+ * The result can be assigned to a tensor of rank and dimensions equal to that
+ * of the input. The result will be laid out in col-major order.
+ *
+ */
 
 namespace {
 struct SoftmaxOp {
-  SoftmaxOp(const float beta) : beta_(beta) { }
+  SoftmaxOp(const float beta) : beta_(beta) {}
 
   template <typename Input>
   typename Input::Dimensions dimensions(const Input& input) const {
@@ -41,8 +43,7 @@ struct SoftmaxOp {
   }
 
   template <typename Input, typename Output, typename Device>
-  void eval(const Input& input, Output& output, const Device& device) const
-  {
+  void eval(const Input& input, Output& output, const Device& device) const {
 #if !defined(EIGEN_HAS_INDEX_LIST)
     // nvcc doesn't support cxx11
     Eigen::array<typename internal::traits<Input>::Index, 1> depth_dim;
@@ -56,35 +57,43 @@ struct SoftmaxOp {
 #else
     // Take advantage of cxx11 to give the compiler information it can use to
     // optimize the code.
-    Eigen::IndexList<Eigen::type2index<0>> depth_dim;
-    Eigen::IndexList<int, Eigen::type2index<1>> bcast;
+    Eigen::IndexList<Eigen::type2index<0> > depth_dim;
+    Eigen::IndexList<int, Eigen::type2index<1> > bcast;
     bcast.set(0, dimensions(input)[0]);
-    Eigen::IndexList<Eigen::type2index<1>, typename internal::traits<Input>::Index> dims2d;
+    Eigen::IndexList<Eigen::type2index<1>,
+                     typename internal::traits<Input>::Index>
+        dims2d;
     dims2d.set(1, dimensions(input)[1]);
 #endif
 
-    output.device(device) = ((input - input.maximum(depth_dim).eval().reshape(dims2d).broadcast(bcast)) * beta_).exp();
-    output.device(device) = output / (output.sum(depth_dim).eval().reshape(dims2d).broadcast(bcast));
+    output.device(device) =
+        ((input -
+          input.maximum(depth_dim).eval().reshape(dims2d).broadcast(bcast)) *
+         beta_)
+            .exp();
+    output.device(device) =
+        output /
+        (output.sum(depth_dim).eval().reshape(dims2d).broadcast(bcast));
   }
 
  private:
   const float beta_;
 };
-}
-
+}  // namespace
 
 template <typename Input>
-EIGEN_ALWAYS_INLINE
-static const TensorCustomUnaryOp<const SoftmaxOp, const Input>
-SoftMax(const Input& input, const float beta)
-{
-  EIGEN_STATIC_ASSERT(internal::traits<Input>::Layout == ColMajor, YOU_MADE_A_PROGRAMMING_MISTAKE);
-  EIGEN_STATIC_ASSERT(internal::traits<Input>::NumDimensions == 2, YOU_MADE_A_PROGRAMMING_MISTAKE);
+EIGEN_ALWAYS_INLINE static const TensorCustomUnaryOp<const SoftmaxOp,
+                                                     const Input>
+SoftMax(const Input& input, const float beta) {
+  EIGEN_STATIC_ASSERT(internal::traits<Input>::Layout == ColMajor,
+                      YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT(internal::traits<Input>::NumDimensions == 2,
+                      YOU_MADE_A_PROGRAMMING_MISTAKE);
 
   const SoftmaxOp op(beta);
   return input.customOp(op);
 }
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
 #endif  // TENSORFLOW_CORE_KERNELS_EIGEN_SOFTMAX_H_
diff --git a/tensorflow/core/kernels/eigen_softmax_test.cc b/tensorflow/core/kernels/eigen_softmax_test.cc
index ba681d68ab0d416cd2c7bae9065df9b95638a3e8..7f985d71366487e0426e25e064764c196979b114 100644
--- a/tensorflow/core/kernels/eigen_softmax_test.cc
+++ b/tensorflow/core/kernels/eigen_softmax_test.cc
@@ -23,7 +23,7 @@ namespace {
 void EigenApprox(float a, float b) {
   ASSERT_TRUE(std::abs(a - b) <= std::min(std::abs(a), std::abs(b)) * 1e-3);
 }
-}
+}  // namespace
 
 TEST(EigenSoftmaxTest, Simple) {
   const int depth = 1024;
diff --git a/tensorflow/core/kernels/eigen_spatial_convolutions.h b/tensorflow/core/kernels/eigen_spatial_convolutions.h
index 2fe64cd72ac06e86cccea31145079451d0b28f88..1acbe3a658070222e99ff874815db9a6b07d4565 100644
--- a/tensorflow/core/kernels/eigen_spatial_convolutions.h
+++ b/tensorflow/core/kernels/eigen_spatial_convolutions.h
@@ -877,29 +877,29 @@ struct gemm_pack_rhs<
 }  // end namespace internal
 
 /** SpatialConvolution
-  * \ingroup CXX11_NeuralNetworks_Module
-  *
-  * \brief Applies a 2D convolution over a multichannel input image.
-  *
-  * The input parameter is expected to be a tensor with a rank of 3 or more
+ * \ingroup CXX11_NeuralNetworks_Module
+ *
+ * \brief Applies a 2D convolution over a multichannel input image.
+ *
+ * The input parameter is expected to be a tensor with a rank of 3 or more
  * (channels, height, width, and optionally others)
-  * The kernel parameter is expected to be a 4D tensor (filters, channels,
+ * The kernel parameter is expected to be a 4D tensor (filters, channels,
  * kernel_height, kernel_width)
-  * The input and the kernel must both be in col-major layout. The result will
+ * The input and the kernel must both be in col-major layout. The result will
  * also be in col-major layout.
-  *
-  * If col_in_stride, row_in_stride > 1, then applies convolution with holes
+ *
+ * If col_in_stride, row_in_stride > 1, then applies convolution with holes
  * (aka atrous convolution), sampling every col_in_stride, row_in_stride input
  * pixels.
-  *
-  * The result can be assigned to a tensor of rank equal to the rank of the
+ *
+ * The result can be assigned to a tensor of rank equal to the rank of the
  * input. The dimensions of the result will be filters, height, width (and
  * others if applicable).
-  *
-  * It is possible to swap the order of the width and height dimensions provided
+ *
+ * It is possible to swap the order of the width and height dimensions provided
  * that the same order is used in the input, the kernel, and the output.
-  *
-  */
+ *
+ */
 template <typename Input, typename Kernel>
 EIGEN_DEVICE_FUNC
     EIGEN_ALWAYS_INLINE static const typename internal::conditional<
@@ -993,7 +993,7 @@ EIGEN_DEVICE_FUNC
     default:
       // Initialize unused variables to avoid a compiler warning
       out_height = 0;
-      out_width  = 0;
+      out_width = 0;
       eigen_assert(false && "unexpected padding");
   }
 
diff --git a/tensorflow/core/kernels/encode_jpeg_op.cc b/tensorflow/core/kernels/encode_jpeg_op.cc
index 4fcae25aa6eac8b31f78e1d5ae964aed427fc0f4..1a5b0f2b675a85ba2c1dbf0356c3e42b03db22b4 100644
--- a/tensorflow/core/kernels/encode_jpeg_op.cc
+++ b/tensorflow/core/kernels/encode_jpeg_op.cc
@@ -80,10 +80,11 @@ class EncodeJpegOp : public OpKernel {
                 errors::InvalidArgument("image must be 3-dimensional",
                                         image.shape().DebugString()));
 
-    OP_REQUIRES(context, FastBoundsCheck(image.NumElements(),
-                                         std::numeric_limits<int32>::max()),
-                errors::InvalidArgument(
-                    "Cannot encode images with >= max int32 elements"));
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(image.NumElements(), std::numeric_limits<int32>::max()),
+        errors::InvalidArgument(
+            "Cannot encode images with >= max int32 elements"));
 
     const int32 dim_size0 = static_cast<int32>(image.dim_size(0));
     const int32 dim_size1 = static_cast<int32>(image.dim_size(1));
@@ -100,9 +101,10 @@ class EncodeJpegOp : public OpKernel {
       } else if (channels == 3) {
         adjusted_flags.format = jpeg::FORMAT_RGB;
       } else {
-        OP_REQUIRES(context, false, errors::InvalidArgument(
-                                        "image must have 1 or 3 channels, got ",
-                                        image.shape().DebugString()));
+        OP_REQUIRES(
+            context, false,
+            errors::InvalidArgument("image must have 1 or 3 channels, got ",
+                                    image.shape().DebugString()));
       }
     } else {
       if (flags_.format == jpeg::FORMAT_GRAYSCALE) {
diff --git a/tensorflow/core/kernels/example_parsing_ops.cc b/tensorflow/core/kernels/example_parsing_ops.cc
index 268a059275acc62432d59df239abd5869f546064..83cd0e9b47e5480cd562452213aa81c7a4a64a95 100644
--- a/tensorflow/core/kernels/example_parsing_ops.cc
+++ b/tensorflow/core/kernels/example_parsing_ops.cc
@@ -346,8 +346,9 @@ class SingleSequenceExampleParserOp : public OpKernel {
           feature_list_sparse_keys[di].scalar<string>()();
     }
     OP_REQUIRES(
-        ctx, TensorShapeUtils::IsVector(
-                 feature_list_dense_missing_assumed_empty->shape()),
+        ctx,
+        TensorShapeUtils::IsVector(
+            feature_list_dense_missing_assumed_empty->shape()),
         errors::InvalidArgument(
             "Expected feature_list_dense_missing_assumed_empty ",
             "to be a vector, got shape: ",
@@ -386,12 +387,12 @@ class SingleSequenceExampleParserOp : public OpKernel {
       required[d] = (def_value.NumElements() == 0);  // No default provided.
 
       if (def_value.NumElements() > 0) {
-        OP_REQUIRES(
-            ctx, def_value.shape() == attrs_.context_dense_shapes[d],
-            errors::InvalidArgument(
-                "def_value[", d, "].shape() == ",
-                def_value.shape().DebugString(), " != context_dense_shapes_[",
-                d, "] == ", attrs_.context_dense_shapes[d].DebugString()));
+        OP_REQUIRES(ctx, def_value.shape() == attrs_.context_dense_shapes[d],
+                    errors::InvalidArgument(
+                        "def_value[", d,
+                        "].shape() == ", def_value.shape().DebugString(),
+                        " != context_dense_shapes_[", d,
+                        "] == ", attrs_.context_dense_shapes[d].DebugString()));
         OP_REQUIRES(
             ctx, def_value.dtype() == attrs_.context_dense_types[d],
             errors::InvalidArgument(
@@ -576,12 +577,12 @@ class SingleSequenceExampleParserOp : public OpKernel {
         const Feature& f = fl.feature(t);
         bool types_match;
         OP_REQUIRES_OK(ctx, CheckTypesMatch(f, dtype, &types_match));
-        OP_REQUIRES(
-            ctx, types_match,
-            errors::InvalidArgument(
-                "Name: ", name, ", Feature list: ", key, ", Index: ", t,
-                ".  Data types don't match. ", "Expected type: ",
-                DataTypeString(dtype), "  Feature is: ", ProtoDebugString(f)));
+        OP_REQUIRES(ctx, types_match,
+                    errors::InvalidArgument(
+                        "Name: ", name, ", Feature list: ", key, ", Index: ", t,
+                        ".  Data types don't match. ",
+                        "Expected type: ", DataTypeString(dtype),
+                        "  Feature is: ", ProtoDebugString(f)));
         OP_REQUIRES_OK(ctx, FeatureDenseCopy(t, name, key, dtype, shape, f,
                                              feature_list_dense_values[d]));
       }
diff --git a/tensorflow/core/kernels/fact_op.cc b/tensorflow/core/kernels/fact_op.cc
index 4fbf76d2d0d0470c0529353003eb7e086451d57f..4a1aa433bc94e5f190ce75c1b991eaf91210eedf 100644
--- a/tensorflow/core/kernels/fact_op.cc
+++ b/tensorflow/core/kernels/fact_op.cc
@@ -122,13 +122,9 @@ static string D(const char* s) {
   return ret;
 }
 
-REGISTER_KERNEL_BUILDER(Name("Fact")
-                            .Device(DEVICE_CPU)
-                            .Label(D("Yoxmos").c_str()),
-                        FactOpKernel2);
-REGISTER_KERNEL_BUILDER(Name("Fact")
-                            .Device(DEVICE_CPU)
-                            .Label(D("yoxmos").c_str()),
-                        FactOpKernel2);
+REGISTER_KERNEL_BUILDER(
+    Name("Fact").Device(DEVICE_CPU).Label(D("Yoxmos").c_str()), FactOpKernel2);
+REGISTER_KERNEL_BUILDER(
+    Name("Fact").Device(DEVICE_CPU).Label(D("yoxmos").c_str()), FactOpKernel2);
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/fake_quant_ops_test.cc b/tensorflow/core/kernels/fake_quant_ops_test.cc
index 5953db14768fd4e8d6c8537a2bea91c2ca211b17..af3a42135d1fe99da87c1cfafbc2b8eb932a7d2c 100644
--- a/tensorflow/core/kernels/fake_quant_ops_test.cc
+++ b/tensorflow/core/kernels/fake_quant_ops_test.cc
@@ -378,9 +378,8 @@ TEST_F(QuantOpsTest, WithArgsGradient_RegularRange) {
   Tensor* output = GetOutput(0);
   auto input_flat = GetInput(0).flat<float>();
   Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 3}));
-  FillValues<float>(&expected,
-                    {0.0f, input_flat(1), input_flat(2),
-                     input_flat(3), input_flat(4), 0.0f});
+  FillValues<float>(&expected, {0.0f, input_flat(1), input_flat(2),
+                                input_flat(3), input_flat(4), 0.0f});
   ExpectClose(expected, *output);
 }
 
@@ -2167,21 +2166,19 @@ TEST_F(QuantOpsTest,
   Tensor* output_bprop_wrt_input = GetOutput(0);
   Tensor expected_bprop_wrt_input(allocator(), DT_FLOAT, TensorShape({2, 3}));
   auto grad_flat = GetInput(0).flat<float>();
-  FillValues<float>(&expected_bprop_wrt_input,
-                    {0.0f, grad_flat(1), grad_flat(2),
-                     grad_flat(3), grad_flat(4), 0.0f});
+  FillValues<float>(
+      &expected_bprop_wrt_input,
+      {0.0f, grad_flat(1), grad_flat(2), grad_flat(3), grad_flat(4), 0.0f});
   ExpectClose(expected_bprop_wrt_input, *output_bprop_wrt_input);
 
   Tensor* output_bprop_wrt_min = GetOutput(1);
   Tensor expected_bprop_wrt_min(allocator(), DT_FLOAT, TensorShape({3}));
-  FillValues<float>(&expected_bprop_wrt_min,
-                    {grad_flat(0), 0.0f, 0.0f});
+  FillValues<float>(&expected_bprop_wrt_min, {grad_flat(0), 0.0f, 0.0f});
   ExpectClose(expected_bprop_wrt_min, *output_bprop_wrt_min);
 
   Tensor* output_bprop_wrt_max = GetOutput(2);
   Tensor expected_bprop_wrt_max(allocator(), DT_FLOAT, TensorShape({3}));
-  FillValues<float>(&expected_bprop_wrt_max,
-                    {0.0f, 0.0f, grad_flat(5)});
+  FillValues<float>(&expected_bprop_wrt_max, {0.0f, 0.0f, grad_flat(5)});
   ExpectClose(expected_bprop_wrt_max, *output_bprop_wrt_max);
 }
 
@@ -2215,21 +2212,19 @@ TEST_F(QuantOpsTest, WithVarsPerChannelDim2GradientNudgedUp_4Bits_NarrowRange) {
   Tensor* output_bprop_wrt_input = GetOutput(0);
   Tensor expected_bprop_wrt_input(allocator(), DT_FLOAT, TensorShape({2, 3}));
   auto grad_flat = GetInput(0).flat<float>();
-  FillValues<float>(&expected_bprop_wrt_input,
-                    {0.0f, grad_flat(1), grad_flat(2),
-                     grad_flat(3), grad_flat(4), 0.0f});
+  FillValues<float>(
+      &expected_bprop_wrt_input,
+      {0.0f, grad_flat(1), grad_flat(2), grad_flat(3), grad_flat(4), 0.0f});
   ExpectClose(expected_bprop_wrt_input, *output_bprop_wrt_input);
 
   Tensor* output_bprop_wrt_min = GetOutput(1);
   Tensor expected_bprop_wrt_min(allocator(), DT_FLOAT, TensorShape({3}));
-  FillValues<float>(&expected_bprop_wrt_min,
-                    {grad_flat(0), 0.0f, 0.0f});
+  FillValues<float>(&expected_bprop_wrt_min, {grad_flat(0), 0.0f, 0.0f});
   ExpectClose(expected_bprop_wrt_min, *output_bprop_wrt_min);
 
   Tensor* output_bprop_wrt_max = GetOutput(2);
   Tensor expected_bprop_wrt_max(allocator(), DT_FLOAT, TensorShape({3}));
-  FillValues<float>(&expected_bprop_wrt_max,
-                    {0.0f, 0.0f, grad_flat(5)});
+  FillValues<float>(&expected_bprop_wrt_max, {0.0f, 0.0f, grad_flat(5)});
   ExpectClose(expected_bprop_wrt_max, *output_bprop_wrt_max);
 }
 
@@ -2270,14 +2265,13 @@ TEST_F(QuantOpsTest,
   Tensor expected_bprop_wrt_input(allocator(), DT_FLOAT,
                                   TensorShape({1, 2, 3, 4}));
   auto grad_flat = GetInput(0).flat<float>();
-  FillValues<float>(
-      &expected_bprop_wrt_input,
-      {0.0f, grad_flat(1), grad_flat(2), 0.0f,
-       0.0f, grad_flat(5), grad_flat(6), 0.0f,
-       0.0f, grad_flat(9), grad_flat(10), 0.0f,
-       0.0f, grad_flat(13), grad_flat(14), 0.0f,
-       0.0f, grad_flat(17), grad_flat(18), 0.0f,
-       0.0f, grad_flat(21), grad_flat(22), 0.0f});
+  FillValues<float>(&expected_bprop_wrt_input,
+                    {0.0f, grad_flat(1),  grad_flat(2),  0.0f,
+                     0.0f, grad_flat(5),  grad_flat(6),  0.0f,
+                     0.0f, grad_flat(9),  grad_flat(10), 0.0f,
+                     0.0f, grad_flat(13), grad_flat(14), 0.0f,
+                     0.0f, grad_flat(17), grad_flat(18), 0.0f,
+                     0.0f, grad_flat(21), grad_flat(22), 0.0f});
   ExpectClose(expected_bprop_wrt_input, *output_bprop_wrt_input);
 
   Tensor* output_bprop_wrt_min = GetOutput(1);
diff --git a/tensorflow/core/kernels/fifo_queue.cc b/tensorflow/core/kernels/fifo_queue.cc
index 82ec87911985abe714490ad74fa19105f850b536..479f7be4b506e4f8721216fb00ea0eff7e0394c2 100644
--- a/tensorflow/core/kernels/fifo_queue.cc
+++ b/tensorflow/core/kernels/fifo_queue.cc
@@ -255,97 +255,96 @@ void FIFOQueue::TryDequeueMany(int num_elements, OpKernelContext* ctx,
       // TODO(josh11b): This makes two copies of callback, avoid this if possible.
       dequeue_attempts_.emplace_back(
           num_elements, [callback]() { callback(Tuple()); }, ctx, cm, token,
-          [callback, allow_small_batch, this](Attempt* attempt)
-              EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-                int64 queue_size = queues_[0].size();
+          [callback, allow_small_batch,
+           this](Attempt* attempt) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+            int64 queue_size = queues_[0].size();
 
-                if (closed_ && queue_size < attempt->elements_requested) {
-                  // If we don't have enough for a full dequeue, we have
-                  // to reset the attempt tuple.
-                  if (!attempt->tuple.empty()) {
-                    // Restore already-dequeued elements to the front of the
-                    // queue.
-                    for (int64 i = attempt->tuple[0].dim_size(0) -
-                                   attempt->elements_requested - 1;
-                         i >= 0; --i) {
-                      for (int j = 0; j < num_components(); ++j) {
-                        PersistentTensor element;
-                        Status s = GetElementComponentFromBatch(
-                            attempt->tuple, i, j, attempt->context, &element);
-                        if (!s.ok()) {
-                          attempt->context->SetStatus(
-                              errors::DataLoss("Failed to restore element from "
-                                               "partially-dequeued batch "
-                                               "to FIFOQueue: ",
-                                               s.error_message()));
-                        }
-                        queues_[j].push_front(element);
-                      }
-                    }
-                  }
-                  if (allow_small_batch && !queues_[0].empty()) {
-                    // Request all remaining elements in the queue.
-                    queue_size = queues_[0].size();
-                    attempt->tuple.clear();
-                    attempt->elements_requested = queue_size;
-                  } else {
-                    if (allow_small_batch) {
-                      // There may be some other attempts containing
-                      // values.  If so, we'll yield and wait for them
-                      // to add elements to the queue.
-                      if (!enqueue_attempts_.empty()) return kProgress;
-                    }
-                    if (attempt->context->status().ok()) {
-                      attempt->context->SetStatus(errors::OutOfRange(
-                          "FIFOQueue '", name_, "' is closed and has ",
-                          "insufficient elements (requested ",
-                          attempt->elements_requested, ", current size ",
-                          queue_size, ")"));
+            if (closed_ && queue_size < attempt->elements_requested) {
+              // If we don't have enough for a full dequeue, we have
+              // to reset the attempt tuple.
+              if (!attempt->tuple.empty()) {
+                // Restore already-dequeued elements to the front of the
+                // queue.
+                for (int64 i = attempt->tuple[0].dim_size(0) -
+                               attempt->elements_requested - 1;
+                     i >= 0; --i) {
+                  for (int j = 0; j < num_components(); ++j) {
+                    PersistentTensor element;
+                    Status s = GetElementComponentFromBatch(
+                        attempt->tuple, i, j, attempt->context, &element);
+                    if (!s.ok()) {
+                      attempt->context->SetStatus(
+                          errors::DataLoss("Failed to restore element from "
+                                           "partially-dequeued batch "
+                                           "to FIFOQueue: ",
+                                           s.error_message()));
                     }
-                    return kComplete;
+                    queues_[j].push_front(element);
                   }
                 }
+              }
+              if (allow_small_batch && !queues_[0].empty()) {
+                // Request all remaining elements in the queue.
+                queue_size = queues_[0].size();
+                attempt->tuple.clear();
+                attempt->elements_requested = queue_size;
+              } else {
+                if (allow_small_batch) {
+                  // There may be some other attempts containing
+                  // values.  If so, we'll yield and wait for them
+                  // to add elements to the queue.
+                  if (!enqueue_attempts_.empty()) return kProgress;
+                }
+                if (attempt->context->status().ok()) {
+                  attempt->context->SetStatus(errors::OutOfRange(
+                      "FIFOQueue '", name_, "' is closed and has ",
+                      "insufficient elements (requested ",
+                      attempt->elements_requested, ", current size ",
+                      queue_size, ")"));
+                }
+                return kComplete;
+              }
+            }
 
-                RunResult result = kNoProgress;
-                for (; queue_size > 0; --queue_size) {
-                  if (attempt->tuple.empty()) {
-                    // Only allocate tuple when we have something to dequeue
-                    // so we don't use excessive memory when there are many
-                    // blocked dequeue attempts waiting.
-                    attempt->tuple.reserve(num_components());
-                    for (int i = 0; i < num_components(); ++i) {
-                      const TensorShape shape =
-                          ManyOutShape(i, attempt->elements_requested);
-                      Tensor element;
-                      attempt->context->SetStatus(
-                          attempt->context->allocate_temp(component_dtypes_[i],
-                                                          shape, &element));
-                      if (!attempt->context->status().ok()) return kComplete;
-                      attempt->tuple.emplace_back(element);
-                    }
-                  }
-                  result = kProgress;
-                  Tuple tuple;
-                  DequeueLocked(attempt->context, &tuple);
-                  const int64 index = attempt->tuple[0].dim_size(0) -
-                                      attempt->elements_requested;
-                  for (int i = 0; i < num_components(); ++i) {
-                    attempt->context->SetStatus(batch_util::CopyElementToSlice(
-                        std::move(tuple[i]), &attempt->tuple[i], index));
-                    if (!attempt->context->status().ok()) return kComplete;
-                  }
-                  tuple.clear();
-                  --attempt->elements_requested;
-                  if (attempt->elements_requested == 0) {
-                    tuple = attempt->tuple;
-                    attempt->done_callback = [callback, tuple]() {
-                      callback(tuple);
-                    };
-                    return kComplete;
-                  }
+            RunResult result = kNoProgress;
+            for (; queue_size > 0; --queue_size) {
+              if (attempt->tuple.empty()) {
+                // Only allocate tuple when we have something to dequeue
+                // so we don't use excessive memory when there are many
+                // blocked dequeue attempts waiting.
+                attempt->tuple.reserve(num_components());
+                for (int i = 0; i < num_components(); ++i) {
+                  const TensorShape shape =
+                      ManyOutShape(i, attempt->elements_requested);
+                  Tensor element;
+                  attempt->context->SetStatus(attempt->context->allocate_temp(
+                      component_dtypes_[i], shape, &element));
+                  if (!attempt->context->status().ok()) return kComplete;
+                  attempt->tuple.emplace_back(element);
                 }
-                return result;
-              });
+              }
+              result = kProgress;
+              Tuple tuple;
+              DequeueLocked(attempt->context, &tuple);
+              const int64 index =
+                  attempt->tuple[0].dim_size(0) - attempt->elements_requested;
+              for (int i = 0; i < num_components(); ++i) {
+                attempt->context->SetStatus(batch_util::CopyElementToSlice(
+                    std::move(tuple[i]), &attempt->tuple[i], index));
+                if (!attempt->context->status().ok()) return kComplete;
+              }
+              tuple.clear();
+              --attempt->elements_requested;
+              if (attempt->elements_requested == 0) {
+                tuple = attempt->tuple;
+                attempt->done_callback = [callback, tuple]() {
+                  callback(tuple);
+                };
+                return kComplete;
+              }
+            }
+            return result;
+          });
     }
   }
   if (!already_cancelled) {
diff --git a/tensorflow/core/kernels/fill_functor.cc b/tensorflow/core/kernels/fill_functor.cc
index bde39770dee0a3e66746bb47562f799ab8bb1224..7090417dfdb2d7e433025b1a0f1cdeb5eece10a8 100644
--- a/tensorflow/core/kernels/fill_functor.cc
+++ b/tensorflow/core/kernels/fill_functor.cc
@@ -18,8 +18,8 @@ limitations under the License.
 #define EIGEN_USE_THREADS
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
-#include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/framework/variant_encode_decode.h"
 
@@ -60,7 +60,7 @@ DEFINE_SETZERO_CPU(Variant);
 template <typename T>
 void SetZeroFunctor<Eigen::SyclDevice, T>::operator()(
     const Eigen::SyclDevice& d, typename TTypes<T>::Flat out) {
-      To32Bit(out).device(d) = To32Bit(out).constant(T(0));
+  To32Bit(out).device(d) = To32Bit(out).constant(T(0));
 }
 
 #define DEFINE_SETZERO_SYCL(T) \
@@ -118,7 +118,8 @@ DEFINE_SETONE_SYCL(double);
 
 template <typename T>
 struct FillFunctor<Eigen::ThreadPoolDevice, T> {
-  void operator()(const Eigen::ThreadPoolDevice& d, typename TTypes<T>::Flat out,
+  void operator()(const Eigen::ThreadPoolDevice& d,
+                  typename TTypes<T>::Flat out,
                   typename TTypes<T>::ConstScalar in) {
     out.device(d) = out.constant(in());
   }
@@ -150,8 +151,7 @@ struct FillFunctor<Eigen::SyclDevice, T> {
   }
 };
 
-#define DEFINE_FILL_SYCL(T) \
-  template struct FillFunctor<Eigen::SyclDevice, T>;
+#define DEFINE_FILL_SYCL(T) template struct FillFunctor<Eigen::SyclDevice, T>;
 DEFINE_FILL_SYCL(float);
 DEFINE_FILL_SYCL(double);
 TF_CALL_INTEGRAL_TYPES(DEFINE_FILL_SYCL)
diff --git a/tensorflow/core/kernels/fractional_avg_pool_op.cc b/tensorflow/core/kernels/fractional_avg_pool_op.cc
index 47f4189c30f10644ca7b040677ebadf439a9dc75..135d0023458b1ef393ab0bc296dc07310347e7ff 100644
--- a/tensorflow/core/kernels/fractional_avg_pool_op.cc
+++ b/tensorflow/core/kernels/fractional_avg_pool_op.cc
@@ -232,8 +232,9 @@ class FractionalAvgPoolGradOp : public OpKernel {
 
     // Grab the inputs.
     const Tensor& orig_input_tensor_shape = context->input(0);
-    OP_REQUIRES(context, orig_input_tensor_shape.dims() == 1 &&
-                             orig_input_tensor_shape.NumElements() == 4,
+    OP_REQUIRES(context,
+                orig_input_tensor_shape.dims() == 1 &&
+                    orig_input_tensor_shape.NumElements() == 4,
                 errors::InvalidArgument("original input tensor shape must be"
                                         "1-dimensional and 4 elements"));
     const Tensor& out_backprop = context->input(1);
diff --git a/tensorflow/core/kernels/fractional_pool_common.h b/tensorflow/core/kernels/fractional_pool_common.h
index df0bbbfa066bca4705ff371d1823f789a1c4e9ef..2d7a230fc00613d91d147d4927403ba270a4d562 100644
--- a/tensorflow/core/kernels/fractional_pool_common.h
+++ b/tensorflow/core/kernels/fractional_pool_common.h
@@ -57,7 +57,7 @@ static inline void RandomShuffle(Iter first, Iter last, const Random& uniform) {
 //     * sum(generated_diff_pooling_sequence) = input_length
 //     * Let's define floor(input_length / output_length) = K, then
 //       K <= generated_diff_pooling_sequence[i] <= K+1
-// For example, when input_length = 10, output_length = 6, the followings are
+// For example, when input_length = 10, output_length = 6, the following are
 // valid pooling sequence:
 //     * [1, 2, 2, 1, 2, 2]
 //     * [1, 1, 2, 2, 2, 2]
diff --git a/tensorflow/core/kernels/function_ops.cc b/tensorflow/core/kernels/function_ops.cc
index ef9e8484132d25e517367862364518ca0baf38af..9d4bc35ba890c251b0800f266e7845e411e7a835 100644
--- a/tensorflow/core/kernels/function_ops.cc
+++ b/tensorflow/core/kernels/function_ops.cc
@@ -253,22 +253,21 @@ class SymbolicGradientOp : public AsyncOpKernel {
       args.push_back(ctx->input(i));
     }
     std::vector<Tensor>* rets = new std::vector<Tensor>;
-    lib->Run(
-        opts, handle, args, rets, [ctx, done, rets](const Status& status) {
-          if (!status.ok()) {
-            ctx->SetStatus(status);
-          } else if (rets->size() != ctx->num_outputs()) {
-            ctx->SetStatus(errors::InvalidArgument(
-                "SymGrad expects to return ", ctx->num_outputs(),
-                " tensor(s), but get ", rets->size(), " tensor(s) instead."));
-          } else {
-            for (size_t i = 0; i < rets->size(); ++i) {
-              ctx->set_output(i, (*rets)[i]);
-            }
-          }
-          delete rets;
-          done();
-        });
+    lib->Run(opts, handle, args, rets, [ctx, done, rets](const Status& status) {
+      if (!status.ok()) {
+        ctx->SetStatus(status);
+      } else if (rets->size() != ctx->num_outputs()) {
+        ctx->SetStatus(errors::InvalidArgument(
+            "SymGrad expects to return ", ctx->num_outputs(),
+            " tensor(s), but get ", rets->size(), " tensor(s) instead."));
+      } else {
+        for (size_t i = 0; i < rets->size(); ++i) {
+          ctx->set_output(i, (*rets)[i]);
+        }
+      }
+      delete rets;
+      done();
+    });
   }
 
  private:
diff --git a/tensorflow/core/kernels/fused_batch_norm_op.cu.cc b/tensorflow/core/kernels/fused_batch_norm_op.cu.cc
index a8484390b928105cb51216e18e419957f12ad2ac..4a67b2b3a30463448ac97aff96402f6500eeb19a 100644
--- a/tensorflow/core/kernels/fused_batch_norm_op.cu.cc
+++ b/tensorflow/core/kernels/fused_batch_norm_op.cu.cc
@@ -68,7 +68,8 @@ void InvVarianceToVariance<T>::operator()(const Eigen::GpuDevice& d,
 template <class T>
 void SetNanFunctor<T>::operator()(const Eigen::GpuDevice& d,
                                   typename TTypes<T>::Flat out) {
-  To32Bit(out).device(d) = To32Bit(out).constant(Eigen::NumTraits<T>::quiet_NaN());
+  To32Bit(out).device(d) =
+      To32Bit(out).constant(Eigen::NumTraits<T>::quiet_NaN());
 }
 
 template class VarianceToInvVariance<float>;
diff --git a/tensorflow/core/kernels/fuzzing/decode_base64_fuzz.cc b/tensorflow/core/kernels/fuzzing/decode_base64_fuzz.cc
index 6d4a9dfdef4609a45d3a38e49a32492408043617..37edd1ce0f95d7f6d6a366f5b0d83bac7f6159d5 100644
--- a/tensorflow/core/kernels/fuzzing/decode_base64_fuzz.cc
+++ b/tensorflow/core/kernels/fuzzing/decode_base64_fuzz.cc
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
 #include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
 
 namespace tensorflow {
 namespace fuzzing {
diff --git a/tensorflow/core/kernels/fuzzing/decode_jpeg_fuzz.cc b/tensorflow/core/kernels/fuzzing/decode_jpeg_fuzz.cc
index b084a972049cc2b1997df64a2f43a6d79b6b4e6d..f3b24b2341e590adfbeac1a18b6a65fbfd34f598 100644
--- a/tensorflow/core/kernels/fuzzing/decode_jpeg_fuzz.cc
+++ b/tensorflow/core/kernels/fuzzing/decode_jpeg_fuzz.cc
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
 #include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
 
 namespace tensorflow {
 namespace fuzzing {
diff --git a/tensorflow/core/kernels/fuzzing/decode_json_example_fuzz.cc b/tensorflow/core/kernels/fuzzing/decode_json_example_fuzz.cc
index 9dd795b94e82c48ad037df67f3218ed62feb722e..e9ffad178616a7b0872d461653cb01c40b292d88 100644
--- a/tensorflow/core/kernels/fuzzing/decode_json_example_fuzz.cc
+++ b/tensorflow/core/kernels/fuzzing/decode_json_example_fuzz.cc
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
 #include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
 
 namespace tensorflow {
 namespace fuzzing {
diff --git a/tensorflow/core/kernels/fuzzing/decode_png_fuzz.cc b/tensorflow/core/kernels/fuzzing/decode_png_fuzz.cc
index 4a68a5b5803f363ab93bf280df54fa8f14206a84..020f18b1895c480748cafbfb8f7f267887db1fba 100644
--- a/tensorflow/core/kernels/fuzzing/decode_png_fuzz.cc
+++ b/tensorflow/core/kernels/fuzzing/decode_png_fuzz.cc
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
 #include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
 
 namespace tensorflow {
 namespace fuzzing {
diff --git a/tensorflow/core/kernels/fuzzing/encode_base64_fuzz.cc b/tensorflow/core/kernels/fuzzing/encode_base64_fuzz.cc
index 2d6c82826cf9dad1ca67d6e5ee1d13a059f9c8ea..a8f07f4bad3a7e7ccff4ebefd4c56c695d0b2573 100644
--- a/tensorflow/core/kernels/fuzzing/encode_base64_fuzz.cc
+++ b/tensorflow/core/kernels/fuzzing/encode_base64_fuzz.cc
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
 #include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
 
 namespace tensorflow {
 namespace fuzzing {
diff --git a/tensorflow/core/kernels/fuzzing/encode_jpeg_fuzz.cc b/tensorflow/core/kernels/fuzzing/encode_jpeg_fuzz.cc
index 81b6e491248fda37f602c0365c1e90d4b08f7c2a..f5dd47a052cd098937d66394ed04c66831ee5972 100644
--- a/tensorflow/core/kernels/fuzzing/encode_jpeg_fuzz.cc
+++ b/tensorflow/core/kernels/fuzzing/encode_jpeg_fuzz.cc
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
 #include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
 
 namespace tensorflow {
 namespace fuzzing {
diff --git a/tensorflow/core/kernels/fuzzing/example_proto_fast_parsing_fuzz.cc b/tensorflow/core/kernels/fuzzing/example_proto_fast_parsing_fuzz.cc
index d91a351c5969e71385348b76376202c14e86daac..4d736a21602b34b560ea1c8d9ede4645d806ca29 100644
--- a/tensorflow/core/kernels/fuzzing/example_proto_fast_parsing_fuzz.cc
+++ b/tensorflow/core/kernels/fuzzing/example_proto_fast_parsing_fuzz.cc
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
 #include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
 
 namespace tensorflow {
 namespace fuzzing {
diff --git a/tensorflow/core/kernels/fuzzing/fuzz_session.h b/tensorflow/core/kernels/fuzzing/fuzz_session.h
index 0c0e548a909a0c87c622449c8ac6f66db29b5b8d..f1f3f199df137b83193c4d1e974dfb401d9ec9ff 100644
--- a/tensorflow/core/kernels/fuzzing/fuzz_session.h
+++ b/tensorflow/core/kernels/fuzzing/fuzz_session.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef LEARNING_BRAIN_KERNELS_FUZZING_FUZZ_SESSION_H_
-#define LEARNING_BRAIN_KERNELS_FUZZING_FUZZ_SESSION_H_
+#ifndef TENSORFLOW_CORE_KERNELS_FUZZING_FUZZ_SESSION_H_
+#define TENSORFLOW_CORE_KERNELS_FUZZING_FUZZ_SESSION_H_
 
 #include "tensorflow/cc/framework/scope.h"
 #include "tensorflow/core/graph/graph.h"
@@ -153,4 +153,4 @@ class FuzzStringInputOp : public FuzzSession {
 }  // end namespace fuzzing
 }  // end namespace tensorflow
 
-#endif  // LEARNING_BRAIN_KERNELS_FUZZING_FUZZ_SESSION_H_
+#endif  // TENSORFLOW_CORE_KERNELS_FUZZING_FUZZ_SESSION_H_
diff --git a/tensorflow/core/kernels/fuzzing/identity_fuzz.cc b/tensorflow/core/kernels/fuzzing/identity_fuzz.cc
index ac3a12aa399a3efe532c71c49a092b6cecd6059b..5c3fc4a2795430d1f8f269f42131e882106db7b0 100644
--- a/tensorflow/core/kernels/fuzzing/identity_fuzz.cc
+++ b/tensorflow/core/kernels/fuzzing/identity_fuzz.cc
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
 #include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
 
 namespace tensorflow {
 namespace fuzzing {
diff --git a/tensorflow/core/kernels/fuzzing/parse_tensor_op_fuzz.cc b/tensorflow/core/kernels/fuzzing/parse_tensor_op_fuzz.cc
index 978fcd102822a6a2690478eaca473eabc6ae83ab..c90ad2cfeb7222f4c75e718fcaea6955567f3a4a 100644
--- a/tensorflow/core/kernels/fuzzing/parse_tensor_op_fuzz.cc
+++ b/tensorflow/core/kernels/fuzzing/parse_tensor_op_fuzz.cc
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
 #include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
 
 namespace tensorflow {
 namespace fuzzing {
diff --git a/tensorflow/core/kernels/fuzzing/string_split_fuzz.cc b/tensorflow/core/kernels/fuzzing/string_split_fuzz.cc
index 7d1aa1fbf3a149d25e82b454543a5add522145af..738d78e99a0081a2b9f0f59c94433372acec19e2 100644
--- a/tensorflow/core/kernels/fuzzing/string_split_fuzz.cc
+++ b/tensorflow/core/kernels/fuzzing/string_split_fuzz.cc
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
 #include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
 
 namespace tensorflow {
 namespace fuzzing {
diff --git a/tensorflow/core/kernels/fuzzing/string_to_number_fuzz.cc b/tensorflow/core/kernels/fuzzing/string_to_number_fuzz.cc
index 94255d215e5292bf77ab1104eb1d36c0cc1d661c..e98363ffbf166782649f3fa12dc2ab70024908cf 100644
--- a/tensorflow/core/kernels/fuzzing/string_to_number_fuzz.cc
+++ b/tensorflow/core/kernels/fuzzing/string_to_number_fuzz.cc
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
 #include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
 
 namespace tensorflow {
 namespace fuzzing {
diff --git a/tensorflow/core/kernels/gather_functor.cc b/tensorflow/core/kernels/gather_functor.cc
index dde08b37eacb9edada92f98c5115f694015aad34..e6fefe643b72bd5a169f0c152ac2fee2568462aa 100644
--- a/tensorflow/core/kernels/gather_functor.cc
+++ b/tensorflow/core/kernels/gather_functor.cc
@@ -25,12 +25,12 @@ typedef Eigen::GpuDevice GPUDevice;
 namespace functor {
 
 // Forward declarations of the functor specializations for GPU.
-#define DECLARE_GPU_SPECS_INDEX(T, Index)                             \
-  template <>                                                         \
-  int64 GatherFunctor<GPUDevice, T, Index>::operator()(               \
+#define DECLARE_GPU_SPECS_INDEX(T, Index)                               \
+  template <>                                                           \
+  int64 GatherFunctor<GPUDevice, T, Index>::operator()(                 \
       OpKernelContext* ctx, typename TTypes<T, 3>::ConstTensor Tparams, \
-      typename TTypes<Index>::ConstFlat Tindices,                     \
-      typename TTypes<T, 3>::Tensor Tout);                            \
+      typename TTypes<Index>::ConstFlat Tindices,                       \
+      typename TTypes<T, 3>::Tensor Tout);                              \
   extern template struct GatherFunctor<GPUDevice, T, Index>;
 
 #define DECLARE_GPU_SPECS(T)         \
diff --git a/tensorflow/core/kernels/gather_functor.h b/tensorflow/core/kernels/gather_functor.h
index 1e429a037e8b16f5e01766125e1d10ec7567d78d..16ccb03b8502dd626c0dc4f0c10fcfe50224c7b8 100644
--- a/tensorflow/core/kernels/gather_functor.h
+++ b/tensorflow/core/kernels/gather_functor.h
@@ -18,12 +18,12 @@ limitations under the License.
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
+#include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/type_traits.h"
 #include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/platform/prefetch.h"
 #include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/util/work_sharder.h"
 
 namespace tensorflow {
@@ -52,21 +52,23 @@ SliceIndex HandleCopies(OpKernelContext* ctx,
   const size_t slice_bytes = slice_elems * sizeof(T);
   auto worker_threads = ctx->device()->tensorflow_cpu_worker_threads();
   mutex mu;
-  // Store the value of invalidate index for printing error information, it's a shared variable.
+  // Store the value of invalidate index for printing error information, it's a
+  // shared variable.
   SliceIndex result = -1;
-  auto work = [&] (int64 start, int64 end) {
+  auto work = [&](int64 start, int64 end) {
     SliceIndex batch_idx = static_cast<SliceIndex>(start / indices_size);
     SliceIndex indices_idx = static_cast<SliceIndex>(start % indices_size);
     SliceIndex batch_idx_end = static_cast<SliceIndex>(end / indices_size);
     SliceIndex indices_idx_end = static_cast<SliceIndex>(end % indices_size);
 
     while ((batch_idx < batch_idx_end) ||
-            (batch_idx == batch_idx_end && indices_idx < indices_idx_end)) {
+           (batch_idx == batch_idx_end && indices_idx < indices_idx_end)) {
       SliceIndex i_next = indices_idx + 1;
       SliceIndex b_next = batch_idx + 1;
       if ((batch_idx == batch_idx_end && i_next < indices_idx_end) ||
-              (i_next < indices_size)) {
-        port::prefetch<port::PREFETCH_HINT_T0>(&params(batch_idx, indices(i_next), 0));
+          (i_next < indices_size)) {
+        port::prefetch<port::PREFETCH_HINT_T0>(
+            &params(batch_idx, indices(i_next), 0));
         port::prefetch<port::PREFETCH_HINT_T0>(&out(batch_idx, i_next, 0));
         b_next = batch_idx;
       } else if (b_next <= batch_idx_end) {
@@ -85,11 +87,12 @@ SliceIndex HandleCopies(OpKernelContext* ctx,
       // ahead-of-time compilation binary size).
       if (is_simple_type<T>::value) {
         // Avoid auto-promotion to Index from SliceIndex by casting.
-        memcpy(out_base + (batch_idx * indices_size + indices_idx) * slice_elems,
-               params_base + (batch_idx * static_cast<SliceIndex>(limit) +
-                              static_cast<SliceIndex>(index)) *
-                             slice_elems,
-               slice_bytes);
+        memcpy(
+            out_base + (batch_idx * indices_size + indices_idx) * slice_elems,
+            params_base + (batch_idx * static_cast<SliceIndex>(limit) +
+                           static_cast<SliceIndex>(index)) *
+                              slice_elems,
+            slice_bytes);
       } else {
         // For non-"simple" types (e.g. strings).
         out.template chip<1>(indices_idx) = params.template chip<1>(index);
@@ -99,8 +102,8 @@ SliceIndex HandleCopies(OpKernelContext* ctx,
     }
   };
 
-  Shard(worker_threads->num_threads, worker_threads->workers, batch_size*indices_size,
-        slice_elems * sizeof(T), work);
+  Shard(worker_threads->num_threads, worker_threads->workers,
+        batch_size * indices_size, slice_elems * sizeof(T), work);
   return result;
 }
 
@@ -117,16 +120,16 @@ struct GatherFunctorCPU {
     bool use_large = (slice_size > std::numeric_limits<int32>::max() ||
                       params.size() > std::numeric_limits<int32>::max() ||
                       N > std::numeric_limits<int32>::max());
-#define CALL(elems)                                                        \
-  do {                                                                     \
-    if (use_large) {                                                       \
-      bad_i = HandleCopies<T, Index, int64, elems>(ctx, params, indices,   \
-                                                   slice_size, out);       \
-    } else {                                                               \
-      const int32 small_slice = static_cast<int32>(slice_size);            \
-      bad_i = HandleCopies<T, Index, int32, elems>(ctx, params, indices,   \
-                                                   small_slice, out);      \
-    }                                                                      \
+#define CALL(elems)                                                      \
+  do {                                                                   \
+    if (use_large) {                                                     \
+      bad_i = HandleCopies<T, Index, int64, elems>(ctx, params, indices, \
+                                                   slice_size, out);     \
+    } else {                                                             \
+      const int32 small_slice = static_cast<int32>(slice_size);          \
+      bad_i = HandleCopies<T, Index, int32, elems>(ctx, params, indices, \
+                                                   small_slice, out);    \
+    }                                                                    \
   } while (0)
 
     if (slice_size == 10)
@@ -143,7 +146,8 @@ struct GatherFunctorCPU {
 
 template <typename Device, typename T, typename Index>
 struct GatherFunctor {
-  int64 operator()(OpKernelContext* ctx, typename TTypes<T, 3>::ConstTensor params,
+  int64 operator()(OpKernelContext* ctx,
+                   typename TTypes<T, 3>::ConstTensor params,
                    typename TTypes<Index>::ConstFlat indices,
                    typename TTypes<T, 3>::Tensor out);
 };
diff --git a/tensorflow/core/kernels/gather_op.cc b/tensorflow/core/kernels/gather_op.cc
index 239d5d2e990a88bbc8ca5949a07a2aa2a75de2ba..0a38d3d4af92bcd228f8d5c8e98b95fa05839e64 100644
--- a/tensorflow/core/kernels/gather_op.cc
+++ b/tensorflow/core/kernels/gather_op.cc
@@ -18,6 +18,8 @@ limitations under the License.
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/variant.h"
+#include "tensorflow/core/framework/variant_encode_decode.h"
 #include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/gather_functor.h"
 #include "tensorflow/core/platform/mem.h"
@@ -106,8 +108,7 @@ class GatherOp : public OpKernel {
       auto out_flat = out->shaped<T, 3>({outer_size, N, inner_size});
 
       functor::GatherFunctor<Device, T, Index> functor;
-      int64 bad_i = functor(c, params_flat,
-                            indices_flat, out_flat);
+      int64 bad_i = functor(c, params_flat, indices_flat, out_flat);
 
       OP_REQUIRES(
           c, bad_i < 0,
@@ -142,6 +143,7 @@ TF_CALL_ALL_TYPES(REGISTER_GATHER_CPU);
 TF_CALL_QUANTIZED_TYPES(REGISTER_GATHER_CPU);
 TF_CALL_quint16(REGISTER_GATHER_CPU);
 TF_CALL_qint16(REGISTER_GATHER_CPU);
+TF_CALL_variant(REGISTER_GATHER_CPU);
 
 #undef REGISTER_GATHER_CPU
 
diff --git a/tensorflow/core/kernels/hexagon/graph_transfer_utils.cc b/tensorflow/core/kernels/hexagon/graph_transfer_utils.cc
index f0d7c670a62bf0a520cb37f01beda530d157d5c7..4040bf52bffe638d601f954f9a81d9eda78346a6 100644
--- a/tensorflow/core/kernels/hexagon/graph_transfer_utils.cc
+++ b/tensorflow/core/kernels/hexagon/graph_transfer_utils.cc
@@ -46,7 +46,7 @@ GraphTransferUtils::GetTopNFloatResults(const float* const data,
       GetTopNFloatResults(data, labels, element_count);
   LOG(INFO) << "=== Dump ranking ===";
   for (int i = 0; i < top_n; ++i) {
-    const std::tuple<float, int, string> &entry = queue.top();
+    const std::tuple<float, int, string>& entry = queue.top();
     LOG(INFO) << i << ": " << std::get<1>(entry) << ", " << std::get<2>(entry)
               << ", " << std::get<0>(entry);
     queue.pop();
diff --git a/tensorflow/core/kernels/hexagon/graph_transferer.h b/tensorflow/core/kernels/hexagon/graph_transferer.h
index a360d188cc2246b87af348db9958152418742822..0d43d028cdbea02b820d8ac0c48378524e875e78 100644
--- a/tensorflow/core/kernels/hexagon/graph_transferer.h
+++ b/tensorflow/core/kernels/hexagon/graph_transferer.h
@@ -181,8 +181,8 @@ class GraphTransferer {
   void AppendNodeInputParams(const int id, const Node& node,
                              const std::vector<int>& extra_inputs);
 
-  void AppendNodeOutputParams(const ShapeRefiner& shape_refiner,
-                              const int id, const Node& node);
+  void AppendNodeOutputParams(const ShapeRefiner& shape_refiner, const int id,
+                              const Node& node);
 
   static std::array<int64, SHAPE_ARRAY_SIZE> BuildShapeArray(
       const shape_inference::ShapeHandle& shape_handle,
diff --git a/tensorflow/core/kernels/hexagon/graph_transferer_test.cc b/tensorflow/core/kernels/hexagon/graph_transferer_test.cc
index 536d295506c9669b0434059e26094cb70a4f1e87..20b09f144bab5482f2cf1bfa86cf22f0b7ff815e 100644
--- a/tensorflow/core/kernels/hexagon/graph_transferer_test.cc
+++ b/tensorflow/core/kernels/hexagon/graph_transferer_test.cc
@@ -42,8 +42,7 @@ constexpr float VALUE_TOLERANCE_FLOAT = 1e-8f;
 
 class GraphTransfererTest : public ::testing::Test {
  protected:
-  void SetUp() final {
-  }
+  void SetUp() final {}
 
   GraphTransferer gt_;
 };
@@ -61,7 +60,7 @@ class TestGraphTransferOpsDefinitions : public IRemoteFusedGraphOpsDefinitions {
       }
     }
     return -1;
-}
+  }
 
  private:
   const std::vector<string> op_types_{"INPUT",   "OUTPUT",  "Conv2D",
diff --git a/tensorflow/core/kernels/hexagon/hexagon_graph_execution_test.cc b/tensorflow/core/kernels/hexagon/hexagon_graph_execution_test.cc
index 71bc4187b74cd6501d203aa3779c6d01e01f0d38..3f794dfb1a04cfdd6f7c114e0b2c7c0aac319a61 100644
--- a/tensorflow/core/kernels/hexagon/hexagon_graph_execution_test.cc
+++ b/tensorflow/core/kernels/hexagon/hexagon_graph_execution_test.cc
@@ -420,7 +420,7 @@ TEST(GraphTransferer,
       false,  // is_text_proto
       false,  // shape_inference_for_unknown_shape
       true    // dry_run_for_unknown_shape
-      );
+  );
   ASSERT_TRUE(status.ok()) << status;
   prof.Stop();
   prof.DumpStatistics("LoadGraphFromProtoFile");
@@ -487,7 +487,7 @@ TEST(GraphTransferer,
       false,  // is_text_proto
       true,   // shape_inference_for_unknown_shape
       false   // dry_run_for_unknown_shape
-      );
+  );
   ASSERT_TRUE(status.ok()) << status;
   prof.Stop();
   prof.DumpStatistics("LoadGraphFromProtoFile");
@@ -556,7 +556,7 @@ TEST(GraphTransferer, DISABLED_CheckShapeInferencePerformance) {
       false,  // is_text_proto
       false,  // shape_inference_for_unknown_shape
       true    // dry_run_for_unknown_shape
-      );
+  );
   const GraphTransferInfo& gfi0 = gt0.GetGraphTransferInfo();
 
   ASSERT_TRUE(status.ok());
@@ -576,7 +576,7 @@ TEST(GraphTransferer, DISABLED_CheckShapeInferencePerformance) {
       false,  // is_text_proto
       true,   // shape_inference_for_unknown_shape
       false   // dry_run_for_unknown_shape
-      );
+  );
   const GraphTransferInfo& gfi1 = gt1.GetGraphTransferInfo();
 
   ASSERT_TRUE(status.ok());
diff --git a/tensorflow/core/kernels/hinge-loss.h b/tensorflow/core/kernels/hinge-loss.h
index 789a7ce7a3d8ec9e5d918dd75fce8d644a3b5682..d303e9c877e7b7be05205003c26cf66ef8273416 100644
--- a/tensorflow/core/kernels/hinge-loss.h
+++ b/tensorflow/core/kernels/hinge-loss.h
@@ -50,9 +50,8 @@ class HingeLossUpdater : public DualLossUpdater {
     // valid value for new dual = 0
     // c. new optimal value > 1.0. Then new optimal value should be set to 1.0.
     const double candidate_optimal_dual =
-        current_dual +
-        (label - wx) /
-            (num_loss_partitions * example_weight * weighted_example_norm);
+        current_dual + (label - wx) / (num_loss_partitions * example_weight *
+                                       weighted_example_norm);
     if (label * candidate_optimal_dual < 0) {
       return 0.0;
     }
diff --git a/tensorflow/core/kernels/histogram_op_gpu.cu.cc b/tensorflow/core/kernels/histogram_op_gpu.cu.cc
index c2bb958be8b29c4a6df99cf5533748d7db73179c..a88e9b0ddcdda660cf34a88253ef7c8d1e28029c 100644
--- a/tensorflow/core/kernels/histogram_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/histogram_op_gpu.cu.cc
@@ -17,16 +17,16 @@ limitations under the License.
 
 #define EIGEN_USE_GPU
 
-#include "tensorflow/core/kernels/histogram_op.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "external/cub_archive/cub/device/device_histogram.cuh"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/kernels/histogram_op.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/util/cuda_kernel_helper.h"
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
 namespace tensorflow {
 
@@ -104,8 +104,8 @@ struct HistogramFixedWidthFunctor<GPUDevice, T, Tout> {
         /* num_samples */ num_samples,
         /* stream */ stream);
     if (err != cudaSuccess) {
-      return errors::Internal("Could not launch HistogramRange: ",
-                              cudaGetErrorString(err), ".");
+      return errors::Internal(
+          "Could not launch HistogramRange: ", cudaGetErrorString(err), ".");
     }
 
     return Status::OK();
diff --git a/tensorflow/core/kernels/image_resizer_state.h b/tensorflow/core/kernels/image_resizer_state.h
index f088315ff538e821666aa95d9a4c4ed49f7c0b59..faf997be05cccc366bcab618c99c8d39ff25e18b 100644
--- a/tensorflow/core/kernels/image_resizer_state.h
+++ b/tensorflow/core/kernels/image_resizer_state.h
@@ -109,8 +109,9 @@ struct ImageResizerState {
     ValidateAndCalculateOutputSize(context, input);
     if (!context->status().ok()) return;
     OP_REQUIRES_OK(context, context->allocate_output(
-                                0, TensorShape({input.dim_size(0), out_height,
-                                                out_width, input.dim_size(3)}),
+                                0,
+                                TensorShape({input.dim_size(0), out_height,
+                                             out_width, input.dim_size(3)}),
                                 &output));
   }
 
@@ -168,8 +169,9 @@ struct ImageResizerGradientState {
         CalculateResizeScale(original_width, resized_width, align_corners_);
     output = nullptr;
     OP_REQUIRES_OK(context, context->allocate_output(
-                                0, TensorShape({batch_size, original_height,
-                                                original_width, channels}),
+                                0,
+                                TensorShape({batch_size, original_height,
+                                             original_width, channels}),
                                 &output));
   }
 
diff --git a/tensorflow/core/kernels/in_topk_op.cc b/tensorflow/core/kernels/in_topk_op.cc
index e2861ae090ccd48c0408b83a7bc7c0230bf2c1a5..c37055239c28e0ab243ea30b05b2c8af0905766c 100644
--- a/tensorflow/core/kernels/in_topk_op.cc
+++ b/tensorflow/core/kernels/in_topk_op.cc
@@ -17,11 +17,11 @@ limitations under the License.
 
 #define EIGEN_USE_THREADS
 
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/kernels/bounds_check.h"
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
 namespace tensorflow {
 
@@ -98,36 +98,36 @@ class InTopK : public OpKernel {
   int k_;
 };
 
-REGISTER_KERNEL_BUILDER(
-    Name("InTopK").Device(DEVICE_CPU)
-    .HostMemory("predictions")
-    .HostMemory("targets")
-    .HostMemory("precision")
-    .TypeConstraint<int32>("T"),
-    InTopK<float, int32>);
-REGISTER_KERNEL_BUILDER(
-    Name("InTopK").Device(DEVICE_CPU)
-    .HostMemory("predictions")
-    .HostMemory("targets")
-    .HostMemory("precision")
-    .TypeConstraint<int64>("T"),
-    InTopK<float, int64>);
-
-REGISTER_KERNEL_BUILDER(
-    Name("InTopKV2").Device(DEVICE_CPU)
-    .HostMemory("predictions")
-    .HostMemory("targets")
-    .HostMemory("k")
-    .HostMemory("precision")
-    .TypeConstraint<int32>("T"),
-    InTopK<float, int32>);
-REGISTER_KERNEL_BUILDER(
-    Name("InTopKV2").Device(DEVICE_CPU)
-    .HostMemory("predictions")
-    .HostMemory("targets")
-    .HostMemory("k")
-    .HostMemory("precision")
-    .TypeConstraint<int64>("T"),
-    InTopK<float, int64>);
+REGISTER_KERNEL_BUILDER(Name("InTopK")
+                            .Device(DEVICE_CPU)
+                            .HostMemory("predictions")
+                            .HostMemory("targets")
+                            .HostMemory("precision")
+                            .TypeConstraint<int32>("T"),
+                        InTopK<float, int32>);
+REGISTER_KERNEL_BUILDER(Name("InTopK")
+                            .Device(DEVICE_CPU)
+                            .HostMemory("predictions")
+                            .HostMemory("targets")
+                            .HostMemory("precision")
+                            .TypeConstraint<int64>("T"),
+                        InTopK<float, int64>);
+
+REGISTER_KERNEL_BUILDER(Name("InTopKV2")
+                            .Device(DEVICE_CPU)
+                            .HostMemory("predictions")
+                            .HostMemory("targets")
+                            .HostMemory("k")
+                            .HostMemory("precision")
+                            .TypeConstraint<int32>("T"),
+                        InTopK<float, int32>);
+REGISTER_KERNEL_BUILDER(Name("InTopKV2")
+                            .Device(DEVICE_CPU)
+                            .HostMemory("predictions")
+                            .HostMemory("targets")
+                            .HostMemory("k")
+                            .HostMemory("precision")
+                            .TypeConstraint<int64>("T"),
+                        InTopK<float, int64>);
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/inplace_ops.cc b/tensorflow/core/kernels/inplace_ops.cc
index 7728ba850c94aa79feb31d137712692df0f89176..a71d047ed1a381bfc0311f86987f585f51b02536 100644
--- a/tensorflow/core/kernels/inplace_ops.cc
+++ b/tensorflow/core/kernels/inplace_ops.cc
@@ -27,13 +27,13 @@ namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SyclDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 namespace functor {
 
 template <typename Device, typename T>
-Status DoParallelConcatUpdate(const Device& d, const Tensor& value,
-                              int32 loc, Tensor* output) {
+Status DoParallelConcatUpdate(const Device& d, const Tensor& value, int32 loc,
+                              Tensor* output) {
   auto Tvalue = value.shaped<T, 2>({1, value.NumElements()});
   auto Toutput = output->flat_outer_dims<T>();
   auto nrows = Toutput.dimension(0);
@@ -74,7 +74,7 @@ Status DoParallelConcat(const SyclDevice& d, const Tensor& value, int32 loc,
       return errors::InvalidArgument("Unsupported data type: ", value.dtype());
   }
 }
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // end namespace functor
 
@@ -207,7 +207,7 @@ REGISTER_KERNEL_BUILDER(Name("_ParallelConcatUpdate")
                             .HostMemory("output")
                             .TypeConstraint<int32>("T"),
                         ParallelConcatUpdate<CPUDevice>);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 #if GOOGLE_CUDA
 
diff --git a/tensorflow/core/kernels/l2loss_op.cc b/tensorflow/core/kernels/l2loss_op.cc
index f8ed9351579ff8cbeeb5f45030e8ff278fa75101..f561287f7a142f4cbcf74225c3f2fde3986c169a 100644
--- a/tensorflow/core/kernels/l2loss_op.cc
+++ b/tensorflow/core/kernels/l2loss_op.cc
@@ -17,8 +17,8 @@ limitations under the License.
 
 #define EIGEN_USE_THREADS
 
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/kernels/l2loss_op.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
diff --git a/tensorflow/core/kernels/linalg_ops_common.cc b/tensorflow/core/kernels/linalg_ops_common.cc
index 36907fb5716fcde3b0efc28cc4edca543432c8f4..b58bcf583480cb50ee7a6be13465e6c6d301295b 100644
--- a/tensorflow/core/kernels/linalg_ops_common.cc
+++ b/tensorflow/core/kernels/linalg_ops_common.cc
@@ -108,7 +108,6 @@ void LinearAlgebraOp<Scalar>::Compute(OpKernelContext* context) {
   auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads());
   Shard(worker_threads.num_threads, worker_threads.workers,
         batch_shape.num_elements(), GetCostPerUnit(input_matrix_shapes), shard);
-
 }
 
 template <typename Scalar>
diff --git a/tensorflow/core/kernels/lmdb_reader_op.cc b/tensorflow/core/kernels/lmdb_reader_op.cc
index 31a427f2c90ad8a321d6004bf7ef85772d8e951f..2474fe4d564b37a7de36a85a6af3820e2bc4ac65 100755
--- a/tensorflow/core/kernels/lmdb_reader_op.cc
+++ b/tensorflow/core/kernels/lmdb_reader_op.cc
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/framework/reader_op_kernel.h"
 #include "tensorflow/core/framework/reader_base.h"
+#include "tensorflow/core/framework/reader_op_kernel.h"
 #include "tensorflow/core/lib/core/errors.h"
 
 #include <sys/stat.h>
@@ -26,9 +26,8 @@ namespace tensorflow {
 
 class LMDBReader : public ReaderBase {
  public:
-  LMDBReader(const string& node_name, Env* env)
+  LMDBReader(const string& node_name, Env* /*unused*/)
       : ReaderBase(strings::StrCat("LMDBReader '", node_name, "'")),
-        env_(env),
         mdb_env_(nullptr),
         mdb_dbi_(0),
         mdb_txn_(nullptr),
@@ -77,15 +76,13 @@ class LMDBReader : public ReaderBase {
         *at_end = true;
         return Status::OK();
       }
-    }
-    else {
+    } else {
       if (Seek(MDB_NEXT) == false) {
         *at_end = true;
         return Status::OK();
       }
     }
-    *key = string(static_cast<const char*>(mdb_key_.mv_data),
-                  mdb_key_.mv_size);
+    *key = string(static_cast<const char*>(mdb_key_.mv_data), mdb_key_.mv_size);
     *value = string(static_cast<const char*>(mdb_value_.mv_data),
                     mdb_value_.mv_size);
     *produced = true;
@@ -109,7 +106,6 @@ class LMDBReader : public ReaderBase {
     }
   }
 
-  Env* const env_;
   MDB_env* mdb_env_;
   MDB_dbi mdb_dbi_;
 
@@ -123,13 +119,10 @@ class LMDBReaderOp : public ReaderOpKernel {
   explicit LMDBReaderOp(OpKernelConstruction* context)
       : ReaderOpKernel(context) {
     Env* env = context->env();
-    SetReaderFactory([this, env]() {
-      return new LMDBReader(name(), env);
-    });
+    SetReaderFactory([this, env]() { return new LMDBReader(name(), env); });
   }
 };
 
-REGISTER_KERNEL_BUILDER(Name("LMDBReader").Device(DEVICE_CPU),
-                        LMDBReaderOp);
+REGISTER_KERNEL_BUILDER(Name("LMDBReader").Device(DEVICE_CPU), LMDBReaderOp);
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/logistic-loss.h b/tensorflow/core/kernels/logistic-loss.h
index 2765f42bbdc2d3bf3b9ec42f9f225166218fa9d0..6479e6f5dc3795451babd5675f1decc05b670251 100644
--- a/tensorflow/core/kernels/logistic-loss.h
+++ b/tensorflow/core/kernels/logistic-loss.h
@@ -122,10 +122,9 @@ class LogisticLossUpdater : public DualLossUpdater {
                              num_loss_partitions * weighted_example_norm *
                                  example_weight *
                                  (0.5 * (1 + tanhx) / label - current_dual);
-    const double denominator = -2 * label -
-                               num_loss_partitions * weighted_example_norm *
-                                   example_weight * (1 - tanhx * tanhx) * 0.5 /
-                                   label;
+    const double denominator =
+        -2 * label - num_loss_partitions * weighted_example_norm *
+                         example_weight * (1 - tanhx * tanhx) * 0.5 / label;
     return x - numerator / denominator;
   }
 };
diff --git a/tensorflow/core/kernels/loss_test.cc b/tensorflow/core/kernels/loss_test.cc
index 89f0677e1f5a7a0301c2d85700ee9954869c50bb..460d65c5c270c43aae4cb8b26b5258c7d4dd9a5f 100644
--- a/tensorflow/core/kernels/loss_test.cc
+++ b/tensorflow/core/kernels/loss_test.cc
@@ -32,14 +32,17 @@ namespace {
 
 TEST(LogisticLoss, ComputePrimalLoss) {
   LogisticLossUpdater loss_updater;
-  EXPECT_NEAR(0.693147, loss_updater.ComputePrimalLoss(
-                            0 /* wx */, 1 /* label */, 1 /* example weight */),
+  EXPECT_NEAR(0.693147,
+              loss_updater.ComputePrimalLoss(0 /* wx */, 1 /* label */,
+                                             1 /* example weight */),
               1e-3);
-  EXPECT_NEAR(0.0, loss_updater.ComputePrimalLoss(70 /* wx */, 1 /* label */,
-                                                  1 /* example weight */),
+  EXPECT_NEAR(0.0,
+              loss_updater.ComputePrimalLoss(70 /* wx */, 1 /* label */,
+                                             1 /* example weight */),
               1e-3);
-  EXPECT_NEAR(0.0, loss_updater.ComputePrimalLoss(-70 /* wx */, -1 /* label */,
-                                                  1 /* example weight */),
+  EXPECT_NEAR(0.0,
+              loss_updater.ComputePrimalLoss(-70 /* wx */, -1 /* label */,
+                                             1 /* example weight */),
               1e-3);
 }
 
@@ -53,31 +56,35 @@ TEST(LogisticLoss, ComputeDualLoss) {
               loss_updater.ComputeDualLoss(1 /* current dual */, 1 /* label */,
                                            1 /* example weight */),
               1e-3);
-  EXPECT_NEAR(-0.693147, loss_updater.ComputeDualLoss(0.5 /* current dual */,
-                                                      1 /* label */,
-                                                      1 /* example weight */),
-              1e-3);
+  EXPECT_NEAR(
+      -0.693147,
+      loss_updater.ComputeDualLoss(0.5 /* current dual */, 1 /* label */,
+                                   1 /* example weight */),
+      1e-3);
 }
 
 TEST(LogisticLoss, ComputeUpdatedDual) {
   LogisticLossUpdater loss_updater;
-  EXPECT_NEAR(0.479, loss_updater.ComputeUpdatedDual(
-                         1 /* num partitions */, 1.0 /* label */,
-                         1.0 /* example weight */, 0.5 /* current_dual */,
-                         0.3 /* wx */, 10.0 /* weighted_example_norm */),
+  EXPECT_NEAR(0.479,
+              loss_updater.ComputeUpdatedDual(
+                  1 /* num partitions */, 1.0 /* label */,
+                  1.0 /* example weight */, 0.5 /* current_dual */,
+                  0.3 /* wx */, 10.0 /* weighted_example_norm */),
               1e-3);
 
-  EXPECT_NEAR(-0.031, loss_updater.ComputeUpdatedDual(
-                          2 /* num partitions */, -1.0 /* label */,
-                          1.0 /* example weight */, 0.1 /* current_dual */,
-                          -0.8 /* wx */, 10.0 /* weighted_example_norm */),
+  EXPECT_NEAR(-0.031,
+              loss_updater.ComputeUpdatedDual(
+                  2 /* num partitions */, -1.0 /* label */,
+                  1.0 /* example weight */, 0.1 /* current_dual */,
+                  -0.8 /* wx */, 10.0 /* weighted_example_norm */),
               1e-3);
 }
 
 TEST(SquaredLoss, ComputePrimalLoss) {
   SquaredLossUpdater loss_updater;
-  EXPECT_NEAR(0.5, loss_updater.ComputePrimalLoss(0.0 /* wx */, 1.0 /* label */,
-                                                  1.0 /* example weight */),
+  EXPECT_NEAR(0.5,
+              loss_updater.ComputePrimalLoss(0.0 /* wx */, 1.0 /* label */,
+                                             1.0 /* example weight */),
               1e-3);
   EXPECT_NEAR(40.5,
               loss_updater.ComputePrimalLoss(10.0 /* wx */, 1.0 /* label */,
@@ -95,43 +102,50 @@ TEST(SquaredLoss, ComputePrimalLoss) {
 
 TEST(SquaredLoss, ComputeDualLoss) {
   SquaredLossUpdater loss_updater;
-  EXPECT_NEAR(0.0, loss_updater.ComputeDualLoss(0.0 /* current dual */,
-                                                -1.0 /* label */,
-                                                1.0 /* example weight */),
-              1e-3);
-  EXPECT_NEAR(0.66, loss_updater.ComputeDualLoss(0.2 /* current dual */,
-                                                 -1.0 /* label */,
-                                                 3.0 /* example weight */),
-              1e-3);
-  EXPECT_NEAR(-0.375, loss_updater.ComputeDualLoss(1.5 /* current dual */,
-                                                   1.0 /* label */,
-                                                   1.0 /* example weight */),
-              1e-3);
-  EXPECT_NEAR(-1.125, loss_updater.ComputeDualLoss(0.5 /* current dual */,
-                                                   1.0 /* label */,
-                                                   3.0 /* example weight */),
-              1e-3);
+  EXPECT_NEAR(
+      0.0,
+      loss_updater.ComputeDualLoss(0.0 /* current dual */, -1.0 /* label */,
+                                   1.0 /* example weight */),
+      1e-3);
+  EXPECT_NEAR(
+      0.66,
+      loss_updater.ComputeDualLoss(0.2 /* current dual */, -1.0 /* label */,
+                                   3.0 /* example weight */),
+      1e-3);
+  EXPECT_NEAR(
+      -0.375,
+      loss_updater.ComputeDualLoss(1.5 /* current dual */, 1.0 /* label */,
+                                   1.0 /* example weight */),
+      1e-3);
+  EXPECT_NEAR(
+      -1.125,
+      loss_updater.ComputeDualLoss(0.5 /* current dual */, 1.0 /* label */,
+                                   3.0 /* example weight */),
+      1e-3);
 }
 
 TEST(SquaredLoss, ComputeUpdatedDual) {
   SquaredLossUpdater loss_updater;
-  EXPECT_NEAR(0.336, loss_updater.ComputeUpdatedDual(
-                         1 /* num partitions */, 1.0 /* label */,
-                         1.0 /* example weight */, 0.3 /* current_dual */,
-                         0.3 /* wx */, 10.0 /* weighted_example_norm */),
+  EXPECT_NEAR(0.336,
+              loss_updater.ComputeUpdatedDual(
+                  1 /* num partitions */, 1.0 /* label */,
+                  1.0 /* example weight */, 0.3 /* current_dual */,
+                  0.3 /* wx */, 10.0 /* weighted_example_norm */),
               1e-3);
 
-  EXPECT_NEAR(-0.427, loss_updater.ComputeUpdatedDual(
-                          5 /* num partitions */, -1.0 /* label */,
-                          1.0 /* example weight */, -0.4 /* current_dual */,
-                          0.8 /* wx */, 10.0 /* weighted_example_norm */),
+  EXPECT_NEAR(-0.427,
+              loss_updater.ComputeUpdatedDual(
+                  5 /* num partitions */, -1.0 /* label */,
+                  1.0 /* example weight */, -0.4 /* current_dual */,
+                  0.8 /* wx */, 10.0 /* weighted_example_norm */),
               1e-3);
 }
 
 TEST(HingeLoss, ComputePrimalLoss) {
   HingeLossUpdater loss_updater;
-  EXPECT_NEAR(1.0, loss_updater.ComputePrimalLoss(0.0 /* wx */, 1.0 /* label */,
-                                                  1.0 /* example weight */),
+  EXPECT_NEAR(1.0,
+              loss_updater.ComputePrimalLoss(0.0 /* wx */, 1.0 /* label */,
+                                             1.0 /* example weight */),
               1e-3);
   EXPECT_NEAR(0.0,
               loss_updater.ComputePrimalLoss(10.0 /* wx */, 1.0 /* label */,
@@ -149,10 +163,11 @@ TEST(HingeLoss, ComputePrimalLoss) {
 
 TEST(HingeLoss, ComputeDualLoss) {
   HingeLossUpdater loss_updater;
-  EXPECT_NEAR(0.0, loss_updater.ComputeDualLoss(0.0 /* current dual */,
-                                                -1.0 /* label */,
-                                                1.0 /* example weight */),
-              1e-3);
+  EXPECT_NEAR(
+      0.0,
+      loss_updater.ComputeDualLoss(0.0 /* current dual */, -1.0 /* label */,
+                                   1.0 /* example weight */),
+      1e-3);
   EXPECT_NEAR(
       std::numeric_limits<double>::max(),
       loss_updater.ComputeDualLoss(0.2 /* current dual */, -1.0 /* label */,
@@ -163,10 +178,11 @@ TEST(HingeLoss, ComputeDualLoss) {
       loss_updater.ComputeDualLoss(1.5 /* current dual */, 1.0 /* label */,
                                    1.0 /* example weight */),
       1e-3);
-  EXPECT_NEAR(-1.5, loss_updater.ComputeDualLoss(0.5 /* current dual */,
-                                                 1.0 /* label */,
-                                                 3.0 /* example weight */),
-              1e-3);
+  EXPECT_NEAR(
+      -1.5,
+      loss_updater.ComputeDualLoss(0.5 /* current dual */, 1.0 /* label */,
+                                   3.0 /* example weight */),
+      1e-3);
 }
 
 TEST(HingeLoss, ConvertLabel) {
@@ -195,28 +211,31 @@ TEST(HingeLoss, ComputeUpdatedDual) {
   // weighted_example_norm=100.0, it turns out that the optimal value to update
   // the dual to is 0.507 which is within the permitted range and thus should be
   // the value returned.
-  EXPECT_NEAR(0.507, loss_updater.ComputeUpdatedDual(
-                         1 /* num partitions */, 1.0 /* label */,
-                         1.0 /* example weight */, 0.5 /* current_dual */,
-                         0.3 /* wx */, 100.0 /* weighted_example_norm */),
+  EXPECT_NEAR(0.507,
+              loss_updater.ComputeUpdatedDual(
+                  1 /* num partitions */, 1.0 /* label */,
+                  1.0 /* example weight */, 0.5 /* current_dual */,
+                  0.3 /* wx */, 100.0 /* weighted_example_norm */),
               1e-3);
   // When label=-1.0, example_weight=1.0, current_dual=0.4, wx=0.6,
   // weighted_example_norm=10.0 and num_loss_partitions=10, it turns out that
   // the optimal value to update the dual to is 0.384 which is within the
   // permitted range and thus should be the value returned.
-  EXPECT_NEAR(-0.416, loss_updater.ComputeUpdatedDual(
-                          10 /* num partitions */, -1.0 /* label */,
-                          1.0 /* example weight */, -0.4 /* current_dual */,
-                          0.6 /* wx */, 10.0 /* weighted_example_norm */),
+  EXPECT_NEAR(-0.416,
+              loss_updater.ComputeUpdatedDual(
+                  10 /* num partitions */, -1.0 /* label */,
+                  1.0 /* example weight */, -0.4 /* current_dual */,
+                  0.6 /* wx */, 10.0 /* weighted_example_norm */),
               1e-3);
   // When label=1.0, example_weight=1.0, current_dual=-0.5, wx=0.3 and
   // weighted_example_norm=10.0, it turns out that the optimal value to update
   // the dual to is -0.43. However, this is outside the allowed [0.0, 1.0] range
   // and hence the closest permitted value (0.0) should be returned instead.
-  EXPECT_NEAR(0.0, loss_updater.ComputeUpdatedDual(
-                       1 /* num partitions */, 1.0 /* label */,
-                       1.0 /* example weight */, -0.5 /* current_dual */,
-                       0.3 /* wx */, 10.0 /* weighted_example_norm */),
+  EXPECT_NEAR(0.0,
+              loss_updater.ComputeUpdatedDual(
+                  1 /* num partitions */, 1.0 /* label */,
+                  1.0 /* example weight */, -0.5 /* current_dual */,
+                  0.3 /* wx */, 10.0 /* weighted_example_norm */),
               1e-3);
 
   // When label=-1.0, example_weight=2.0, current_dual=-1.0, wx=0.3 and
@@ -224,17 +243,19 @@ TEST(HingeLoss, ComputeUpdatedDual) {
   // the dual to is -1.065. However, this is outside the allowed [-1.0, 0.0]
   // range and hence the closest permitted value (-1.0) should be returned
   // instead.
-  EXPECT_NEAR(-1.0, loss_updater.ComputeUpdatedDual(
-                        1 /* num partitions */, -1.0 /* label */,
-                        2.0 /* example weight */, -1.0 /* current_dual */,
-                        0.3 /* wx */, 10.0 /* weighted_example_norm */),
+  EXPECT_NEAR(-1.0,
+              loss_updater.ComputeUpdatedDual(
+                  1 /* num partitions */, -1.0 /* label */,
+                  2.0 /* example weight */, -1.0 /* current_dual */,
+                  0.3 /* wx */, 10.0 /* weighted_example_norm */),
               1e-3);
 }
 
 TEST(SmoothHingeLoss, ComputePrimalLoss) {
   SmoothHingeLossUpdater loss_updater;
-  EXPECT_NEAR(0.5, loss_updater.ComputePrimalLoss(0.0 /* wx */, 1.0 /* label */,
-                                                  1.0 /* example weight */),
+  EXPECT_NEAR(0.5,
+              loss_updater.ComputePrimalLoss(0.0 /* wx */, 1.0 /* label */,
+                                             1.0 /* example weight */),
               1e-3);
   EXPECT_NEAR(0.0,
               loss_updater.ComputePrimalLoss(10.0 /* wx */, 1.0 /* label */,
@@ -252,10 +273,11 @@ TEST(SmoothHingeLoss, ComputePrimalLoss) {
 
 TEST(SmoothHingeLoss, ComputeDualLoss) {
   SmoothHingeLossUpdater loss_updater;
-  EXPECT_NEAR(0.0, loss_updater.ComputeDualLoss(0.0 /* current dual */,
-                                                -1.0 /* label */,
-                                                1.0 /* example weight */),
-              1e-3);
+  EXPECT_NEAR(
+      0.0,
+      loss_updater.ComputeDualLoss(0.0 /* current dual */, -1.0 /* label */,
+                                   1.0 /* example weight */),
+      1e-3);
   EXPECT_NEAR(
       std::numeric_limits<double>::max(),
       loss_updater.ComputeDualLoss(0.2 /* current dual */, -1.0 /* label */,
@@ -266,24 +288,27 @@ TEST(SmoothHingeLoss, ComputeDualLoss) {
       loss_updater.ComputeDualLoss(1.5 /* current dual */, 1.0 /* label */,
                                    1.0 /* example weight */),
       1e-3);
-  EXPECT_NEAR(-1.125, loss_updater.ComputeDualLoss(0.5 /* current dual */,
-                                                   1.0 /* label */,
-                                                   3.0 /* example weight */),
-              1e-3);
+  EXPECT_NEAR(
+      -1.125,
+      loss_updater.ComputeDualLoss(0.5 /* current dual */, 1.0 /* label */,
+                                   3.0 /* example weight */),
+      1e-3);
 }
 
 TEST(SmoothHingeLoss, ComputeUpdatedDual) {
   SmoothHingeLossUpdater loss_updater;
-  EXPECT_NEAR(0.336, loss_updater.ComputeUpdatedDual(
-                         1 /* num partitions */, 1.0 /* label */,
-                         1.0 /* example weight */, 0.3 /* current_dual */,
-                         0.3 /* wx */, 10.0 /* weighted_example_norm */),
+  EXPECT_NEAR(0.336,
+              loss_updater.ComputeUpdatedDual(
+                  1 /* num partitions */, 1.0 /* label */,
+                  1.0 /* example weight */, 0.3 /* current_dual */,
+                  0.3 /* wx */, 10.0 /* weighted_example_norm */),
               1e-3);
 
-  EXPECT_NEAR(-0.427, loss_updater.ComputeUpdatedDual(
-                          5 /* num partitions */, -1.0 /* label */,
-                          1.0 /* example weight */, -0.4 /* current_dual */,
-                          0.8 /* wx */, 10.0 /* weighted_example_norm */),
+  EXPECT_NEAR(-0.427,
+              loss_updater.ComputeUpdatedDual(
+                  5 /* num partitions */, -1.0 /* label */,
+                  1.0 /* example weight */, -0.4 /* current_dual */,
+                  0.8 /* wx */, 10.0 /* weighted_example_norm */),
               1e-3);
 }
 
diff --git a/tensorflow/core/kernels/lrn_op.cc b/tensorflow/core/kernels/lrn_op.cc
index c905ebc84a6e9251a5e30be19b086d3fae215cad..c3a59c95762ad03f217768a9b14e31d6f501d789 100644
--- a/tensorflow/core/kernels/lrn_op.cc
+++ b/tensorflow/core/kernels/lrn_op.cc
@@ -229,10 +229,11 @@ class LRNOp : public OpKernel {
   explicit LRNOp(OpKernelConstruction* context) : OpKernel(context) {
     int64 depth_radius64;
     OP_REQUIRES_OK(context, context->GetAttr("depth_radius", &depth_radius64));
-    OP_REQUIRES(context, FastBoundsCheck(depth_radius64,
-                                         std::numeric_limits<int>::max()),
-                errors::InvalidArgument("depth_radius = ", depth_radius64,
-                                        " larger than int max"));
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(depth_radius64, std::numeric_limits<int>::max()),
+        errors::InvalidArgument("depth_radius = ", depth_radius64,
+                                " larger than int max"));
     depth_radius_ = static_cast<int>(depth_radius64);
     float tmp;
     OP_REQUIRES_OK(context, context->GetAttr("bias", &tmp));
@@ -247,9 +248,10 @@ class LRNOp : public OpKernel {
     const Tensor& in = context->input(0);
     OP_REQUIRES(context, in.dims() == 4,
                 errors::InvalidArgument("in must be 4-dimensional"));
-    OP_REQUIRES(context, FastBoundsCheck(in.NumElements(),
-                                         std::numeric_limits<int>::max()),
-                errors::InvalidArgument("argument to LRN too large"));
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(in.NumElements(), std::numeric_limits<int>::max()),
+        errors::InvalidArgument("argument to LRN too large"));
     // Cast to platform-specific int to avoid conversion warnings.
     const int batch = static_cast<int>(in.dim_size(0));
     const int rows = static_cast<int>(in.dim_size(1));
@@ -448,10 +450,11 @@ class LRNGradOp : public OpKernel {
   explicit LRNGradOp(OpKernelConstruction* context) : OpKernel(context) {
     int64 depth_radius64;
     OP_REQUIRES_OK(context, context->GetAttr("depth_radius", &depth_radius64));
-    OP_REQUIRES(context, FastBoundsCheck(depth_radius64,
-                                         std::numeric_limits<int>::max()),
-                errors::InvalidArgument("depth_radius = ", depth_radius64,
-                                        " larger than int max"));
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(depth_radius64, std::numeric_limits<int>::max()),
+        errors::InvalidArgument("depth_radius = ", depth_radius64,
+                                " larger than int max"));
     depth_radius_ = static_cast<int>(depth_radius64);
     float tmp;
     OP_REQUIRES_OK(context, context->GetAttr("bias", &tmp));
diff --git a/tensorflow/core/kernels/matching_files_op.cc b/tensorflow/core/kernels/matching_files_op.cc
index 5eb060f6641d1565417dd074a95bf72e2a81e472..cdff7bad5fe222b6f0824a742caa0a4e5d939f71 100644
--- a/tensorflow/core/kernels/matching_files_op.cc
+++ b/tensorflow/core/kernels/matching_files_op.cc
@@ -45,15 +45,14 @@ class MatchingFilesOp : public OpKernel {
     int num_files = 0;
     std::vector<std::vector<string>> all_fnames(num_patterns);
     for (int i = 0; i < num_patterns; i++) {
-      OP_REQUIRES_OK(
-          context,
-          context->env()->GetMatchingPaths(patterns(i), &all_fnames[i]));
+      OP_REQUIRES_OK(context, context->env()->GetMatchingPaths(patterns(i),
+                                                               &all_fnames[i]));
       num_files += all_fnames[i].size();
     }
     Tensor* output_t = nullptr;
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(
-                       "filenames", TensorShape({num_files}), &output_t));
+    OP_REQUIRES_OK(
+        context, context->allocate_output("filenames", TensorShape({num_files}),
+                                          &output_t));
     auto output = output_t->vec<string>();
     int index = 0;
     for (int i = 0; i < num_patterns; ++i) {
diff --git a/tensorflow/core/kernels/matmul_op.cc b/tensorflow/core/kernels/matmul_op.cc
index cb68690f2847709fe6ff38f3eecd974613856dcf..f499ce6519d097c7fea05e8175d08d102880f7fd 100644
--- a/tensorflow/core/kernels/matmul_op.cc
+++ b/tensorflow/core/kernels/matmul_op.cc
@@ -261,12 +261,12 @@ struct LaunchMatMul<GPUDevice, T, true /* USE_CUBLAS */> {
       std::vector<int64>* algorithms, bool use_autotune, Tensor* out) {
     using perftools::gputools::blas::AlgorithmConfig;
     using perftools::gputools::blas::ComputationType;
-    using perftools::gputools::blas::ProfileResult;
-    using perftools::gputools::blas::Transpose;
     using perftools::gputools::blas::kDefaultAlgorithm;
     using perftools::gputools::blas::kDefaultBlasGemm;
     using perftools::gputools::blas::kDefaultBlasGemv;
     using perftools::gputools::blas::kNoAlgorithm;
+    using perftools::gputools::blas::ProfileResult;
+    using perftools::gputools::blas::Transpose;
     Transpose trans[] = {Transpose::kNoTranspose, Transpose::kTranspose};
     const uint64 m = a.dim_size(1 - dim_pair[0].first);
     const uint64 k = a.dim_size(dim_pair[0].first);
diff --git a/tensorflow/core/kernels/matmul_op.h b/tensorflow/core/kernels/matmul_op.h
index 6398da2fb959b0bded9afad8c92be923e44c755c..628895ca86f9c86c5bda987dcade9a4a7af753d8 100644
--- a/tensorflow/core/kernels/matmul_op.h
+++ b/tensorflow/core/kernels/matmul_op.h
@@ -30,7 +30,8 @@ struct MatMulTypes {
   typedef Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned>
       out_type;
   typedef Eigen::TensorMap<Eigen::Tensor<const T, 2, Eigen::RowMajor>,
-                           Eigen::Aligned> in_type;
+                           Eigen::Aligned>
+      in_type;
 };
 
 template <typename Device, typename In0, typename In1, typename Out,
diff --git a/tensorflow/core/kernels/matrix_band_part_op.cc b/tensorflow/core/kernels/matrix_band_part_op.cc
index d7fff4bb0c2b03bdfa2845f3ff89d938e07466e1..1439141f6493943c94516e6f0f9c05e8314401d5 100644
--- a/tensorflow/core/kernels/matrix_band_part_op.cc
+++ b/tensorflow/core/kernels/matrix_band_part_op.cc
@@ -62,7 +62,15 @@ class MatrixBandPartOp : public OpKernel {
     OP_REQUIRES(context, TensorShapeUtils::IsScalar(num_lower_in.shape()),
                 errors::InvalidArgument("num_lower must be scalar, got shape ",
                                         num_lower_in.shape().DebugString()));
-    const int64 num_lower = num_lower_in.scalar<int64>()();
+
+    auto as_int64_scalar = [](const Tensor& tensor) -> int64 {
+      if (tensor.dtype() == DT_INT32) {
+        return tensor.scalar<int32>()();
+      } else {
+        return tensor.scalar<int64>()();
+      }
+    };
+    const int64 num_lower = as_int64_scalar(num_lower_in);
     OP_REQUIRES(
         context, num_lower <= input_reshaped.dimension(1),
         errors::InvalidArgument(
@@ -73,7 +81,7 @@ class MatrixBandPartOp : public OpKernel {
     OP_REQUIRES(context, TensorShapeUtils::IsScalar(num_upper_in.shape()),
                 errors::InvalidArgument("num_upper must be scalar, got shape ",
                                         num_upper_in.shape().DebugString()));
-    const int64 num_upper = num_upper_in.scalar<int64>()();
+    const int64 num_upper = as_int64_scalar(num_upper_in);
     OP_REQUIRES(context, num_upper <= input_reshaped.dimension(2),
                 errors::InvalidArgument("num_upper must be negative or less or "
                                         "equal to number of columns (",
diff --git a/tensorflow/core/kernels/matrix_exponential_op.cc b/tensorflow/core/kernels/matrix_exponential_op.cc
index 4cc3f32f7e4a727fa2d9ec3c21a3750111f46392..99db898301378f7ad55f75b3a403a09a5f59bb3b 100644
--- a/tensorflow/core/kernels/matrix_exponential_op.cc
+++ b/tensorflow/core/kernels/matrix_exponential_op.cc
@@ -26,7 +26,6 @@ limitations under the License.
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
 
-
 namespace tensorflow {
 
 template <class Scalar>
@@ -40,7 +39,8 @@ class MatrixExponentialOp : public LinearAlgebraOp<Scalar> {
                      MatrixMaps* outputs) final {
     const ConstMatrixMap& input = inputs[0];
     if (input.rows() == 0) return;
-    using Matrix = Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
+    using Matrix =
+        Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
     Matrix tmp = input;
     outputs->at(0) = tmp.exp();
   }
@@ -51,9 +51,9 @@ class MatrixExponentialOp : public LinearAlgebraOp<Scalar> {
 
 REGISTER_LINALG_OP("MatrixExponential", (MatrixExponentialOp<float>), float);
 REGISTER_LINALG_OP("MatrixExponential", (MatrixExponentialOp<double>), double);
-REGISTER_LINALG_OP("MatrixExponential",
-                   (MatrixExponentialOp<complex64>), complex64);
-REGISTER_LINALG_OP("MatrixExponential",
-                   (MatrixExponentialOp<complex128>), complex128);
+REGISTER_LINALG_OP("MatrixExponential", (MatrixExponentialOp<complex64>),
+                   complex64);
+REGISTER_LINALG_OP("MatrixExponential", (MatrixExponentialOp<complex128>),
+                   complex128);
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/matrix_logarithm_op.cc b/tensorflow/core/kernels/matrix_logarithm_op.cc
index cf0007b5b6776d0c8a297067f3a49ca21a132ac0..22ca094e2432723a49afab8a255339fc8ac2512e 100644
--- a/tensorflow/core/kernels/matrix_logarithm_op.cc
+++ b/tensorflow/core/kernels/matrix_logarithm_op.cc
@@ -26,7 +26,6 @@ limitations under the License.
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
 
-
 namespace tensorflow {
 
 template <class Scalar>
@@ -40,7 +39,8 @@ class MatrixLogarithmOp : public LinearAlgebraOp<Scalar> {
                      MatrixMaps* outputs) final {
     const ConstMatrixMap& input = inputs[0];
     if (input.rows() == 0) return;
-    using Matrix = Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
+    using Matrix =
+        Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
     Matrix tmp = input;
     outputs->at(0) = tmp.log();
   }
@@ -53,9 +53,9 @@ class MatrixLogarithmOp : public LinearAlgebraOp<Scalar> {
 // logarithm. If all eigenvalues are positive, then this returns the correct
 // logarithm, however checking for positive definiteness adds significant
 // overhead. Therefore at present we only register this Op for complex types.
-REGISTER_LINALG_OP("MatrixLogarithm",
-                   (MatrixLogarithmOp<complex64>), complex64);
-REGISTER_LINALG_OP("MatrixLogarithm",
-                   (MatrixLogarithmOp<complex128>), complex128);
+REGISTER_LINALG_OP("MatrixLogarithm", (MatrixLogarithmOp<complex64>),
+                   complex64);
+REGISTER_LINALG_OP("MatrixLogarithm", (MatrixLogarithmOp<complex128>),
+                   complex128);
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/matrix_set_diag_op.cc b/tensorflow/core/kernels/matrix_set_diag_op.cc
index 9dd665392bc33e1559d46d0e7be2277e8c22a20a..502d593474e06cc495854706a1d4d90014ea8f96 100644
--- a/tensorflow/core/kernels/matrix_set_diag_op.cc
+++ b/tensorflow/core/kernels/matrix_set_diag_op.cc
@@ -69,8 +69,8 @@ class MatrixSetDiagOp : public OpKernel {
                 errors::InvalidArgument(
                     "must have diagonal.shape == input.shape[:-2] + "
                     "min(input.shape[-2:]), but received input shape: ",
-                    input_shape.DebugString(), " and diagonal shape: ",
-                    diag_shape.DebugString()));
+                    input_shape.DebugString(),
+                    " and diagonal shape: ", diag_shape.DebugString()));
 
     if (input.NumElements() == 0) {
       // This is a no-op.
diff --git a/tensorflow/core/kernels/maxpooling_op.cc b/tensorflow/core/kernels/maxpooling_op.cc
index 2eefadad4949fd8d78f6a27533ce0385c38d9c69..9be7408012bb81e80c73c29a6ee9bb6763c04490 100644
--- a/tensorflow/core/kernels/maxpooling_op.cc
+++ b/tensorflow/core/kernels/maxpooling_op.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/maxpooling_op.h"
 
 #include <vector>
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -37,7 +38,6 @@ limitations under the License.
 #include "tensorflow/core/util/padding.h"
 #include "tensorflow/core/util/tensor_format.h"
 #include "tensorflow/core/util/use_cudnn.h"
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
 #if GOOGLE_CUDA
 #include "tensorflow/core/kernels/maxpooling_op_gpu.h"
@@ -89,7 +89,6 @@ static void SpatialMaxPoolWithArgMaxHelper(
   //    max value.
   auto shard = [&params, &in_mat, &out_mat, &out_arg_max_mat, &input_backprop,
                 &output_arg_max, &out_backprop](int64 start, int64 limit) {
-
     const int32 depth = params.depth;
     const int32 in_rows = params.tensor_in_rows;
     const int32 in_cols = params.tensor_in_cols;
@@ -180,7 +179,6 @@ static void SpatialMaxPoolWithArgMaxHelper(
         input_backprop_flat(input_backprop_index) += out_backprop_flat(index);
       }
     }
-
   };
 
   const int64 shard_cost = params.tensor_in_rows * params.tensor_in_cols *
@@ -567,7 +565,7 @@ class MaxPoolingGradGradOp : public OpKernel {
     //    tensor_out_as_matrix with the corresponding values in
     //    top_diff_as_matrix.
     auto shard = [&params, &in_mat, &out_mat, &top_diff_mat, &bottom_diff_mat](
-        int64 start, int64 limit) {
+                     int64 start, int64 limit) {
       const int32 depth = params.depth;
       const int32 in_rows = params.tensor_in_rows;
       const int32 in_cols = params.tensor_in_cols;
diff --git a/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc b/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc
index f8daaca4c94aada5dbae5e5582f0da075b7222d5..0c7a236b2ff0f0b5c6287d1dffb1e8ef9bac7cc0 100644
--- a/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc
@@ -450,10 +450,10 @@ bool MaxPoolBackwardWithArgmax<T>::operator()(
     T* bottom_diff, const Eigen::GpuDevice& d) {
   const int kThreadsPerBlock = 1024;
   SetZero<<<(input_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
-    kThreadsPerBlock, 0, d.stream()>>>(input_size, bottom_diff);
+            kThreadsPerBlock, 0, d.stream()>>>(input_size, bottom_diff);
   MaxPoolBackward<<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
                     kThreadsPerBlock, 0, d.stream()>>>(
-                                        output_size, top_diff, mask, top_offset, bottom_offset, bottom_diff);
+      output_size, top_diff, mask, top_offset, bottom_offset, bottom_diff);
   return d.ok();
 }
 
diff --git a/tensorflow/core/kernels/meta_support.cc b/tensorflow/core/kernels/meta_support.cc
index 9fed01189fc3bfde4ad1e23ea8fda0c76311b3bc..39e60c9fcef174a4f9e2271600ed847f4e769625 100644
--- a/tensorflow/core/kernels/meta_support.cc
+++ b/tensorflow/core/kernels/meta_support.cc
@@ -98,9 +98,9 @@ typedef gemmlowp::meta::SimpleContext<gemmlowp::WorkersPool> LocalContext;
 template <typename Context, typename Params>
 void MultiThreadGemm(Context* context, const Params& params) {
   if (params.m <= 4) {
-      gemmlowp::meta::MultiThreadGemm<
-          Context, gemmlowp::meta::GemmExecutorPackLHSCacheFriendly<>, Params,
-          1, 8, 8>(context, params);
+    gemmlowp::meta::MultiThreadGemm<
+        Context, gemmlowp::meta::GemmExecutorPackLHSCacheFriendly<>, Params, 1,
+        8, 8>(context, params);
   } else {
     if (params.m >= params.n) {
       gemmlowp::meta::MultiThreadGemm<
diff --git a/tensorflow/core/kernels/mfcc.cc b/tensorflow/core/kernels/mfcc.cc
index 2793005aa2678b4017dc7a562b8362470e43b8ed..8c755e0df87546ab5f85c3ac5ce2d895d020de78 100644
--- a/tensorflow/core/kernels/mfcc.cc
+++ b/tensorflow/core/kernels/mfcc.cc
@@ -27,21 +27,19 @@ const double kFilterbankFloor = 1e-12;
 const int kDefaultFilterbankChannelCount = 40;
 const int kDefaultDCTCoefficientCount = 13;
 
-Mfcc::Mfcc() : initialized_(false),
-               lower_frequency_limit_(kDefaultLowerFrequencyLimit),
-               upper_frequency_limit_(kDefaultUpperFrequencyLimit),
-               filterbank_channel_count_(kDefaultFilterbankChannelCount),
-               dct_coefficient_count_(kDefaultDCTCoefficientCount) { }
+Mfcc::Mfcc()
+    : initialized_(false),
+      lower_frequency_limit_(kDefaultLowerFrequencyLimit),
+      upper_frequency_limit_(kDefaultUpperFrequencyLimit),
+      filterbank_channel_count_(kDefaultFilterbankChannelCount),
+      dct_coefficient_count_(kDefaultDCTCoefficientCount) {}
 
-bool Mfcc::Initialize(int input_length,
-                      double input_sample_rate) {
-  bool initialized = mel_filterbank_.Initialize(input_length,
-                                                input_sample_rate,
-                                                filterbank_channel_count_,
-                                                lower_frequency_limit_,
-                                                upper_frequency_limit_);
-  initialized &= dct_.Initialize(filterbank_channel_count_,
-                                 dct_coefficient_count_);
+bool Mfcc::Initialize(int input_length, double input_sample_rate) {
+  bool initialized = mel_filterbank_.Initialize(
+      input_length, input_sample_rate, filterbank_channel_count_,
+      lower_frequency_limit_, upper_frequency_limit_);
+  initialized &=
+      dct_.Initialize(filterbank_channel_count_, dct_coefficient_count_);
   initialized_ = initialized;
   return initialized;
 }
diff --git a/tensorflow/core/kernels/mfcc.h b/tensorflow/core/kernels/mfcc.h
index 8268f4720348bbc820bd3f8863698d34999abb7b..8eee76f7f0cadad45cb223ab9fbb990e4c365a44 100644
--- a/tensorflow/core/kernels/mfcc.h
+++ b/tensorflow/core/kernels/mfcc.h
@@ -20,18 +20,17 @@ limitations under the License.
 
 #include <vector>
 
+#include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/kernels/mfcc_dct.h"
 #include "tensorflow/core/kernels/mfcc_mel_filterbank.h"
 #include "tensorflow/core/platform/logging.h"
-#include "tensorflow/core/framework/op_kernel.h"
 
 namespace tensorflow {
 
 class Mfcc {
  public:
   Mfcc();
-  bool Initialize(int input_length,
-                  double input_sample_rate);
+  bool Initialize(int input_length, double input_sample_rate);
 
   // Input is a single squared-magnitude spectrogram frame. The input spectrum
   // is converted to linear magnitude and weighted into bands using a
diff --git a/tensorflow/core/kernels/mfcc_mel_filterbank.cc b/tensorflow/core/kernels/mfcc_mel_filterbank.cc
index 630de8a5a3362b77306ac76b70bbb63416d561d0..3db3b51e8b665f6e28ccb2bf8f3850785c7561fb 100644
--- a/tensorflow/core/kernels/mfcc_mel_filterbank.cc
+++ b/tensorflow/core/kernels/mfcc_mel_filterbank.cc
@@ -38,13 +38,12 @@ namespace tensorflow {
 
 MfccMelFilterbank::MfccMelFilterbank() : initialized_(false) {}
 
-bool MfccMelFilterbank::Initialize(int input_length,
-                               double input_sample_rate,
-                               int output_channel_count,
-                               double lower_frequency_limit,
-                               double upper_frequency_limit) {
+bool MfccMelFilterbank::Initialize(int input_length, double input_sample_rate,
+                                   int output_channel_count,
+                                   double lower_frequency_limit,
+                                   double upper_frequency_limit) {
   num_channels_ = output_channel_count;
-  sample_rate_  = input_sample_rate;
+  sample_rate_ = input_sample_rate;
   input_length_ = input_length;
 
   if (num_channels_ < 1) {
@@ -85,10 +84,9 @@ bool MfccMelFilterbank::Initialize(int input_length,
   }
 
   // Always exclude DC; emulate HTK.
-  const double hz_per_sbin = 0.5 * sample_rate_ /
-      static_cast<double>(input_length_ - 1);
-  start_index_ = static_cast<int>(1.5 + (lower_frequency_limit /
-                                           hz_per_sbin));
+  const double hz_per_sbin =
+      0.5 * sample_rate_ / static_cast<double>(input_length_ - 1);
+  start_index_ = static_cast<int>(1.5 + (lower_frequency_limit / hz_per_sbin));
   end_index_ = static_cast<int>(upper_frequency_limit / hz_per_sbin);
 
   // Maps the input spectrum bin indices to filter bank channels/indices. For
@@ -121,12 +119,12 @@ bool MfccMelFilterbank::Initialize(int input_length,
       weights_[i] = 0.0;
     } else {
       if (channel >= 0) {
-        weights_[i] = (center_frequencies_[channel + 1] -
-                       FreqToMel(i * hz_per_sbin)) /
+        weights_[i] =
+            (center_frequencies_[channel + 1] - FreqToMel(i * hz_per_sbin)) /
             (center_frequencies_[channel + 1] - center_frequencies_[channel]);
       } else {
         weights_[i] = (center_frequencies_[0] - FreqToMel(i * hz_per_sbin)) /
-            (center_frequencies_[0] - mel_low);
+                      (center_frequencies_[0] - mel_low);
       }
     }
   }
@@ -152,16 +150,16 @@ bool MfccMelFilterbank::Initialize(int input_length,
     }
   }
   if (!bad_channels.empty()) {
-    LOG(ERROR) << "Missing " << bad_channels.size() << " bands " <<
-        " starting at " << bad_channels[0] <<
-        " in mel-frequency design. " <<
-        "Perhaps too many channels or " <<
-        "not enough frequency resolution in spectrum. (" <<
-        "input_length: " << input_length <<
-        " input_sample_rate: " << input_sample_rate <<
-        " output_channel_count: " << output_channel_count <<
-        " lower_frequency_limit: " << lower_frequency_limit <<
-        " upper_frequency_limit: " << upper_frequency_limit;
+    LOG(ERROR) << "Missing " << bad_channels.size() << " bands "
+               << " starting at " << bad_channels[0]
+               << " in mel-frequency design. "
+               << "Perhaps too many channels or "
+               << "not enough frequency resolution in spectrum. ("
+               << "input_length: " << input_length
+               << " input_sample_rate: " << input_sample_rate
+               << " output_channel_count: " << output_channel_count
+               << " lower_frequency_limit: " << lower_frequency_limit
+               << " upper_frequency_limit: " << upper_frequency_limit;
   }
   initialized_ = true;
   return true;
@@ -171,7 +169,7 @@ bool MfccMelFilterbank::Initialize(int input_length,
 // square root, then summing FFT magnitudes under triangular integration windows
 // whose widths increase with frequency.
 void MfccMelFilterbank::Compute(const std::vector<double> &input,
-                            std::vector<double> *output) const {
+                                std::vector<double> *output) const {
   if (!initialized_) {
     LOG(ERROR) << "Mel Filterbank not initialized.";
     return;
diff --git a/tensorflow/core/kernels/mfcc_mel_filterbank.h b/tensorflow/core/kernels/mfcc_mel_filterbank.h
index 1bdc2dc93b80a2691d4adec219426b142ef24321..37c3936e80d893a3c12b153ea92749ec4b73f872 100644
--- a/tensorflow/core/kernels/mfcc_mel_filterbank.h
+++ b/tensorflow/core/kernels/mfcc_mel_filterbank.h
@@ -27,10 +27,8 @@ class MfccMelFilterbank {
  public:
   MfccMelFilterbank();
   bool Initialize(int input_length,  // Number of unique FFT bins fftsize/2+1.
-                  double input_sample_rate,
-                  int output_channel_count,
-                  double lower_frequency_limit,
-                  double upper_frequency_limit);
+                  double input_sample_rate, int output_channel_count,
+                  double lower_frequency_limit, double upper_frequency_limit);
 
   // Takes a squared-magnitude spectrogram slice as input, computes a
   // triangular-mel-weighted linear-magnitude filterbank, and places the result
@@ -56,7 +54,7 @@ class MfccMelFilterbank {
   // FFT bin i contributes to the upper side of mel channel band_mapper_[i]
   std::vector<int> band_mapper_;
   int start_index_;  // Lowest FFT bin used to calculate mel spectrum.
-  int end_index_;  // Highest FFT bin used to calculate mel spectrum.
+  int end_index_;    // Highest FFT bin used to calculate mel spectrum.
 
   TF_DISALLOW_COPY_AND_ASSIGN(MfccMelFilterbank);
 };
diff --git a/tensorflow/core/kernels/mfcc_mel_filterbank_test.cc b/tensorflow/core/kernels/mfcc_mel_filterbank_test.cc
index 602dfeb4e5400143a10232219f02c8e5d8154a04..54f31e1699ef1843d942f952f540b2d657b2d063 100644
--- a/tensorflow/core/kernels/mfcc_mel_filterbank_test.cc
+++ b/tensorflow/core/kernels/mfcc_mel_filterbank_test.cc
@@ -34,11 +34,9 @@ TEST(MfccMelFilterbankTest, AgreesWithPythonGoldenValues) {
     input.push_back(i + 1);
   }
   const int kChannelCount = 20;
-  filterbank.Initialize(input.size(),
-                        22050 /* sample rate */,
-                        kChannelCount /* channels */,
-                        20.0 /*  lower frequency limit */,
-                        4000.0 /* upper frequency limit */);
+  filterbank.Initialize(
+      input.size(), 22050 /* sample rate */, kChannelCount /* channels */,
+      20.0 /*  lower frequency limit */, 4000.0 /* upper frequency limit */);
 
   std::vector<double> output;
   filterbank.Compute(input, &output);
@@ -65,13 +63,10 @@ TEST(MfccMelFilterbankTest, IgnoresExistingContentOfOutputVector) {
   std::vector<double> input;
   std::vector<double> output;
 
-  filterbank.Initialize(kSampleCount,
-                        22050 /* sample rate */,
-                        20 /* channels */,
-                        20.0 /*  lower frequency limit */,
+  filterbank.Initialize(kSampleCount, 22050 /* sample rate */,
+                        20 /* channels */, 20.0 /*  lower frequency limit */,
                         4000.0 /* upper frequency limit */);
 
-
   // First call with nonzero input value, and an empty output vector,
   // will resize the output and fill it with the correct, nonzero outputs.
   input.assign(kSampleCount, 1.0);
diff --git a/tensorflow/core/kernels/mfcc_test.cc b/tensorflow/core/kernels/mfcc_test.cc
index cb32df8811ed04363fd61490e3253dd31539460d..72c1d331d6e7bd91385aa268d7b59bbd786859b4 100644
--- a/tensorflow/core/kernels/mfcc_test.cc
+++ b/tensorflow/core/kernels/mfcc_test.cc
@@ -36,11 +36,10 @@ TEST(MfccTest, AgreesWithPythonGoldenValues) {
   std::vector<double> output;
   mfcc.Compute(input, &output);
 
-  std::vector<double> expected = {29.13970072, -6.41568601, -0.61903012,
-                             -0.96778652, -0.26819878, -0.40907028,
-                             -0.15614748, -0.23203119, -0.10481487,
-                             -0.1543029,  -0.0769791,  -0.10806114,
-                             -0.06047613};
+  std::vector<double> expected = {
+      29.13970072, -6.41568601, -0.61903012, -0.96778652, -0.26819878,
+      -0.40907028, -0.15614748, -0.23203119, -0.10481487, -0.1543029,
+      -0.0769791,  -0.10806114, -0.06047613};
 
   ASSERT_EQ(expected.size(), output.size());
   for (int i = 0; i < output.size(); ++i) {
diff --git a/tensorflow/core/kernels/mirror_pad_op.cc b/tensorflow/core/kernels/mirror_pad_op.cc
index fbdeaf43ebbfdcf6b76f97046130f40cf8c8efd1..26e1082989f317a35d55826a466cb8d9ef306c4c 100644
--- a/tensorflow/core/kernels/mirror_pad_op.cc
+++ b/tensorflow/core/kernels/mirror_pad_op.cc
@@ -87,8 +87,8 @@ class MirrorPadOp : public OpKernel {
       const Tpaddings before = paddings(d, 0);  // Pad before existing elements.
       const Tpaddings after = paddings(d, 1);   // Pad after existing elements.
       OP_REQUIRES(context, before >= 0 && after >= 0,
-                  errors::InvalidArgument("paddings must be non-negative: ",
-                                          before, " ", after));
+                  errors::InvalidArgument(
+                      "paddings must be non-negative: ", before, " ", after));
       if (offset_ == 0) {  // SYMMETRIC mode.
         OP_REQUIRES(context,
                     before <= in0.dim_size(d) && after <= in0.dim_size(d),
@@ -296,8 +296,8 @@ class MirrorPadGradOp : public OpKernel {
       const Tpaddings before = paddings(d, 0);  // Pad before existing elements.
       const Tpaddings after = paddings(d, 1);   // Pad after existing elements.
       OP_REQUIRES(context, before >= 0 && after >= 0,
-                  errors::InvalidArgument("Paddings must be non-negative: ",
-                                          before, ", ", after));
+                  errors::InvalidArgument(
+                      "Paddings must be non-negative: ", before, ", ", after));
 
       const int64 out_size = in0.dim_size(d) - (before + after);
       if (offset_ == 0) {  // SYMMETRIC mode.
diff --git a/tensorflow/core/kernels/mkl_aggregate_ops.cc b/tensorflow/core/kernels/mkl_aggregate_ops.cc
index 89d37d2f874c0b8fa7550b1c49c0e3c4106e2ee5..ef724f0a296577539fa33176e7f1a4cd55e8c663 100644
--- a/tensorflow/core/kernels/mkl_aggregate_ops.cc
+++ b/tensorflow/core/kernels/mkl_aggregate_ops.cc
@@ -28,7 +28,7 @@ limitations under the License.
 #include "mkl_dnn_types.h"
 #include "tensorflow/core/util/mkl_util.h"
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 #include "mkldnn.hpp"
 using mkldnn::stream;
 using mkldnn::sum;
@@ -37,7 +37,7 @@ using mkldnn::sum;
 namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
 
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
 
 template <typename Device, typename T>
 class MklAddNOp : public OpKernel {
@@ -285,7 +285,7 @@ class MklAddNOp : public OpKernel {
   } MklAddNOpContext;
 };
 
-#else  // INTEL_MKL_DNN
+#else  // INTEL_MKL_ML
 template <typename Device, typename T>
 class MklAddNOp : public OpKernel {
  public:
@@ -317,8 +317,11 @@ class MklAddNOp : public OpKernel {
                                                 : src2_tensor.dims();
       // if the shapes of two tensors are not same raise op error
       TensorShape src1_shape, src2_shape;
-      src1_shape = src1_tensor.shape();
-      src2_shape = src2_tensor.shape();
+      src1_shape = input1_in_mkl_format ? src1_mkl_shape.GetTfShape()
+                  : src1_tensor.shape();
+      src2_shape = input2_in_mkl_format ? src2_mkl_shape.GetTfShape()
+                  : src2_tensor.shape();
+
       if (!src1_shape.IsSameSize(src2_shape)) {
         ctx->SetStatus(errors::InvalidArgument(
             "Inputs to operation ", this->name(), " of type ",
diff --git a/tensorflow/core/kernels/mkl_avgpooling_op.cc b/tensorflow/core/kernels/mkl_avgpooling_op.cc
index d751a70fc86b40d8ca656322484848cf906359fd..cff1bd18a74841d91acc98e0d3cc90041a0e7142 100644
--- a/tensorflow/core/kernels/mkl_avgpooling_op.cc
+++ b/tensorflow/core/kernels/mkl_avgpooling_op.cc
@@ -24,24 +24,23 @@
 
 #include "tensorflow/core/kernels/mkl_pooling_ops_common.h"
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 #include "mkldnn.hpp"
-using mkldnn::memory;
+using mkldnn::algorithm;
+using mkldnn::engine;
 using mkldnn::error;
-using mkldnn::pooling_forward;
-using mkldnn::pooling_backward;
+using mkldnn::memory;
 using mkldnn::padding_kind;
-using mkldnn::engine;
+using mkldnn::pooling_backward;
+using mkldnn::pooling_forward;
 using mkldnn::prop_kind;
-using mkldnn::algorithm;
 #endif
 
 namespace tensorflow {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
 
-// For now, MKL-ML is default. So making MKL-DNN not a default choice.
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
 
 template <typename Device, typename T>
 class MklAvgPoolingOp : public OpKernel {
@@ -358,10 +357,11 @@ class MklAvgPoolingGradOp : public OpKernel {
       if (!outbackprop_in_mkl_format) {
         // For avgpooling, tensor_in_shape should have 1 dimension, and 4
         // elements.
-        OP_REQUIRES(context, tensor_in_shape.dims() == 1 &&
-                                 tensor_in_shape.NumElements() == 4,
-                    errors::InvalidArgument("original input shape must be "
-                                            "1-dimensional and 4 elements"));
+        OP_REQUIRES(
+            context,
+            tensor_in_shape.dims() == 1 && tensor_in_shape.NumElements() == 4,
+            errors::InvalidArgument("original input shape must be "
+                                    "1-dimensional and 4 elements"));
 
         // For avgpooling, out_backprop should have 4 dimensions.
         OP_REQUIRES(context, out_backprop.dims() == 4,
@@ -429,13 +429,15 @@ class MklAvgPoolingGradOp : public OpKernel {
 };  // MklAvgPoolingGradOp
 
 
-#else  // INTEL_MKL_DNN is defined
+
+#else
+
 
 template <typename Device, typename T>
 class MklAvgPoolingOp : public MklPoolingForwardOpBase<T> {
  public:
   explicit MklAvgPoolingOp(OpKernelConstruction* context)
-  : MklPoolingForwardOpBase<T>(context) {
+      : MklPoolingForwardOpBase<T>(context) {
     // Workspace is an MKLDNN construct that is only used in Max Pooling.
     // So set workspace_enabled_ to false.
     this->workspace_enabled_ = false;
@@ -444,8 +446,8 @@ class MklAvgPoolingOp : public MklPoolingForwardOpBase<T> {
   void Compute(OpKernelContext* context) override {
     try {
       auto cpu_engine = engine(engine::cpu, 0);
-      const Tensor& input_tensor = MklGetInput(context,
-              this->kInputTensorIndexInput);
+      const Tensor& input_tensor =
+          MklGetInput(context, this->kInputTensorIndexInput);
       MklDnnShape dnn_shape_input;
       GetMklShape(context, this->kInputTensorIndexInput, &dnn_shape_input);
       this->SanityCheckInput(context, input_tensor, dnn_shape_input);
@@ -457,9 +459,8 @@ class MklAvgPoolingOp : public MklPoolingForwardOpBase<T> {
       // initialize variables for the pooling op
       MklPoolParameters pool_params;
       // Get the input tensor and initialize the pooling parameters
-      this->ConfigureInput(context, dnn_shape_input,
-                          input_tensor, &pool_params,
-                          &dnn_data_input);
+      this->ConfigureInput(context, dnn_shape_input, input_tensor, &pool_params,
+                           &dnn_data_input);
       OP_REQUIRES_OK(context, context->status());
 
       // Declare output tensor
@@ -467,59 +468,77 @@ class MklAvgPoolingOp : public MklPoolingForwardOpBase<T> {
       memory::dims output_dims_mkl_order;
       this->GetOutputDims(pool_params, &output_dims_mkl_order);
 
+      // If input is an empty tensor, allocate an empty output tensor and return
+      if (input_tensor.NumElements() == 0) {
+        MklDnnShape output_mkl_shape;
+        output_mkl_shape.SetMklTensor(false);
+        TensorShape output_tf_shape;
+        if (pool_params.data_format == TensorFormat::FORMAT_NCHW) {
+          output_tf_shape = MklDnnDimsToTFShape(output_dims_mkl_order);
+        } else {
+          memory::dims output_dims_NHWC_order;
+          output_dims_NHWC_order = {pool_params.tensor_in_batch,
+                                    static_cast<int>(pool_params.out_height),
+                                    static_cast<int>(pool_params.out_width),
+                                    pool_params.out_depth};
+          output_tf_shape = MklDnnDimsToTFShape(output_dims_NHWC_order);
+        }
+        const int kOutputIndex = 0;
+        AllocateOutputSetMklShape(context, kOutputIndex, &output_tensor,
+                                    output_tf_shape, output_mkl_shape);
+        CHECK_NOTNULL(output_tensor);
+        return;
+      }
+
       // If input is in Mkl layout, then just get the memory format from it
       // directly, instead of using input data_format to AvgPool.
       if (dnn_shape_input.IsMklTensor()) {
-        dnn_data_output.SetUsrMem(output_dims_mkl_order,
-                static_cast<memory::format>(dnn_data_input.GetUsrMemDesc()
-                    .data.format));
+        dnn_data_output.SetUsrMem(
+            output_dims_mkl_order,
+            static_cast<memory::format>(
+                dnn_data_input.GetUsrMemDesc().data.format));
 
       } else {
-          dnn_data_output.SetUsrMem(output_dims_mkl_order,
-              this->data_format_mkldnn_);
+        dnn_data_output.SetUsrMem(output_dims_mkl_order,
+                                  this->data_format_mkldnn_);
       }
 
-        // describe the memory layout
+      // describe the memory layout
       dnn_data_output.SetOpMemDesc(output_dims_mkl_order, memory::format::any);
 
       // 3. create a pooling primitive descriptor
-      auto pool_desc = pooling_forward::desc(prop_kind::forward,
-              algorithm::pooling_avg_exclude_padding,
-              dnn_data_input.GetUsrMemDesc(),
-              dnn_data_output.GetUsrMemDesc(),
-              memory::dims({  pool_params.row_stride,
-                              pool_params.col_stride}),
-              memory::dims({  pool_params.window_rows,
-                              pool_params.window_cols}),
-              memory::dims({  static_cast<int>(pool_params.pad_top),
-                              static_cast<int>(pool_params.pad_left)}),
-              memory::dims({  static_cast<int>(pool_params.pad_bottom),
-                              static_cast<int>(pool_params.pad_right)}),
-              TFPaddingToMklDnnPadding(this->padding_));
-      auto pool_prim_desc = pooling_forward::primitive_desc(pool_desc,
-                                                 cpu_engine);
+      auto pool_desc = pooling_forward::desc(
+          prop_kind::forward, algorithm::pooling_avg_exclude_padding,
+          dnn_data_input.GetUsrMemDesc(), dnn_data_output.GetUsrMemDesc(),
+          memory::dims({pool_params.row_stride, pool_params.col_stride}),
+          memory::dims({pool_params.window_rows, pool_params.window_cols}),
+          memory::dims({static_cast<int>(pool_params.pad_top),
+                        static_cast<int>(pool_params.pad_left)}),
+          memory::dims({static_cast<int>(pool_params.pad_bottom),
+                        static_cast<int>(pool_params.pad_right)}),
+          TFPaddingToMklDnnPadding(this->padding_));
+      auto pool_prim_desc =
+          pooling_forward::primitive_desc(pool_desc, cpu_engine);
 
       this->AllocateOutputTensor(context, pool_prim_desc, output_dims_mkl_order,
-                            this->data_format_mkldnn_, &output_tensor);
+                                 this->data_format_mkldnn_, &output_tensor);
       CHECK_NOTNULL(output_tensor);
 
       OP_REQUIRES_OK(context, context->status());
       dnn_data_output.SetUsrMemDataHandle(output_tensor);
 
-      this->PrepareAndExecuteNet(pool_prim_desc,
-                                &dnn_data_input,
-                                &dnn_data_output);
-    } catch (mkldnn::error &e) {
-        string error_msg = "Status: " + std::to_string(e.status) +
-                        ", message: " + string(e.message) +
-                        ", in file " + string(__FILE__) + ":" +
-                        std::to_string(__LINE__);
-        OP_REQUIRES_OK(context,
-                        errors::Aborted("Operation received an exception:",
-                                         error_msg));
+      this->PrepareAndExecuteNet(pool_prim_desc, &dnn_data_input,
+                                 &dnn_data_output);
+    } catch (mkldnn::error& e) {
+      string error_msg = "Status: " + std::to_string(e.status) +
+                         ", message: " + string(e.message) + ", in file " +
+                         string(__FILE__) + ":" + std::to_string(__LINE__);
+      OP_REQUIRES_OK(
+          context,
+          errors::Aborted("Operation received an exception:", error_msg));
     }
   }  // Compute
-};  // MklAvgPoolingOp
+};   // MklAvgPoolingOp
 
 //-----------------------------------------------------------------------------
 
@@ -527,27 +546,23 @@ template <class Device, class T>
 class MklAvgPoolingGradOp : public MklPoolingBackwardOpBase<T> {
  public:
   explicit MklAvgPoolingGradOp(OpKernelConstruction* context)
-      : MklPoolingBackwardOpBase<T>(context) {
-  }
+      : MklPoolingBackwardOpBase<T>(context) {}
 
   void Compute(OpKernelContext* context) override {
     try {
       auto cpu_engine = engine(engine::cpu, 0);
       MklDnnShape original_input_mkl_shape, input_gradient_mkl_shape;
-      const Tensor& tensor_in_shape = MklGetInput(context,
-          kInputTensorIndexInputShape);
-      const Tensor& input_gradient_tensor = MklGetInput(context,
-          kInputTensorIndexInputGradient);
+      const Tensor& tensor_in_shape =
+          MklGetInput(context, kInputTensorIndexInputShape);
+      const Tensor& input_gradient_tensor =
+          MklGetInput(context, kInputTensorIndexInputGradient);
       GetMklShape(context, kInputTensorIndexInputShape,
-            &original_input_mkl_shape);
+                  &original_input_mkl_shape);
       GetMklShape(context, kInputTensorIndexInputGradient,
-            &input_gradient_mkl_shape);
+                  &input_gradient_mkl_shape);
 
-
-      SanityCheckInputs(context, tensor_in_shape,
-                        input_gradient_tensor,
-                        original_input_mkl_shape,
-                        input_gradient_mkl_shape);
+      SanityCheckInputs(context, tensor_in_shape, input_gradient_tensor,
+                        original_input_mkl_shape, input_gradient_mkl_shape);
       if (!context->status().ok()) return;
 
       // Used to allocate output_diff_src/diff_src
@@ -562,90 +577,70 @@ class MklAvgPoolingGradOp : public MklPoolingBackwardOpBase<T> {
       MklPoolParameters pool_params;
       memory::dims output_dims_mkl_order, original_input_dims_nchw;
       // Configure the original input memory descriptor
-      memory::desc original_input_md = ConfigureOriginalInput(context,
-                                      tensor_in_shape,
-                                      original_input_mkl_shape,
-                                      &original_input_dims_nchw,
-                                      &pool_params,
-                                      &original_input_shape);
+      memory::desc original_input_md = ConfigureOriginalInput(
+          context, tensor_in_shape, original_input_mkl_shape,
+          &original_input_dims_nchw, &pool_params, &original_input_shape);
 
       // configure the original output memory descriptor
       // by definition, the shape of the original output is the same
       // as the shape of the gradient diff_dst
       memory::desc original_output_md = this->ConfigureOriginalOutput(
-                pool_params, input_gradient_mkl_shape, output_dims_mkl_order);
+          pool_params, input_gradient_mkl_shape, output_dims_mkl_order);
 
       memory::desc target_diff_dst_md = this->ConfigureInputGradient(
-                                    input_gradient_mkl_shape,
-                                    input_gradient_tensor,
-                                    &input_gradient_diff_dst,
-                                    original_output_md);
+          input_gradient_mkl_shape, input_gradient_tensor,
+          &input_gradient_diff_dst, original_output_md);
       // The shape of the output diff src needs to be the same shape as the
       // original input. But we will set its format to be same as the format of
       // input gradient. We won't use format of original input since it will
       // always be in Tensorflow layout (given that AvgPoolGrad gets shape of
       // the input rather than actual input).
-      output_diff_src.SetUsrMem(original_input_dims_nchw,
-                                static_cast<memory::format>(
-                                  target_diff_dst_md.data.format));
+      output_diff_src.SetUsrMem(
+          original_input_dims_nchw,
+          static_cast<memory::format>(target_diff_dst_md.data.format));
 
       // Create the forward pooling primitive descriptor so we can reference it
       // in the backward pooling primitive descriptor
-      auto pool_fwd_desc = pooling_forward::desc(prop_kind::forward,
-              algorithm::pooling_avg_exclude_padding,
-              original_input_md,
-              original_output_md,
-              memory::dims({  pool_params.row_stride,
-                              pool_params.col_stride}),
-              memory::dims({  pool_params.window_rows,
-                              pool_params.window_cols}),
-              memory::dims({  static_cast<int>(pool_params.pad_top),
-                              static_cast<int>(pool_params.pad_left)}),
-              memory::dims({  static_cast<int>(pool_params.pad_bottom),
-                              static_cast<int>(pool_params.pad_right)}),
-              TFPaddingToMklDnnPadding(this->padding_));
-      auto pool_fwd_prim_desc
-              = pooling_forward::primitive_desc(pool_fwd_desc,
-                                                  cpu_engine);
+      auto pool_fwd_desc = pooling_forward::desc(
+          prop_kind::forward, algorithm::pooling_avg_exclude_padding,
+          original_input_md, original_output_md,
+          memory::dims({pool_params.row_stride, pool_params.col_stride}),
+          memory::dims({pool_params.window_rows, pool_params.window_cols}),
+          memory::dims({static_cast<int>(pool_params.pad_top),
+                        static_cast<int>(pool_params.pad_left)}),
+          memory::dims({static_cast<int>(pool_params.pad_bottom),
+                        static_cast<int>(pool_params.pad_right)}),
+          TFPaddingToMklDnnPadding(this->padding_));
+      auto pool_fwd_prim_desc =
+          pooling_forward::primitive_desc(pool_fwd_desc, cpu_engine);
 
       auto pool_bkwd_desc = pooling_backward::desc(
-              algorithm::pooling_avg_exclude_padding,
-              output_diff_src.GetUsrMemDesc(),
-              target_diff_dst_md,
-              memory::dims({  pool_params.row_stride,
-                              pool_params.col_stride}),
-              memory::dims({  pool_params.window_rows,
-                              pool_params.window_cols}),
-              memory::dims({  static_cast<int>(pool_params.pad_top),
-                              static_cast<int>(pool_params.pad_left)}),
-              memory::dims({  static_cast<int>(pool_params.pad_bottom),
-                              static_cast<int>(pool_params.pad_right)}),
-              TFPaddingToMklDnnPadding(this->padding_));
-      auto pool_bkwd_prim_desc
-                = pooling_backward::primitive_desc(pool_bkwd_desc,
-                                              cpu_engine,
-                                              pool_fwd_prim_desc);
-      this->AllocateOutputTensor(context, pool_bkwd_prim_desc,
-                      original_input_dims_nchw,
-                      this->data_format_mkldnn_,
-                      &output_tensor_diff_src);
+          algorithm::pooling_avg_exclude_padding,
+          output_diff_src.GetUsrMemDesc(), target_diff_dst_md,
+          memory::dims({pool_params.row_stride, pool_params.col_stride}),
+          memory::dims({pool_params.window_rows, pool_params.window_cols}),
+          memory::dims({static_cast<int>(pool_params.pad_top),
+                        static_cast<int>(pool_params.pad_left)}),
+          memory::dims({static_cast<int>(pool_params.pad_bottom),
+                        static_cast<int>(pool_params.pad_right)}),
+          TFPaddingToMklDnnPadding(this->padding_));
+      auto pool_bkwd_prim_desc = pooling_backward::primitive_desc(
+          pool_bkwd_desc, cpu_engine, pool_fwd_prim_desc);
+      this->AllocateOutputTensor(
+          context, pool_bkwd_prim_desc, original_input_dims_nchw,
+          this->data_format_mkldnn_, &output_tensor_diff_src);
 
       output_diff_src.SetUsrMemDataHandle(output_tensor_diff_src);
 
-      this->PrepareAndExecuteNet(pool_bkwd_prim_desc,
-                          &input_gradient_diff_dst,
-                          &output_diff_src,
-                          memory::primitive_desc(
-                              target_diff_dst_md,
-                              cpu_engine));
-    } catch (mkldnn::error &e) {
+      this->PrepareAndExecuteNet(
+          pool_bkwd_prim_desc, &input_gradient_diff_dst, &output_diff_src,
+          memory::primitive_desc(target_diff_dst_md, cpu_engine));
+    } catch (mkldnn::error& e) {
       string error_msg = "Status: " + std::to_string(e.status) +
-                      ", message: " + string(e.message) +
-                      ", in file " + string(__FILE__) + ":" +
-                      std::to_string(__LINE__);
-      OP_REQUIRES_OK(context,
-                      errors::Aborted("Compute received an exception:",
-                                      error_msg));
+                         ", message: " + string(e.message) + ", in file " +
+                         string(__FILE__) + ":" + std::to_string(__LINE__);
+      OP_REQUIRES_OK(context, errors::Aborted("Compute received an exception:",
+                                              error_msg));
     }
   }  // Compute
 
@@ -655,12 +650,11 @@ class MklAvgPoolingGradOp : public MklPoolingBackwardOpBase<T> {
   const int kInputTensorIndexInputShape = 0;
   const int kInputTensorIndexInputGradient = 1;
 
-  memory::desc ConfigureOriginalInput(OpKernelContext* context,
-        const Tensor& tensor_original_input_shape,
-        const MklDnnShape& original_input_mkl_shape,
-        memory::dims* original_input_dims_mkl_order,
-        MklPoolParameters* pool_params,
-        TensorShape* input_tensor_shape) {
+  memory::desc ConfigureOriginalInput(
+      OpKernelContext* context, const Tensor& tensor_original_input_shape,
+      const MklDnnShape& original_input_mkl_shape,
+      memory::dims* original_input_dims_mkl_order,
+      MklPoolParameters* pool_params, TensorShape* input_tensor_shape) {
     CHECK_NOTNULL(original_input_dims_mkl_order);
     CHECK_NOTNULL(pool_params);
     CHECK_NOTNULL(input_tensor_shape);
@@ -672,47 +666,47 @@ class MklAvgPoolingGradOp : public MklPoolingBackwardOpBase<T> {
     }
 
     return MklPoolingBackwardOpBase<T>::ConfigureOriginalInput(
-                                              context,
-                                              tensor_original_input_shape,
-                                              original_input_mkl_shape,
-                                              original_input_dims_mkl_order,
-                                              pool_params,
-                                              *input_tensor_shape);
-}
+        context, tensor_original_input_shape, original_input_mkl_shape,
+        original_input_dims_mkl_order, pool_params, *input_tensor_shape);
+  }
 
   void SanityCheckInputs(OpKernelContext* context,
-                        const Tensor& tensor_in_shape,
-                        const Tensor& input_gradient_tensor,
-                        const MklDnnShape& original_input_mkl_shape,
-                        const MklDnnShape& input_gradient_mkl_shape) {
+                         const Tensor& tensor_in_shape,
+                         const Tensor& input_gradient_tensor,
+                         const MklDnnShape& original_input_mkl_shape,
+                         const MklDnnShape& input_gradient_mkl_shape) {
     if (!original_input_mkl_shape.IsMklTensor()) {
-      OP_REQUIRES(context, tensor_in_shape.dims() == 1 &&
-          tensor_in_shape.NumElements() == 4,
+      OP_REQUIRES(
+          context,
+          tensor_in_shape.dims() == 1 && tensor_in_shape.NumElements() == 4,
           errors::InvalidArgument("original input shape must be "
-                "1-dimensional and 4 elements"));
+                                  "1-dimensional and 4 elements"));
     } else {
-      OP_REQUIRES(context, original_input_mkl_shape.GetDimension() == 1 &&
-          original_input_mkl_shape.DimSize(0) == 4,
-          errors::InvalidArgument("original input shape must be "
-                "1-dimensional and 4 elements"));
+      OP_REQUIRES(context,
+                  original_input_mkl_shape.GetDimension() == 1 &&
+                      original_input_mkl_shape.DimSize(0) == 4,
+                  errors::InvalidArgument("original input shape must be "
+                                          "1-dimensional and 4 elements"));
     }
 
     if (!input_gradient_mkl_shape.IsMklTensor()) {
       // For avgpooling, input_gradient_diff_dst should have 4 dimensions.
       OP_REQUIRES(context, input_gradient_tensor.dims() == 4,
-          errors::InvalidArgument("Gradient shape must be "
-                              "4-dimensional"));
+                  errors::InvalidArgument("Gradient shape must be "
+                                          "4-dimensional"));
     } else {
       OP_REQUIRES(context, input_gradient_mkl_shape.GetDimension() == 4,
-          errors::InvalidArgument("Gradient shape must be "
-                              "4-dimensional"));
+                  errors::InvalidArgument("Gradient shape must be "
+                                          "4-dimensional"));
     }
   }
 };  // MklAvgPoolingGradOp
 
 
 
-#endif  // INTEL_MKL_DNN
+
+#endif  // INTEL_MKL_ML
+
 
 REGISTER_KERNEL_BUILDER(Name("_MklAvgPool")
                             .Device(DEVICE_CPU)
@@ -728,4 +722,3 @@ REGISTER_KERNEL_BUILDER(Name("_MklAvgPoolGrad")
 
 }  // namespace tensorflow
 #endif  // INTEL_MKL
-
diff --git a/tensorflow/core/kernels/mkl_batch_matmul_op.cc b/tensorflow/core/kernels/mkl_batch_matmul_op.cc
index 9fee94f946555480fce8acf904a7909622404524..d9713075be6e20b77ea681a0e71baa21b7b9eea9 100644
--- a/tensorflow/core/kernels/mkl_batch_matmul_op.cc
+++ b/tensorflow/core/kernels/mkl_batch_matmul_op.cc
@@ -40,7 +40,6 @@ limitations under the License.
 #include "tensorflow/core/kernels/fill_functor.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
 #define MKL_Complex8 tensorflow::complex64
 #define MKL_Complex16 tensorflow::complex128
diff --git a/tensorflow/core/kernels/mkl_concat_op.cc b/tensorflow/core/kernels/mkl_concat_op.cc
index d109bb6bcfe6360af12086bad452752336357f35..f1f267e849aa39b43c153b857493160e0d103970 100644
--- a/tensorflow/core/kernels/mkl_concat_op.cc
+++ b/tensorflow/core/kernels/mkl_concat_op.cc
@@ -30,11 +30,11 @@ limitations under the License.
 #include "mkl_dnn_types.h"
 #include "tensorflow/core/util/mkl_util.h"
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 #include "mkldnn.hpp"
 
-using mkldnn::stream;
 using mkldnn::concat;
+using mkldnn::stream;
 #endif
 
 namespace tensorflow {
@@ -45,7 +45,6 @@ typedef std::vector<TensorShape> TensorShapeList;
 
 enum AxisArgumentName { NAME_IS_AXIS, NAME_IS_CONCAT_DIM };
 
-
 // TODO(intelft) Check if we can reuse existing EigenConcatOp using Mutable
 // reference inputs.
 // --------------------------------------------------------------------------
@@ -63,7 +62,7 @@ class EigenConcatBaseOp : public OpKernel {
   // we need to have empty Compute because Compute is pure virtual function.
   void Compute(OpKernelContext* c) {}
 
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
 
   void Compute(OpKernelContext* c, const std::vector<Tensor>& values) {
     const Tensor* concat_dim_tensor;
@@ -152,8 +151,8 @@ class EigenConcatBaseOp : public OpKernel {
 
 #else  // MKL_DNN
 
-void Compute(OpKernelContext* c, const std::vector<Tensor>& values,
-                        const TensorShapeList& input_shapes) {
+  void Compute(OpKernelContext* c, const std::vector<Tensor>& values,
+               const TensorShapeList& input_shapes) {
     const Tensor* concat_dim_tensor;
     const char* axis_attribute_name =
         AxisArgName == NAME_IS_AXIS
@@ -197,7 +196,8 @@ void Compute(OpKernelContext* c, const std::vector<Tensor>& values,
       const auto in = values[i];
       const bool in_is_scalar = IsLegacyScalar(input_shapes[i]);
       OP_REQUIRES(
-          c, (input_shapes[i].dims() == input_dims) ||
+          c,
+          (input_shapes[i].dims() == input_dims) ||
               (input_is_scalar && in_is_scalar),
           errors::InvalidArgument(
               "ConcatOp : Ranks of all input tensors should match: shape[0] = ",
@@ -208,8 +208,8 @@ void Compute(OpKernelContext* c, const std::vector<Tensor>& values,
         inputs_flat.emplace_back(new typename TTypes<T, 2>::ConstMatrix(
             in.shaped<T, 2>({inputs_flat_dim0, inputs_flat_dim1})));
       }
-      output_concat_dim += input_shapes[i].dims() > 0 ?
-                           input_shapes[i].dim_size(axis) : 1;
+      output_concat_dim +=
+          input_shapes[i].dims() > 0 ? input_shapes[i].dim_size(axis) : 1;
     }
 
     TensorShape output_shape(input_shape);
@@ -230,7 +230,7 @@ void Compute(OpKernelContext* c, const std::vector<Tensor>& values,
 #endif
 };
 
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
 
 // --------------------------------------------------------------------------
 //                      Mkl Concat Op
@@ -418,7 +418,6 @@ class MklConcatOp : public OpKernel {
     OP_REQUIRES_OK(context, context->status());
   }
 
-
  private:
   typedef struct {
     TensorFormat data_format;
@@ -590,39 +589,45 @@ class MklConcatOp : public OpKernel {
       GetMklShapeList(context, "values", &input_shapes);
 
       const Tensor& concat_dim_tensor = (AxisArgName == NAME_IS_CONCAT_DIM)
-                    ? MklGetInput(context, 0) : MklGetInput(context, N);
+                                            ? MklGetInput(context, 0)
+                                            : MklGetInput(context, N);
       // Sanity checks
-      OP_REQUIRES(context, IsLegacyScalar(concat_dim_tensor.shape()),
-        errors::InvalidArgument(
-            "Concat dim tensor should be a scalar integer, but got shape ",
-            concat_dim_tensor.shape().DebugString()));
-      int32 concat_dim = internal::SubtleMustCopy(
-                           concat_dim_tensor.scalar<int32>()());
+      OP_REQUIRES(
+          context, IsLegacyScalar(concat_dim_tensor.shape()),
+          errors::InvalidArgument(
+              "Concat dim tensor should be a scalar integer, but got shape ",
+              concat_dim_tensor.shape().DebugString()));
+      int32 concat_dim =
+          internal::SubtleMustCopy(concat_dim_tensor.scalar<int32>()());
 
       // check that ranks of all tensors match
       // and that their shapes match except for concat_dim.
       int i = 0;
       bool invoke_eigen = false;
       bool are_all_mkl_inputs = true, are_all_tf_inputs = true;
-      const TensorShape expected_shape = input_shapes[0].IsMklTensor() ?
-                                         input_shapes[0].GetTfShape() :
-                                         input_tensors[0].shape();
+      const TensorShape expected_shape = input_shapes[0].IsMklTensor()
+                                             ? input_shapes[0].GetTfShape()
+                                             : input_tensors[0].shape();
       size_t expected_dims = expected_shape.dims();
 
       if (concat_dim < 0) concat_dim = expected_dims + concat_dim;
 
       for (auto& s : input_shapes) {
-        if (s == expected_shape) {++i; continue;}
+        if (s == expected_shape) {
+          ++i;
+          continue;
+        }
 
-        TensorShape s_shape = s.IsMklTensor() ? s.GetTfShape() :
-                      input_tensors[i].shape();
+        TensorShape s_shape =
+            s.IsMklTensor() ? s.GetTfShape() : input_tensors[i].shape();
         size_t s_dims = s_shape.dims();
 
-        OP_REQUIRES(context, s_dims == expected_dims,
-                  errors::InvalidArgument(
-                      "_MklConcatOp : Ranks of all input tensors should match:"
-                      " input dimensions = ",
-                      s_dims, " vs. expected rank = ", expected_dims));
+        OP_REQUIRES(
+            context, s_dims == expected_dims,
+            errors::InvalidArgument(
+                "_MklConcatOp : Ranks of all input tensors should match:"
+                " input dimensions = ",
+                s_dims, " vs. expected rank = ", expected_dims));
 
         for (int d = 0; d < expected_dims; ++d) {
           if (d == concat_dim) continue;
@@ -630,10 +635,11 @@ class MklConcatOp : public OpKernel {
           size_t expected_size = expected_shape.dim_size(d);
           size_t s_size = s_shape.dim_size(d);
           OP_REQUIRES(
-            context, expected_size == s_size,
-            errors::InvalidArgument("_MklConcatOp : Dimensions of inputs "
-                    "should match: shape[0][", d, "]= ", expected_size,
-                    " vs. shape[", i, "][", d, "] = ", s_size));
+              context, expected_size == s_size,
+              errors::InvalidArgument("_MklConcatOp : Dimensions of inputs "
+                                      "should match: shape[0][",
+                                      d, "]= ", expected_size, " vs. shape[", i,
+                                      "][", d, "] = ", s_size));
         }
 
         if (s.IsMklTensor())
@@ -657,8 +663,8 @@ class MklConcatOp : public OpKernel {
         TensorShapeList tf_input_shapes;
         i = 0;
         for (auto& s : input_shapes) {
-          TensorShape s_shape = s.IsMklTensor() ? s.GetTfShape() :
-                                input_tensors[i].shape();
+          TensorShape s_shape =
+              s.IsMklTensor() ? s.GetTfShape() : input_tensors[i].shape();
           tf_input_shapes.push_back(s_shape);
           ++i;
         }
@@ -678,21 +684,22 @@ class MklConcatOp : public OpKernel {
       std::vector<memory::primitive_desc> srcs_pd;
       std::vector<MklDnnData<T>> srcs(N, MklDnnData<T>(&cpu_engine));
       int64 dst_concat_dim_size = 0;
-      for (int k =0; k < N; k++) {
+      for (int k = 0; k < N; k++) {
         bool is_mkl_tensor = input_shapes[k].IsMklTensor();
         memory::dims src_dims;
 
         // Same comment as dst_dims for src_dims.
-        src_dims = (is_mkl_tensor) ?
-                   TFShapeToMklDnnDims(input_shapes[k].GetTfShape()) :
-                   TFShapeToMklDnnDims(input_tensors[k].shape());
+        src_dims = (is_mkl_tensor)
+                       ? TFShapeToMklDnnDims(input_shapes[k].GetTfShape())
+                       : TFShapeToMklDnnDims(input_tensors[k].shape());
 
         dst_concat_dim_size += src_dims[concat_dim];
-        auto src_md = is_mkl_tensor ? input_shapes[k].GetMklLayout() :
-          // It does not matter what data format we use here (NHWC or NCHW).
-          // We just need to ensure that output of Concat uses same data format
-          // as input.
-                  memory::desc(src_dims, MklDnnType<T>(), memory::format::nchw);
+        auto src_md =
+            is_mkl_tensor ? input_shapes[k].GetMklLayout() :
+                          // It does not matter what data format we use here
+                          // (NHWC or NCHW). We just need to ensure that output
+                          // of Concat uses same data format as input.
+                memory::desc(src_dims, MklDnnType<T>(), memory::format::nchw);
 
         srcs[k].SetUsrMem(src_md, &input_tensors[k]);
         auto src_mpd = srcs[k].GetUsrMemPrimDesc();
@@ -707,14 +714,15 @@ class MklConcatOp : public OpKernel {
         // Since we are passing a specific format for destination,
         // we need to have dst_dims in MklDnn order (NCHW).
         auto orig_tf_format = input_shapes[0].GetTfDataFormat();
-        dst_dims_in_nchw = MklDnnDimsInNCHW(dst_dims,
-                               MklDnnDataFormatToTFDataFormat(orig_tf_format));
+        dst_dims_in_nchw = MklDnnDimsInNCHW(
+            dst_dims, MklDnnDataFormatToTFDataFormat(orig_tf_format));
         // We will set the output in the same format as input to avoid layout
         // conversions.
         // Currently we are setting dst format same as input format.
         // See if we can make this choice in a better way.
-        dst_md = memory::desc(dst_dims_in_nchw, MklDnnType<T>(),
-                 (memory::format) input_shapes[0].GetMklLayout().data.format);
+        dst_md = memory::desc(
+            dst_dims_in_nchw, MklDnnType<T>(),
+            (memory::format)input_shapes[0].GetMklLayout().data.format);
       } else {
         // Again, format does not matter here. We just need to make it same as
         // input format.
@@ -722,7 +730,7 @@ class MklConcatOp : public OpKernel {
       }
 
       std::vector<primitive::at> inputs;
-      for (int k=0; k < input_tensors.size(); k++)
+      for (int k = 0; k < input_tensors.size(); k++)
         inputs.push_back(srcs[k].GetOpMem());
 
       // If all inputs are in MKL format, then meaning of concat_dim needs to
@@ -732,8 +740,7 @@ class MklConcatOp : public OpKernel {
       // But ifinput tensors are in NHWC order, then semantics need to change.
       // E.g., if we are concatinating over Channel (dimension 3 for NHWC),
       // then since MklDnn order is NCHW, concat_dim needs to be 1.
-      if (are_all_mkl_inputs)
-        concat_dim = input_shapes[0].TfDimIdx(concat_dim);
+      if (are_all_mkl_inputs) concat_dim = input_shapes[0].TfDimIdx(concat_dim);
 
       auto concat_pd = concat::primitive_desc(dst_md, concat_dim, srcs_pd);
 
@@ -752,24 +759,25 @@ class MklConcatOp : public OpKernel {
         dnn_shape_dst.SetMklTensor(false);
         tf_shape_dst = MklDnnDimsToTFShape(dst_dims);
       }
-      AllocateOutputSetMklShape(context, 0, &dst_tensor,
-                                tf_shape_dst, dnn_shape_dst);
+      AllocateOutputSetMklShape(context, 0, &dst_tensor, tf_shape_dst,
+                                dnn_shape_dst);
       CHECK_NOTNULL(dst_tensor);
 
-      dst_md = dnn_shape_dst.IsMklTensor() ?
-               dnn_shape_dst.GetMklLayout() : dst_md;
+      dst_md =
+          dnn_shape_dst.IsMklTensor() ? dnn_shape_dst.GetMklLayout() : dst_md;
       dst.SetUsrMem(dst_md, dst_tensor);
 
       auto concat_op = concat(concat_pd, inputs, dst.GetOpMem());
       std::vector<primitive> net;
       net.push_back(concat_op);
       stream(stream::kind::eager).submit(net).wait();
-    } catch (mkldnn::error &e) {
-        string error_msg = "Status: " + std::to_string(e.status) +
-               ", message: " + string(e.message) + ", in file " +
-               string(__FILE__) + ":" + std::to_string(__LINE__);
-        OP_REQUIRES_OK(context, errors::Aborted(
-                "Operation received an exception:", error_msg));
+    } catch (mkldnn::error& e) {
+      string error_msg = "Status: " + std::to_string(e.status) +
+                         ", message: " + string(e.message) + ", in file " +
+                         string(__FILE__) + ":" + std::to_string(__LINE__);
+      OP_REQUIRES_OK(
+          context,
+          errors::Aborted("Operation received an exception:", error_msg));
     }
   }
 
@@ -790,11 +798,9 @@ class MklConcatOp : public OpKernel {
     dnn_shape_output.SetDimensions(4);
     Tensor* output_tensor = nullptr;
     TensorShape tf_shape_output;
-    tf_shape_output.AddDim(
-        dnn_shape_output.GetSerializeBufferSize());
-    context->allocate_output(
-        GetTensorMetaDataIndex(0, context->num_outputs()),
-        tf_shape_output, &output_tensor);
+    tf_shape_output.AddDim(dnn_shape_output.GetSerializeBufferSize());
+    context->allocate_output(GetTensorMetaDataIndex(0, context->num_outputs()),
+                             tf_shape_output, &output_tensor);
     dnn_shape_output.SerializeMklDnnShape(
         output_tensor->flat<uint8>().data(),
         output_tensor->flat<uint8>().size() * sizeof(uint8));
diff --git a/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc
index 0f1a218fe62dd91160320254342828811e3aa458..25c2573741265d4d33c9c91474792be241dd3b32 100644
--- a/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc
@@ -38,9 +38,9 @@ limitations under the License.
 #include "tensorflow/core/util/use_cudnn.h"
 #include "tensorflow/core/util/work_sharder.h"
 
-#include "tensorflow/core/util/mkl_util.h"
 #include "mkl_dnn.h"
 #include "mkl_dnn_types.h"
+#include "tensorflow/core/util/mkl_util.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
index 54d4916d4943be4957bb60b273cdbf2d6ce1ffdc..1401bc65a45bd80ed78230840cf0b9958b1f012e 100644
--- a/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
@@ -38,24 +38,24 @@ limitations under the License.
 #include "tensorflow/core/util/use_cudnn.h"
 #include "tensorflow/core/util/work_sharder.h"
 
-#include "tensorflow/core/util/mkl_util.h"
 #include "mkl_dnn.h"
 #include "mkl_dnn_types.h"
+#include "tensorflow/core/util/mkl_util.h"
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 #include "mkldnn.hpp"
 
-using mkldnn::stream;
-using mkldnn::prop_kind;
 using mkldnn::convolution_backward_weights;
 using mkldnn::memory;
+using mkldnn::prop_kind;
+using mkldnn::stream;
 #endif
 
 namespace tensorflow {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
 
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
 
 template <typename Device, class T>
 class MklConv2DCustomBackpropFilterOp : public OpKernel {
@@ -360,8 +360,8 @@ class MklConv2DCustomBackpropFilterOp : public OpKernel {
           (mkl_convert_input) ? mkl_buf_convert_input : mkl_buf_input;
 
       const Tensor& out_backprop = MklGetInput(context, 2);
-      void* mkl_buf_out_backprop = const_cast<void*>(static_cast<const void*>(
-                                      out_backprop.flat<T>().data()));
+      void* mkl_buf_out_backprop = const_cast<void*>(
+          static_cast<const void*>(out_backprop.flat<T>().data()));
 
       CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&mkl_lt_internal_out_backprop,
                                                 prim_conv_bwdfilter,
@@ -371,10 +371,11 @@ class MklConv2DCustomBackpropFilterOp : public OpKernel {
           !dnnLayoutCompare_F32(mkl_lt_internal_out_backprop, lt_out_backprop);
       if (mkl_convert_out_backprop) {
         CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_out_backprop,
-                      lt_out_backprop, mkl_lt_internal_out_backprop),
+                                         lt_out_backprop,
+                                         mkl_lt_internal_out_backprop),
                  E_SUCCESS);
         AllocTmpBuffer(context, mkl_tmp_out_backprop_buf_tensor,
-            lt_out_backprop, &mkl_buf_convert_out_backprop);
+                       lt_out_backprop, &mkl_buf_convert_out_backprop);
         CHECK_EQ(dnnConversionExecute_F32(mkl_prim_convert_out_backprop,
                                           mkl_buf_out_backprop,
                                           mkl_buf_convert_out_backprop),
@@ -428,18 +429,18 @@ class MklConv2DCustomBackpropFilterOp : public OpKernel {
                               .Device(DEVICE_CPU)                   \
                               .TypeConstraint<T>("T")               \
                               .Label(mkl_op_registry::kMklOpLabel), \
-              MklConv2DCustomBackpropFilterOp<CPUDevice, T>);
+                          MklConv2DCustomBackpropFilterOp<CPUDevice, T>);
 TF_CALL_float(REGISTER_MKL_FILTER_KERNELS);
 #undef REGISTER_MKL_FILTER_KERNELS
 
 #else
 
 template <typename Device, class T, bool biasEnabled>
-class MklConv2DCustomBackpropFilterOp :
-  public MklConv2DBackpropCommonOp<Device, T> {
+class MklConv2DCustomBackpropFilterOp
+    : public MklConv2DBackpropCommonOp<Device, T> {
  public:
   explicit MklConv2DCustomBackpropFilterOp(OpKernelConstruction* context)
-      : MklConv2DBackpropCommonOp<Device, T>(context) { }
+      : MklConv2DBackpropCommonOp<Device, T>(context) {}
   ~MklConv2DCustomBackpropFilterOp() {}
 
  private:
@@ -447,7 +448,7 @@ class MklConv2DCustomBackpropFilterOp :
                          const MklDnnShape& filter_mkl_shape,
                          const MklDnnShape& obp_mkl_shape) {
     CHECK(!filter_mkl_shape.IsMklTensor())
-      << "Conv2DBackpropFilter: filter should not be in MKL Layout";
+        << "Conv2DBackpropFilter: filter should not be in MKL Layout";
   }
 
   size_t GetInputTensorIndexWithSizes() { return 1; /* filter index */ }
@@ -462,8 +463,10 @@ class MklConv2DCustomBackpropFilterOp :
                                 const Tensor& filter_tensor) {
     TensorShape filter_tf_shape;
     CHECK_EQ(TensorShapeUtils::IsVector(filter_tensor.shape()), true);
-    CHECK_EQ(TensorShapeUtils::MakeShape(
-             filter_tensor.vec<int32>(), &filter_tf_shape).ok(), true);
+    CHECK_EQ(TensorShapeUtils::MakeShape(filter_tensor.vec<int32>(),
+                                         &filter_tf_shape)
+                 .ok(),
+             true);
     return filter_tf_shape;
   }
 
@@ -485,16 +488,13 @@ class MklConv2DCustomBackpropFilterOp :
     return memory::format::hwio;
   }
 
-  void CreatePrimitive(OpKernelContext* context,
-                       const engine& cpu_engine,
+  void CreatePrimitive(OpKernelContext* context, const engine& cpu_engine,
                        const convolution_forward::primitive_desc& conv_fwd_pd,
                        MklDnnData<T>* input, MklDnnData<T>* filter,
                        MklDnnData<T>* outbackprop, MklDnnData<T>* output,
-                       Tensor** output_tensor,
-                       const memory::dims& strides,
+                       Tensor** output_tensor, const memory::dims& strides,
                        const memory::dims& padding_l,
-                       const memory::dims& padding_r,
-                       padding_kind padding,
+                       const memory::dims& padding_r, padding_kind padding,
                        const memory::dims& bwd_output_dims,
                        memory::format bwd_output_format) {
     CHECK_NOTNULL(context);
@@ -508,34 +508,35 @@ class MklConv2DCustomBackpropFilterOp :
     int depth = 0;
     if (biasEnabled) {
       // Data structure for bias_grad
-      bias_grad = new MklDnnData<T> (&cpu_engine);
+      bias_grad = new MklDnnData<T>(&cpu_engine);
       TensorShape obp_tf_shape = GetTfShape(context, 2);
-      depth = (MklConv2DBackpropCommonOp<Device, T>::GetTFDataFormat()
-                == FORMAT_NCHW) ?
-          obp_tf_shape.dim_size(1) : obp_tf_shape.dim_size(3);
+      depth = (MklConv2DBackpropCommonOp<Device, T>::GetTFDataFormat() ==
+               FORMAT_NCHW)
+                  ? obp_tf_shape.dim_size(1)
+                  : obp_tf_shape.dim_size(3);
       memory::dims bias_grad_dims = {depth};
       bias_grad->SetOpMemDesc(bias_grad_dims, memory::format::x);
     }
 
     // Create convolution backward weights primitive.
-    auto bwd_desc = (biasEnabled && (bias_grad != nullptr))?
-        convolution_backward_weights::desc(convolution_direct,
-                                input->GetOpMemDesc(), output->GetOpMemDesc(),
-                                bias_grad->GetOpMemDesc(),
-                                outbackprop->GetOpMemDesc(), strides, padding_l,
-                                padding_r, padding) :
-        convolution_backward_weights::desc(convolution_direct,
-                          input->GetOpMemDesc(), output->GetOpMemDesc(),
-                          outbackprop->GetOpMemDesc(), strides, padding_l,
-                          padding_r, padding);
-
-    auto bwd_pd = convolution_backward_weights::primitive_desc(bwd_desc,
-                                                            cpu_engine,
-                                                            conv_fwd_pd);
+    auto bwd_desc =
+        (biasEnabled && (bias_grad != nullptr))
+            ? convolution_backward_weights::desc(
+                  convolution_direct, input->GetOpMemDesc(),
+                  output->GetOpMemDesc(), bias_grad->GetOpMemDesc(),
+                  outbackprop->GetOpMemDesc(), strides, padding_l, padding_r,
+                  padding)
+            : convolution_backward_weights::desc(
+                  convolution_direct, input->GetOpMemDesc(),
+                  output->GetOpMemDesc(), outbackprop->GetOpMemDesc(), strides,
+                  padding_l, padding_r, padding);
+
+    auto bwd_pd = convolution_backward_weights::primitive_desc(
+        bwd_desc, cpu_engine, conv_fwd_pd);
 
     // Allocate output tensor.
-    AllocateOutputTensor(context, bwd_pd, bwd_output_dims,
-                         bwd_output_format, output_tensor);
+    AllocateOutputTensor(context, bwd_pd, bwd_output_dims, bwd_output_format,
+                         output_tensor);
 
     CHECK_NOTNULL(*output_tensor);
     // Set buffer handle using allocated output tensor.
@@ -548,8 +549,8 @@ class MklConv2DCustomBackpropFilterOp :
       AllocateBiasGradTensor(context, bias_grad_shape, &bias_grad_tensor);
       memory::dims bias_grad_dims = {depth};
       // Since Bias is 1D, we use format::x from MKLDNN to represent it.
-      auto bias_grad_md = memory::desc({bias_grad_dims}, MklDnnType<T>(),
-                                       memory::format::x);
+      auto bias_grad_md =
+          memory::desc({bias_grad_dims}, MklDnnType<T>(), memory::format::x);
       bias_grad->SetUsrMem(bias_grad_md, bias_grad_tensor);
       bias_grad->SetUsrMemDataHandle(bias_grad_tensor);
     }
@@ -562,28 +563,29 @@ class MklConv2DCustomBackpropFilterOp :
   }
 
   // Allocate output tensor.
-  void AllocateOutputTensor(OpKernelContext* context,
-                  const convolution_backward_weights::primitive_desc& conv_pd,
-                  const memory::dims& output_dims_mkl_order,
-                  memory::format output_tf_format, Tensor** output_tensor) {
-      CHECK_NOTNULL(output_tensor);
-
-      // For BackpropFilter, we convert the output tensor back in Tensorflow
-      // layout. Because typically, BackpropFilter is the last operator in the
-      // graph that emit filter gradient that is provided to ApplyGradient
-      // method to update the filter. But it may be possible to eliminate this
-      // by forwarding filter in MKL layout if we support ApplyGradient method
-      // for MKL layout propagation.
-      MklDnnShape output_mkl_shape;
-      output_mkl_shape.SetMklTensor(false);
-      // output_dims_mkl_order is in OIHW format.
-      // Allocate shape of TF tensor in HWIO format.
-      TensorShape output_tf_shape({output_dims_mkl_order[MklDnnDims::Dim_H],
-                                   output_dims_mkl_order[MklDnnDims::Dim_W],
-                                   output_dims_mkl_order[MklDnnDims::Dim_I],
-                                   output_dims_mkl_order[MklDnnDims::Dim_O]});
-      AllocateOutputSetMklShape(context, 0, output_tensor, output_tf_shape,
-                                output_mkl_shape);
+  void AllocateOutputTensor(
+      OpKernelContext* context,
+      const convolution_backward_weights::primitive_desc& conv_pd,
+      const memory::dims& output_dims_mkl_order,
+      memory::format output_tf_format, Tensor** output_tensor) {
+    CHECK_NOTNULL(output_tensor);
+
+    // For BackpropFilter, we convert the output tensor back in Tensorflow
+    // layout. Because typically, BackpropFilter is the last operator in the
+    // graph that emit filter gradient that is provided to ApplyGradient
+    // method to update the filter. But it may be possible to eliminate this
+    // by forwarding filter in MKL layout if we support ApplyGradient method
+    // for MKL layout propagation.
+    MklDnnShape output_mkl_shape;
+    output_mkl_shape.SetMklTensor(false);
+    // output_dims_mkl_order is in OIHW format.
+    // Allocate shape of TF tensor in HWIO format.
+    TensorShape output_tf_shape({output_dims_mkl_order[MklDnnDims::Dim_H],
+                                 output_dims_mkl_order[MklDnnDims::Dim_W],
+                                 output_dims_mkl_order[MklDnnDims::Dim_I],
+                                 output_dims_mkl_order[MklDnnDims::Dim_O]});
+    AllocateOutputSetMklShape(context, 0, output_tensor, output_tf_shape,
+                              output_mkl_shape);
   }
 
   // Allocate tensor for bias grad
@@ -600,9 +602,9 @@ class MklConv2DCustomBackpropFilterOp :
 
   // Prepare and execute net - checks for input and output reorders.
   void PrepareAndExecutePrimitive(
-                  const convolution_backward_weights::primitive_desc& conv_pd,
-                  MklDnnData<T>* input, MklDnnData<T>* obp,
-                  MklDnnData<T>* output, MklDnnData<T>* bias_grad = nullptr) {
+      const convolution_backward_weights::primitive_desc& conv_pd,
+      MklDnnData<T>* input, MklDnnData<T>* obp, MklDnnData<T>* output,
+      MklDnnData<T>* bias_grad = nullptr) {
     // Create reorders between user layout and MKL layout if it is needed and
     // add it to the net before convolution.
     std::vector<primitive> net;
@@ -612,15 +614,15 @@ class MklConv2DCustomBackpropFilterOp :
     // For BackpropFilter, we convert the output tensor back in Tensorflow
     // layout.
     bool output_reorder_required = output->PrepareReorderToUserMemIfReq(
-                                      conv_pd.diff_weights_primitive_desc());
+        conv_pd.diff_weights_primitive_desc());
 
     if (biasEnabled && (bias_grad != nullptr)) {
-      net.push_back(convolution_backward_weights(conv_pd, input->GetOpMem(),
-                                      obp->GetOpMem(), output->GetOpMem(),
-                                      bias_grad->GetOpMem()));
+      net.push_back(convolution_backward_weights(
+          conv_pd, input->GetOpMem(), obp->GetOpMem(), output->GetOpMem(),
+          bias_grad->GetOpMem()));
     } else {
-      net.push_back(convolution_backward_weights(conv_pd, input->GetOpMem(),
-                                      obp->GetOpMem(), output->GetOpMem()));
+      net.push_back(convolution_backward_weights(
+          conv_pd, input->GetOpMem(), obp->GetOpMem(), output->GetOpMem()));
     }
 
     if (output_reorder_required) {
@@ -631,27 +633,29 @@ class MklConv2DCustomBackpropFilterOp :
   }
 };
 
-#define REGISTER_MKL_FILTER_KERNELS(T)                              \
-  REGISTER_KERNEL_BUILDER(Name("_MklConv2DBackpropFilter")          \
-                              .Device(DEVICE_CPU)                   \
-                              .TypeConstraint<T>("T")               \
-                              .Label(mkl_op_registry::kMklOpLabel), \
-              MklConv2DCustomBackpropFilterOp<CPUDevice, T, false>);\
-  REGISTER_KERNEL_BUILDER(Name("_MklConv2DBackpropFilterWithBias")  \
-                              .Device(DEVICE_CPU)                   \
-                              .TypeConstraint<T>("T")               \
-                              .Label(mkl_op_registry::kMklOpLabel), \
-              MklConv2DCustomBackpropFilterOp<CPUDevice, T, true>); \
-  REGISTER_KERNEL_BUILDER(Name("__MklDummyConv2DBackpropFilterWithBias")  \
-                              .Device(DEVICE_CPU)                   \
-                              .TypeConstraint<T>("T")               \
-                              .Label(mkl_op_registry::kMklOpLabel), \
-              MklDummyOp<CPUDevice, T>);
+#define REGISTER_MKL_FILTER_KERNELS(T)                                   \
+  REGISTER_KERNEL_BUILDER(                                               \
+      Name("_MklConv2DBackpropFilter")                                   \
+          .Device(DEVICE_CPU)                                            \
+          .TypeConstraint<T>("T")                                        \
+          .Label(mkl_op_registry::kMklOpLabel),                          \
+      MklConv2DCustomBackpropFilterOp<CPUDevice, T, false>);             \
+  REGISTER_KERNEL_BUILDER(                                               \
+      Name("_MklConv2DBackpropFilterWithBias")                           \
+          .Device(DEVICE_CPU)                                            \
+          .TypeConstraint<T>("T")                                        \
+          .Label(mkl_op_registry::kMklOpLabel),                          \
+      MklConv2DCustomBackpropFilterOp<CPUDevice, T, true>);              \
+  REGISTER_KERNEL_BUILDER(Name("__MklDummyConv2DBackpropFilterWithBias") \
+                              .Device(DEVICE_CPU)                        \
+                              .TypeConstraint<T>("T")                    \
+                              .Label(mkl_op_registry::kMklOpLabel),      \
+                          MklDummyOp<CPUDevice, T>);
 
 TF_CALL_float(REGISTER_MKL_FILTER_KERNELS);
 #undef REGISTER_MKL_FILTER_KERNELS
 
-#endif  // INTEL_MKL_DNN
+#endif  // INTEL_MKL_ML
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc
index ef6db58d31f125487bd5beefb53710569b0584d8..eeed0095310280997ebb2ec3e848451df378c4fa 100644
--- a/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc
@@ -23,6 +23,8 @@ limitations under the License.
 #define EIGEN_USE_THREADS
 #include <algorithm>
 #include <vector>
+#include "mkl_dnn.h"
+#include "mkl_dnn_types.h"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
@@ -41,22 +43,20 @@ limitations under the License.
 #include "tensorflow/core/util/tensor_format.h"
 #include "tensorflow/core/util/use_cudnn.h"
 #include "tensorflow/core/util/work_sharder.h"
-#include "mkl_dnn.h"
-#include "mkl_dnn_types.h"
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 #include "mkldnn.hpp"
 
-using mkldnn::stream;
-using mkldnn::prop_kind;
 using mkldnn::convolution_backward_data;
+using mkldnn::prop_kind;
+using mkldnn::stream;
 #endif
 
 namespace tensorflow {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
 
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
 
 template <typename Device, class T>
 class MklConv2DCustomBackpropInputOp : public OpKernel {
@@ -359,16 +359,15 @@ class MklConv2DCustomBackpropInputOp : public OpKernel {
 #else
 
 template <typename Device, class T>
-class MklConv2DCustomBackpropInputOp :
-  public MklConv2DBackpropCommonOp<Device, T> {
+class MklConv2DCustomBackpropInputOp
+    : public MklConv2DBackpropCommonOp<Device, T> {
  public:
   explicit MklConv2DCustomBackpropInputOp(OpKernelConstruction* context)
-      : MklConv2DBackpropCommonOp<Device, T>(context) { }
+      : MklConv2DBackpropCommonOp<Device, T>(context) {}
   ~MklConv2DCustomBackpropInputOp() {}
 
  private:
-  const int kInputIndex_Filter = 1,
-            kInputIndex_InputSizes = 0,
+  const int kInputIndex_Filter = 1, kInputIndex_InputSizes = 0,
             kInputIndex_OutBackProp = 2;
   void ValidateMklShapes(const MklDnnShape& input_mkl_shape,
                          const MklDnnShape& filter_mkl_shape,
@@ -377,7 +376,7 @@ class MklConv2DCustomBackpropInputOp :
     // of the Tensor and never an actual tensor. So it will never be in MKL
     // layout.
     CHECK(!input_mkl_shape.IsMklTensor())
-      << "Conv2DBackpropInput: input should not be in MKL Layout";
+        << "Conv2DBackpropInput: input should not be in MKL Layout";
   }
 
   size_t GetInputTensorIndexWithSizes() { return kInputIndex_InputSizes; }
@@ -386,8 +385,10 @@ class MklConv2DCustomBackpropInputOp :
                                const Tensor& input_tensor) {
     TensorShape input_tf_shape;
     CHECK_EQ(TensorShapeUtils::IsVector(input_tensor.shape()), true);
-    CHECK_EQ(TensorShapeUtils::MakeShape(input_tensor.vec<int32>(),
-                                         &input_tf_shape).ok(), true);
+    CHECK_EQ(
+        TensorShapeUtils::MakeShape(input_tensor.vec<int32>(), &input_tf_shape)
+            .ok(),
+        true);
     return input_tf_shape;
   }
 
@@ -414,16 +415,13 @@ class MklConv2DCustomBackpropInputOp :
     return data_format;
   }
 
-  void CreatePrimitive(OpKernelContext* context,
-                       const engine& cpu_engine,
+  void CreatePrimitive(OpKernelContext* context, const engine& cpu_engine,
                        const convolution_forward::primitive_desc& conv_fwd_pd,
                        MklDnnData<T>* input, MklDnnData<T>* filter,
                        MklDnnData<T>* outbackprop, MklDnnData<T>* output,
-                       Tensor** output_tensor,
-                       const memory::dims& strides,
+                       Tensor** output_tensor, const memory::dims& strides,
                        const memory::dims& padding_l,
-                       const memory::dims& padding_r,
-                       padding_kind padding,
+                       const memory::dims& padding_r, padding_kind padding,
                        const memory::dims& bwd_output_dims,
                        memory::format bwd_output_format) {
     CHECK_NOTNULL(context);
@@ -434,19 +432,16 @@ class MklConv2DCustomBackpropInputOp :
     CHECK_NOTNULL(output_tensor);
 
     // Create convolution backward data primitive.
-    auto bwd_desc = convolution_backward_data::desc(convolution_direct,
-                      output->GetOpMemDesc(), filter->GetOpMemDesc(),
-                      outbackprop->GetOpMemDesc(), strides, padding_l,
-                      padding_r, padding);
-
-    auto bwd_pd = convolution_backward_data::primitive_desc(bwd_desc,
-                                                          cpu_engine,
-                                                          conv_fwd_pd);
+    auto bwd_desc = convolution_backward_data::desc(
+        convolution_direct, output->GetOpMemDesc(), filter->GetOpMemDesc(),
+        outbackprop->GetOpMemDesc(), strides, padding_l, padding_r, padding);
 
+    auto bwd_pd = convolution_backward_data::primitive_desc(
+        bwd_desc, cpu_engine, conv_fwd_pd);
 
     // Allocate output tensor in TensorFlow and MKL layout.
-    AllocateOutputTensor(context, bwd_pd, bwd_output_dims,
-                         bwd_output_format, output_tensor);
+    AllocateOutputTensor(context, bwd_pd, bwd_output_dims, bwd_output_format,
+                         output_tensor);
     CHECK_NOTNULL(*output_tensor);
     // Set buffer handle using allocated output tensor.
     output->SetUsrMemDataHandle(*output_tensor);
@@ -455,50 +450,50 @@ class MklConv2DCustomBackpropInputOp :
   }
 
   // Allocate output tensor.
-  void AllocateOutputTensor(OpKernelContext* context,
-                  const convolution_backward_data::primitive_desc& conv_pd,
-                  const memory::dims& output_dims_mkl_order,
-                  memory::format output_tf_format, Tensor** output_tensor) {
-      CHECK_NOTNULL(output_tensor);
-
-      // Output primitive descriptor for backward data is diff_src.
-      auto dst_pd = conv_pd.diff_src_primitive_desc();
-
-      // Allocate shape of Mkl tensor.
-      MklDnnShape output_mkl_shape;
-      output_mkl_shape.SetMklTensor(true);
-      output_mkl_shape.SetMklLayout(&dst_pd);
-      output_mkl_shape.SetElemType(MklDnnType<T>());
-      output_mkl_shape.SetTfLayout(output_dims_mkl_order.size(),
-                                   output_dims_mkl_order, output_tf_format);
-
-      // Allocate shape of TF tensor.
-      TensorShape output_tf_shape;
-      output_tf_shape.AddDim(dst_pd.get_size() / sizeof(T));
-
-      AllocateOutputSetMklShape(context, 0, output_tensor, output_tf_shape,
-                                output_mkl_shape);
+  void AllocateOutputTensor(
+      OpKernelContext* context,
+      const convolution_backward_data::primitive_desc& conv_pd,
+      const memory::dims& output_dims_mkl_order,
+      memory::format output_tf_format, Tensor** output_tensor) {
+    CHECK_NOTNULL(output_tensor);
+
+    // Output primitive descriptor for backward data is diff_src.
+    auto dst_pd = conv_pd.diff_src_primitive_desc();
+
+    // Allocate shape of Mkl tensor.
+    MklDnnShape output_mkl_shape;
+    output_mkl_shape.SetMklTensor(true);
+    output_mkl_shape.SetMklLayout(&dst_pd);
+    output_mkl_shape.SetElemType(MklDnnType<T>());
+    output_mkl_shape.SetTfLayout(output_dims_mkl_order.size(),
+                                 output_dims_mkl_order, output_tf_format);
+
+    // Allocate shape of TF tensor.
+    TensorShape output_tf_shape;
+    output_tf_shape.AddDim(dst_pd.get_size() / sizeof(T));
+
+    AllocateOutputSetMklShape(context, 0, output_tensor, output_tf_shape,
+                              output_mkl_shape);
   }
 
   // Prepare and execute net - checks for input and output reorders.
   void PrepareAndExecutePrimitive(
-                  const convolution_backward_data::primitive_desc& conv_pd,
-                  MklDnnData<T>* filter, MklDnnData<T>* obp,
-                  MklDnnData<T>* output) {
+      const convolution_backward_data::primitive_desc& conv_pd,
+      MklDnnData<T>* filter, MklDnnData<T>* obp, MklDnnData<T>* output) {
     // Create reorders between user layout and MKL layout if it is needed and
     // add it to the net before convolution.
     std::vector<primitive> net;
     filter->CheckReorderToOpMem(conv_pd.weights_primitive_desc(), &net);
     obp->CheckReorderToOpMem(conv_pd.diff_dst_primitive_desc(), &net);
 
-    net.push_back(convolution_backward_data(conv_pd, obp->GetOpMem(),
-                                    filter->GetOpMem(), output->GetOpMem()));
+    net.push_back(convolution_backward_data(
+        conv_pd, obp->GetOpMem(), filter->GetOpMem(), output->GetOpMem()));
 
     stream(stream::kind::eager).submit(net).wait();
   }
 };
 
-#endif  // INTEL_MKL_DNN
+#endif  // INTEL_MKL_ML
 
 #define REGISTER_MKL_CPU_KERNELS(T)                                 \
   REGISTER_KERNEL_BUILDER(Name("_MklConv2DBackpropInput")           \
diff --git a/tensorflow/core/kernels/mkl_conv_ops.cc b/tensorflow/core/kernels/mkl_conv_ops.cc
index 0e77b45993c17815889005c4d313c5489ae2f14b..cbda12689f842273b95debe62c05b6f6edf48e9d 100644
--- a/tensorflow/core/kernels/mkl_conv_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_ops.cc
@@ -18,8 +18,8 @@ limitations under the License.
 
 #include <string.h>
 #include <map>
-#include <vector>
 #include <string>
+#include <vector>
 
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -42,14 +42,16 @@ limitations under the License.
 #include "tensorflow/core/util/mkl_util.h"
 
 
-#ifdef INTEL_MKL_DNN
+
+#ifndef INTEL_MKL_ML
+
 #include "mkldnn.hpp"
 
-using mkldnn::stream;
 using mkldnn::prop_kind;
+using mkldnn::stream;
 
-using mkldnn::convolution_forward;
 using mkldnn::convolution_direct;
+using mkldnn::convolution_forward;
 #else
 #include "mkl_dnn.h"
 #include "mkl_dnn_types.h"
@@ -59,8 +61,8 @@ namespace tensorflow {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
 
-// For now, MKL-ML is default. So making MKL-DNN not a default choice.
-#ifndef INTEL_MKL_DNN
+// MKL-DNN is now default. MKL-ML must be specified explicitly.
+#ifdef INTEL_MKL_ML
 
 template <typename Device, typename T, bool biasEnabled>
 class MklConv2DOp : public OpKernel {
@@ -116,18 +118,19 @@ class MklConv2DOp : public OpKernel {
                                         filter.shape().DebugString()));
 
     for (int i = 0; i < 3; i++) {
-      OP_REQUIRES(context, FastBoundsCheck(filter.dim_size(i),
-                                           std::numeric_limits<int>::max()),
-                  errors::InvalidArgument("filter too large"));
+      OP_REQUIRES(
+          context,
+          FastBoundsCheck(filter.dim_size(i), std::numeric_limits<int>::max()),
+          errors::InvalidArgument("filter too large"));
     }
 
     const int64 input_depth =
         input_in_mkl_format ? GetMklTensorDim(mkl_context.input_shape, 'C')
                             : GetTensorDim(input, data_format_, 'C');
-    OP_REQUIRES(
-        context, input_depth == filter.dim_size(2),
-        errors::InvalidArgument("input and filter must have the same depth: ",
-                                input_depth, " vs ", filter.dim_size(2)));
+    OP_REQUIRES(context, input_depth == filter.dim_size(2),
+                errors::InvalidArgument(
+                    "input and filter must have the same depth: ", input_depth,
+                    " vs ", filter.dim_size(2)));
     // The last dimension for filter is out_depth.
     const int out_depth = static_cast<int>(filter.dim_size(3));
 
@@ -136,9 +139,10 @@ class MklConv2DOp : public OpKernel {
     const int64 input_rows_raw =
         input_in_mkl_format ? GetMklTensorDim(mkl_context.input_shape, 'H')
                             : GetTensorDim(input, data_format_, 'H');
-    OP_REQUIRES(context, FastBoundsCheck(input_rows_raw,
-                                         std::numeric_limits<int>::max()),
-                errors::InvalidArgument("Input rows too large"));
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(input_rows_raw, std::numeric_limits<int>::max()),
+        errors::InvalidArgument("Input rows too large"));
     const int input_rows = static_cast<int>(input_rows_raw);
     const int filter_rows = static_cast<int>(filter.dim_size(0));
 
@@ -147,9 +151,10 @@ class MklConv2DOp : public OpKernel {
     const int64 input_cols_raw =
         input_in_mkl_format ? GetMklTensorDim(mkl_context.input_shape, 'W')
                             : GetTensorDim(input, data_format_, 'W');
-    OP_REQUIRES(context, FastBoundsCheck(input_cols_raw,
-                                         std::numeric_limits<int>::max()),
-                errors::InvalidArgument("Input cols too large"));
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(input_cols_raw, std::numeric_limits<int>::max()),
+        errors::InvalidArgument("Input cols too large"));
     const int input_cols = static_cast<int>(input_cols_raw);
     const int filter_cols = static_cast<int>(filter.dim_size(1));
 
@@ -157,9 +162,10 @@ class MklConv2DOp : public OpKernel {
     const int64 input_batch_raw =
         input_in_mkl_format ? GetMklTensorDim(mkl_context.input_shape, 'N')
                             : GetTensorDim(input, data_format_, 'N');
-    OP_REQUIRES(context, FastBoundsCheck(input_batch_raw,
-                                         std::numeric_limits<int>::max()),
-                errors::InvalidArgument("batch is too large"));
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(input_batch_raw, std::numeric_limits<int>::max()),
+        errors::InvalidArgument("batch is too large"));
     const int batch = static_cast<int>(input_batch_raw);
 
     // For now we take the stride from the second and third dimensions only (we
@@ -313,8 +319,7 @@ class MklConv2DOp : public OpKernel {
     // Temp tensor used to allocate tmp buffers
     Tensor mkl_tmp_input_buf_tensor, mkl_tmp_filter_buf_tensor,
         mkl_tmp_bias_buf_tensor;
-    mkl_context.MklPrepareConvolutionInputs(context,
-                                            &mkl_tmp_input_buf_tensor,
+    mkl_context.MklPrepareConvolutionInputs(context, &mkl_tmp_input_buf_tensor,
                                             &mkl_tmp_filter_buf_tensor,
                                             &mkl_tmp_bias_buf_tensor);
 
@@ -398,8 +403,9 @@ class MklConv2DOp : public OpKernel {
       mkl_convert_input =
           !dnnLayoutCompare_F32(mkl_lt_internal_input, lt_input);
       if (mkl_convert_input) {
-        CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_input,
-                 lt_input, mkl_lt_internal_input), E_SUCCESS);
+        CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_input, lt_input,
+                                         mkl_lt_internal_input),
+                 E_SUCCESS);
         AllocTmpBuffer(context, mkl_tmp_input_buf_tensor, mkl_lt_internal_input,
                        &mkl_buf_convert_input);
         CHECK_EQ(dnnConversionExecute_F32(mkl_prim_convert_input, mkl_buf_input,
@@ -517,8 +523,8 @@ class MklConv2DOp : public OpKernel {
       GetMklShape(context, kInputIndex_Src, &src_mkl_shape);
       GetMklShape(context, kInputIndex_Filter, &filter_mkl_shape);
       OP_REQUIRES(context, filter_mkl_shape.IsMklTensor() == false,
-            errors::InvalidArgument("Filter should not be in "
-            "Mkl Layout"));
+                  errors::InvalidArgument("Filter should not be in "
+                                          "Mkl Layout"));
 
       MklDnnData<T> src(&cpu_engine);
       MklDnnData<T> filter(&cpu_engine);
@@ -531,11 +537,10 @@ class MklConv2DOp : public OpKernel {
       MklDnnConvUtil conv_utl(context, strides_, padding_, data_format_);
       auto src_tf_shape = GetTfShape(context, kInputIndex_Src);
       auto filter_tf_shape = GetTfShape(context, kInputIndex_Filter);
-      conv_utl.GetConvFwdSizesInMklOrder(src_tf_shape, filter_tf_shape,
-                                         &src_dims, &filter_dims, &strides,
-                                         &output_dims_tf_order,
-                                         &output_dims_mkl_order, &padding_l,
-                                         &padding_r);
+      conv_utl.GetConvFwdSizesInMklOrder(
+          src_tf_shape, filter_tf_shape, &src_dims, &filter_dims, &strides,
+          &output_dims_tf_order, &output_dims_mkl_order, &padding_l,
+          &padding_r);
       if (!context->status().ok()) return;
 
       // Check for corner case - if there is nothing to compute, return.
@@ -543,21 +548,20 @@ class MklConv2DOp : public OpKernel {
 
       // Corner cases: output with 0 elements and 0 batch size.
       Tensor* output_tensor = nullptr;
-      if (output_tf_shape.num_elements() == 0 ||
-          output_dims_tf_order[0] == 0) {
+      if (output_tf_shape.num_elements() == 0 || output_dims_tf_order[0] == 0) {
         // TODO(jbobba): Verify correctness here
         //               Need semantics for Null MKL tensor
         MklDnnShape output_mkl_shape;
         output_mkl_shape.SetMklTensor(false);
         AllocateOutputSetMklShape(context, kOutputIndex_Dst, &output_tensor,
-                                    src_tf_shape, output_mkl_shape);
+                                  src_tf_shape, output_mkl_shape);
 
         // MklConv2D also outputs converted filter as 2nd output of Conv2D.
         filter_mkl_shape.SetMklTensor(false);
         Tensor* output_filter_tensor = nullptr;
         AllocateOutputSetMklShape(context, kOutputIndex_Filter,
-                                  &output_filter_tensor,
-                                  filter_tf_shape, filter_mkl_shape);
+                                  &output_filter_tensor, filter_tf_shape,
+                                  filter_mkl_shape);
         return;
       }
 
@@ -570,14 +574,15 @@ class MklConv2DOp : public OpKernel {
       // (src_dims) required is in MKL-DNN order, the layout is Tensorflow's
       // layout (NHWC or NCHW depending on data format).
       auto src_md = src_mkl_shape.IsMklTensor()
-                    ? src_mkl_shape.GetMklLayout()
-                    : memory::desc(src_dims, MklDnnType<T>(), tf_fmt);
+                        ? src_mkl_shape.GetMklLayout()
+                        : memory::desc(src_dims, MklDnnType<T>(), tf_fmt);
       src.SetUsrMem(src_md, &src_tensor);
       // Although filter shape (filter_dims) required is in MKL-DNN order,
       // the layout is Tensorflow's layout (HWIO).
       auto filter_md = filter_mkl_shape.IsMklTensor()  // Should NEVER be true
-                    ? filter_mkl_shape.GetMklLayout()
-          : memory::desc(filter_dims, MklDnnType<T>(), memory::format::hwio);
+                           ? filter_mkl_shape.GetMklLayout()
+                           : memory::desc(filter_dims, MklDnnType<T>(),
+                                          memory::format::hwio);
       filter.SetUsrMem(filter_md, &filter_tensor);
 
       // Set output shape (output_dims) required in MKL-DNN order.
@@ -601,34 +606,34 @@ class MklConv2DOp : public OpKernel {
         bias.SetOpMemDesc(bias_size, memory::format::any);
 
         // Create convolution primitive with Bias.
-        auto conv_desc = convolution_forward::desc(prop_kind::forward,
-            convolution_direct, src.GetOpMemDesc(), filter.GetOpMemDesc(),
-            bias.GetOpMemDesc(), output.GetOpMemDesc(), strides,
-            padding_l, padding_r, TFPaddingToMklDnnPadding(padding_));
-
-        auto conv_prim_desc = convolution_forward::primitive_desc(conv_desc,
-                                                                cpu_engine);
-        AllocateOutputTensor(context, conv_prim_desc,
-                             output_dims_mkl_order, tf_fmt, &output_tensor);
+        auto conv_desc = convolution_forward::desc(
+            prop_kind::forward, convolution_direct, src.GetOpMemDesc(),
+            filter.GetOpMemDesc(), bias.GetOpMemDesc(), output.GetOpMemDesc(),
+            strides, padding_l, padding_r, TFPaddingToMklDnnPadding(padding_));
+
+        auto conv_prim_desc =
+            convolution_forward::primitive_desc(conv_desc, cpu_engine);
+        AllocateOutputTensor(context, conv_prim_desc, output_dims_mkl_order,
+                             tf_fmt, &output_tensor);
         // Set data handle for output.
         output.SetUsrMemDataHandle(output_tensor);
 
         Tensor* filter_out_tensor = nullptr;
         AllocateFilterOutputTensor(context, conv_prim_desc,
-                        TFShapeToMklDnnDims(filter_tf_shape),
-                        &filter_out_tensor);
+                                   TFShapeToMklDnnDims(filter_tf_shape),
+                                   &filter_out_tensor);
 
-        PrepareAndExecuteNet(conv_prim_desc, &src, &filter,
-                            &bias, &output, filter_out_tensor);
+        PrepareAndExecuteNet(conv_prim_desc, &src, &filter, &bias, &output,
+                             filter_out_tensor);
       } else {
         // Create convolution primitive without Bias.
-        auto conv_desc = convolution_forward::desc(prop_kind::forward,
-            convolution_direct, src.GetOpMemDesc(), filter.GetOpMemDesc(),
-            output.GetOpMemDesc(), strides, padding_l, padding_r,
-            TFPaddingToMklDnnPadding(padding_));
+        auto conv_desc = convolution_forward::desc(
+            prop_kind::forward, convolution_direct, src.GetOpMemDesc(),
+            filter.GetOpMemDesc(), output.GetOpMemDesc(), strides, padding_l,
+            padding_r, TFPaddingToMklDnnPadding(padding_));
 
-        auto conv_prim_desc = convolution_forward::primitive_desc(conv_desc,
-                                                                cpu_engine);
+        auto conv_prim_desc =
+            convolution_forward::primitive_desc(conv_desc, cpu_engine);
         AllocateOutputTensor(context, conv_prim_desc, output_dims_mkl_order,
                              tf_fmt, &output_tensor);
         // Set data handle for output.
@@ -636,18 +641,18 @@ class MklConv2DOp : public OpKernel {
 
         Tensor* filter_out_tensor = nullptr;
         AllocateFilterOutputTensor(context, conv_prim_desc,
-                TFShapeToMklDnnDims(filter_tf_shape),
-                &filter_out_tensor);
-        PrepareAndExecuteNet(conv_prim_desc, &src, &filter,
-                            nullptr, &output, filter_out_tensor);
+                                   TFShapeToMklDnnDims(filter_tf_shape),
+                                   &filter_out_tensor);
+        PrepareAndExecuteNet(conv_prim_desc, &src, &filter, nullptr, &output,
+                             filter_out_tensor);
       }
-    } catch (mkldnn::error &e) {
+    } catch (mkldnn::error& e) {
       string error_msg = "Status: " + std::to_string(e.status) +
-                       ", message: " + std::string(e.message) +
-                       ", in file " + std::string(__FILE__) + ":" +
-                       std::to_string(__LINE__);
-      OP_REQUIRES_OK(context,
-        errors::Aborted("Operation received an exception:", error_msg));
+                         ", message: " + std::string(e.message) + ", in file " +
+                         std::string(__FILE__) + ":" + std::to_string(__LINE__);
+      OP_REQUIRES_OK(
+          context,
+          errors::Aborted("Operation received an exception:", error_msg));
     }
   }
 
@@ -655,71 +660,67 @@ class MklConv2DOp : public OpKernel {
   std::vector<int32> strides_;
   Padding padding_;
   TensorFormat data_format_;
-  const int kInputIndex_Src = 0,
-            kInputIndex_Filter = 1,
-            kInputIndex_Bias = 2;
+  const int kInputIndex_Src = 0, kInputIndex_Filter = 1, kInputIndex_Bias = 2;
   const int kOutputIndex_Dst = 0, kOutputIndex_Filter = 1;
 
   // Allocate output tensor.
   void AllocateOutputTensor(
-                  OpKernelContext* context,
-                  const convolution_forward::primitive_desc& conv_prim_desc,
-                  const memory::dims& output_dims_mkl_order,
-                  memory::format output_tf_format, Tensor** output_tensor) {
-      CHECK_NOTNULL(output_tensor);
-      auto dst_pd = conv_prim_desc.dst_primitive_desc();
-
-      // Allocate shape of Mkl tensor.
-      MklDnnShape output_mkl_shape;
-      output_mkl_shape.SetMklTensor(true);
-      output_mkl_shape.SetMklLayout(&dst_pd);
-      output_mkl_shape.SetElemType(MklDnnType<T>());
-      output_mkl_shape.SetTfLayout(output_dims_mkl_order.size(),
-              output_dims_mkl_order, output_tf_format);
-
-      // Allocate shape of TF tensor.
-      TensorShape output_tf_shape;
-      output_tf_shape.AddDim((dst_pd.get_size() / sizeof(T)));
-
-      AllocateOutputSetMklShape(context, kOutputIndex_Dst, output_tensor,
-                                output_tf_shape, output_mkl_shape);
+      OpKernelContext* context,
+      const convolution_forward::primitive_desc& conv_prim_desc,
+      const memory::dims& output_dims_mkl_order,
+      memory::format output_tf_format, Tensor** output_tensor) {
+    CHECK_NOTNULL(output_tensor);
+    auto dst_pd = conv_prim_desc.dst_primitive_desc();
+
+    // Allocate shape of Mkl tensor.
+    MklDnnShape output_mkl_shape;
+    output_mkl_shape.SetMklTensor(true);
+    output_mkl_shape.SetMklLayout(&dst_pd);
+    output_mkl_shape.SetElemType(MklDnnType<T>());
+    output_mkl_shape.SetTfLayout(output_dims_mkl_order.size(),
+                                 output_dims_mkl_order, output_tf_format);
+
+    // Allocate shape of TF tensor.
+    TensorShape output_tf_shape;
+    output_tf_shape.AddDim((dst_pd.get_size() / sizeof(T)));
+
+    AllocateOutputSetMklShape(context, kOutputIndex_Dst, output_tensor,
+                              output_tf_shape, output_mkl_shape);
   }
 
   // Allocate output tensor.
   void AllocateFilterOutputTensor(
-                  OpKernelContext* context,
-                  const convolution_forward::primitive_desc& conv_prim_desc,
-                  const memory::dims& filter_dims_tf_order,
-                  Tensor** filter_tensor) {
-      CHECK_NOTNULL(filter_tensor);
-      auto filter_pd = conv_prim_desc.weights_primitive_desc();
-
-      // Allocate shape of Mkl tensor.
-      MklDnnShape filter_mkl_shape;
-      filter_mkl_shape.SetMklTensor(true);
-      filter_mkl_shape.SetMklLayout(&filter_pd);
-      filter_mkl_shape.SetElemType(MklDnnType<T>());
-
-      // The format of the filter is actually OIhw8i8o, but TF doesn't support
-      // this format. Just use format::blocked for now because the layout
-      // is stored in the MKL data.
-      filter_mkl_shape.SetTfLayout(filter_dims_tf_order.size(),
-                  filter_dims_tf_order, memory::format::blocked);
-
-      // Allocate the data space for the filter to propagate as TF tensor.
-      TensorShape filter_tf_shape;
-      filter_tf_shape.AddDim((filter_pd.get_size() / sizeof(T)));
-
-      AllocateOutputSetMklShape(context, kOutputIndex_Filter, filter_tensor,
-              filter_tf_shape, filter_mkl_shape);
+      OpKernelContext* context,
+      const convolution_forward::primitive_desc& conv_prim_desc,
+      const memory::dims& filter_dims_tf_order, Tensor** filter_tensor) {
+    CHECK_NOTNULL(filter_tensor);
+    auto filter_pd = conv_prim_desc.weights_primitive_desc();
+
+    // Allocate shape of Mkl tensor.
+    MklDnnShape filter_mkl_shape;
+    filter_mkl_shape.SetMklTensor(true);
+    filter_mkl_shape.SetMklLayout(&filter_pd);
+    filter_mkl_shape.SetElemType(MklDnnType<T>());
+
+    // The format of the filter is actually OIhw8i8o, but TF doesn't support
+    // this format. Just use format::blocked for now because the layout
+    // is stored in the MKL data.
+    filter_mkl_shape.SetTfLayout(filter_dims_tf_order.size(),
+                                 filter_dims_tf_order, memory::format::blocked);
+
+    // Allocate the data space for the filter to propagate as TF tensor.
+    TensorShape filter_tf_shape;
+    filter_tf_shape.AddDim((filter_pd.get_size() / sizeof(T)));
+
+    AllocateOutputSetMklShape(context, kOutputIndex_Filter, filter_tensor,
+                              filter_tf_shape, filter_mkl_shape);
   }
 
   // Prepare and execute net - checks for input and output reorders.
   void PrepareAndExecuteNet(
-                  const convolution_forward::primitive_desc& conv_prim_desc,
-                  MklDnnData<T>* src, MklDnnData<T>* filter,
-                  MklDnnData<T>* bias, MklDnnData<T>* output,
-                  Tensor* filter_out_tensor) {
+      const convolution_forward::primitive_desc& conv_prim_desc,
+      MklDnnData<T>* src, MklDnnData<T>* filter, MklDnnData<T>* bias,
+      MklDnnData<T>* output, Tensor* filter_out_tensor) {
     CHECK_NOTNULL(filter_out_tensor);
 
     // Create reorders between user layout and MKL layout if it is needed and
@@ -731,18 +732,20 @@ class MklConv2DOp : public OpKernel {
     // rather than re-order to a temp buffer, reorder directly to the
     // filter output tensor
     filter->CheckReorderToOpMem(conv_prim_desc.weights_primitive_desc(),
-                    filter->GetTensorBuffer(filter_out_tensor), &net);
+                                filter->GetTensorBuffer(filter_out_tensor),
+                                &net);
 
     // Create convolution primitive and add it to net.
     if (bias) {
       CHECK_EQ(biasEnabled, true);
       net.push_back(convolution_forward(conv_prim_desc, src->GetOpMem(),
-                                    filter->GetOpMem(), bias->GetOpMem(),
-                                    output->GetOpMem()));
+                                        filter->GetOpMem(), bias->GetOpMem(),
+                                        output->GetOpMem()));
     } else {
       CHECK_EQ(biasEnabled, false);
       net.push_back(convolution_forward(conv_prim_desc, src->GetOpMem(),
-                                    filter->GetOpMem(), output->GetOpMem()));
+                                        filter->GetOpMem(),
+                                        output->GetOpMem()));
     }
 
     stream(stream::kind::eager).submit(net).wait();
diff --git a/tensorflow/core/kernels/mkl_conv_ops.h b/tensorflow/core/kernels/mkl_conv_ops.h
index c6456bd5c330d8a5672a99dc7f649f3bab4d3519..9dd88221a84671e1f69df13cca1b62b2ce65bb4e 100644
--- a/tensorflow/core/kernels/mkl_conv_ops.h
+++ b/tensorflow/core/kernels/mkl_conv_ops.h
@@ -16,9 +16,9 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_KERNELS_MKL_CONV_OPS_H_
 #define TENSORFLOW_CORE_KERNELS_MKL_CONV_OPS_H_
 
-#include <vector>
 #include <limits>
 #include <string>
+#include <vector>
 
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -27,8 +27,8 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_slice.h"
 #include "tensorflow/core/kernels/bounds_check.h"
-#include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/kernels/conv_grad_ops.h"
+#include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/lib/strings/numbers.h"
@@ -40,19 +40,19 @@ limitations under the License.
 
 #include "tensorflow/core/util/mkl_util.h"
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 #include "mkldnn.hpp"
 
-using mkldnn::stream;
 using mkldnn::prop_kind;
+using mkldnn::stream;
 
-using mkldnn::convolution_forward;
 using mkldnn::convolution_direct;
+using mkldnn::convolution_forward;
 #endif
 
 namespace tensorflow {
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 
 class MklDnnConvUtil {
  protected:
@@ -63,13 +63,13 @@ class MklDnnConvUtil {
 
  public:
   MklDnnConvUtil(OpKernelContext* context, const std::vector<int32>& strides,
-                 Padding pad, TensorFormat fm) : context_(context),
-    strides_(strides), padding_(pad), data_format_(fm) {}
+                 Padding pad, TensorFormat fm)
+      : context_(context), strides_(strides), padding_(pad), data_format_(fm) {}
 
   virtual ~MklDnnConvUtil() { context_ = nullptr; }
 
   // Calculate Convolution strides
-  virtual inline void GetStridesInMklOrder(memory::dims *strides) {
+  virtual inline void GetStridesInMklOrder(memory::dims* strides) {
     // For now we take the stride from the second and third dimensions only
     // (we do not support striding on the batch or depth dimension).
     CHECK_NOTNULL(strides);
@@ -82,14 +82,14 @@ class MklDnnConvUtil {
   // requires input in NCHW format. Function does not return anything.
   // But errors arising from sanity checks are returned in context's
   // status.
-  virtual inline void
-  GetInputSizeInMklOrder(const TensorShape& input_shape,
-                         memory::dims *input_dims) {
-  #define CHECK_BOUNDS(val, err_msg) do {                     \
-    OP_REQUIRES(context_, FastBoundsCheck(val,                \
-                            std::numeric_limits<int>::max()), \
-                errors::InvalidArgument(err_msg));            \
-  }while(0)
+  virtual inline void GetInputSizeInMklOrder(const TensorShape& input_shape,
+                                             memory::dims* input_dims) {
+#define CHECK_BOUNDS(val, err_msg)                                     \
+  do {                                                                 \
+    OP_REQUIRES(context_,                                              \
+                FastBoundsCheck(val, std::numeric_limits<int>::max()), \
+                errors::InvalidArgument(err_msg));                     \
+  } while (0)
 
     CHECK_NOTNULL(input_dims);
 
@@ -112,7 +112,7 @@ class MklDnnConvUtil {
     CHECK_BOUNDS(input_batch_raw, "Input batch too large");
     int input_batch = static_cast<int>(input_batch_raw);
 
-  #undef CHECK_BOUNDS
+#undef CHECK_BOUNDS
 
     // MKL-DNN always requires input in NCHW format.
     std::vector<int> mkldnn_sizes(4, -1);
@@ -138,10 +138,9 @@ class MklDnnConvUtil {
   // forward gets actual tensor as input).
   //
   // TODO(nhasabni): Add similar function for input and filter in MklShape.
-  virtual inline void
-  GetFilterSizeInMklOrder(const TensorShape& input_shape,
-                          const TensorShape& filter_shape,
-                          memory::dims *filter_dims) {
+  virtual inline void GetFilterSizeInMklOrder(const TensorShape& input_shape,
+                                              const TensorShape& filter_shape,
+                                              memory::dims* filter_dims) {
     CHECK_NOTNULL(filter_dims);
 
     OP_REQUIRES(context_, filter_shape.dims() == 4,
@@ -149,17 +148,18 @@ class MklDnnConvUtil {
                                         filter_shape.DebugString()));
 
     for (int i = 0; i < 3; i++) {
-      OP_REQUIRES(context_, FastBoundsCheck(filter_shape.dim_size(i),
-                                           std::numeric_limits<int>::max()),
-                errors::InvalidArgument("filter too large"));
+      OP_REQUIRES(context_,
+                  FastBoundsCheck(filter_shape.dim_size(i),
+                                  std::numeric_limits<int>::max()),
+                  errors::InvalidArgument("filter too large"));
     }
 
     int input_depth = GetTensorDim(input_shape, data_format_, 'C');
 
-    OP_REQUIRES(
-        context_, input_depth == filter_shape.dim_size(2),
-        errors::InvalidArgument("input and filter must have the same depth: ",
-                                input_depth, " vs ", filter_shape.dim_size(2)));
+    OP_REQUIRES(context_, input_depth == filter_shape.dim_size(2),
+                errors::InvalidArgument(
+                    "input and filter must have the same depth: ", input_depth,
+                    " vs ", filter_shape.dim_size(2)));
 
     // TF filter is always in (rows, cols, in_depth, out_depth) order.
     int filter_rows = static_cast<int>(filter_shape.dim_size(0));
@@ -182,25 +182,24 @@ class MklDnnConvUtil {
   // requires filter in OIHW format. Function does not return anything.
   // But errors arising from sanity checks are returned in context's
   // status.
-  virtual inline void
-  GetFilterSizeInMklOrder(size_t src_index, size_t filter_index,
-                          memory::dims *filter_dims) {
+  virtual inline void GetFilterSizeInMklOrder(size_t src_index,
+                                              size_t filter_index,
+                                              memory::dims* filter_dims) {
     CHECK_NOTNULL(filter_dims);
     GetFilterSizeInMklOrder(GetTfShape(context_, src_index),
-                            GetTfShape(context_, filter_index),
-                            filter_dims);
+                            GetTfShape(context_, filter_index), filter_dims);
   }
 
   // Calculate Bias size for 2D Convolution. Function does not return
   // anything, but sets error in context status.
-  virtual inline void
-  GetBiasSizeInMklOrder(size_t bias_index, memory::dims *bias_dims) {
+  virtual inline void GetBiasSizeInMklOrder(size_t bias_index,
+                                            memory::dims* bias_dims) {
     const Tensor& bias = MklGetInput(context_, bias_index);
     OP_REQUIRES(context_, bias.dims() == 1,
                 errors::InvalidArgument("bias must be 1-dimensional: ",
                                         bias.shape().DebugString()));
 
-    *bias_dims = { static_cast<int>(bias.dim_size(0)) };
+    *bias_dims = {static_cast<int>(bias.dim_size(0))};
   }
 
   // Function to calculate output and padding size for 2D convolution.
@@ -212,13 +211,11 @@ class MklDnnConvUtil {
   // status is returned via context status.
   //
   // TODO(nhasabni): Add similar function for input and filter in MklShape.
-  virtual inline void
-  GetOutputAndPadSizeInMklOrder(const TensorShape& input_shape,
-                                const TensorShape& filter_shape,
-                                const memory::dims& strides,
-                                memory::dims *output_dims_tf_order,
-                                memory::dims *output_dims_mkl_order,
-                                memory::dims *pad_l, memory::dims *pad_r) {
+  virtual inline void GetOutputAndPadSizeInMklOrder(
+      const TensorShape& input_shape, const TensorShape& filter_shape,
+      const memory::dims& strides, memory::dims* output_dims_tf_order,
+      memory::dims* output_dims_mkl_order, memory::dims* pad_l,
+      memory::dims* pad_r) {
     CHECK_NOTNULL(output_dims_tf_order);
     CHECK_NOTNULL(output_dims_mkl_order);
     CHECK_NOTNULL(pad_l);
@@ -244,16 +241,16 @@ class MklDnnConvUtil {
     int64 out_rows = 0, out_cols = 0;
     int64 pad_top = 0, pad_bottom = 0, pad_left, pad_right;
 
-    OP_REQUIRES_OK(context_,
-            GetWindowedOutputSizeVerbose(input_rows, filter_rows, stride_rows,
-                                 padding_, &out_rows, &pad_top, &pad_bottom));
-    OP_REQUIRES_OK(context_,
-            GetWindowedOutputSizeVerbose(input_cols, filter_cols, stride_cols,
-                                 padding_, &out_cols, &pad_left, &pad_right));
+    OP_REQUIRES_OK(context_, GetWindowedOutputSizeVerbose(
+                                 input_rows, filter_rows, stride_rows, padding_,
+                                 &out_rows, &pad_top, &pad_bottom));
+    OP_REQUIRES_OK(context_, GetWindowedOutputSizeVerbose(
+                                 input_cols, filter_cols, stride_cols, padding_,
+                                 &out_cols, &pad_left, &pad_right));
 
     // Tensorflow output is in data_format order. (NHWC or NCHW)
-    TensorShape out_shape = ShapeFromFormat(data_format_, out_batch,
-                                            out_rows, out_cols, out_depth);
+    TensorShape out_shape =
+        ShapeFromFormat(data_format_, out_batch, out_rows, out_cols, out_depth);
     *output_dims_tf_order = TFShapeToMklDnnDims(out_shape);
 
     // MKL-DNN always needs output in NCHW format.
@@ -273,12 +270,10 @@ class MklDnnConvUtil {
   // See comment on GetConvOutputAndPadSizeInMklOrder for parameters.
   //
   // Function does not return anything, but sets error in context status.
-  inline void
-  GetOutputAndPadSizeInMklOrder(size_t src_index, size_t filter_index,
-                                const memory::dims& strides,
-                                memory::dims *output_dims_tf_order,
-                                memory::dims *output_dims_mkl_order,
-                                memory::dims *pad_l, memory::dims *pad_r) {
+  inline void GetOutputAndPadSizeInMklOrder(
+      size_t src_index, size_t filter_index, const memory::dims& strides,
+      memory::dims* output_dims_tf_order, memory::dims* output_dims_mkl_order,
+      memory::dims* pad_l, memory::dims* pad_r) {
     CHECK_NOTNULL(output_dims_tf_order);
     CHECK_NOTNULL(output_dims_mkl_order);
     CHECK_NOTNULL(pad_l);
@@ -289,11 +284,11 @@ class MklDnnConvUtil {
 
     OP_REQUIRES(context_, input_tf_shape.dims() == 4,
                 errors::InvalidArgument("input must be 4-dimensional",
-                                          input_tf_shape.DebugString()));
+                                        input_tf_shape.DebugString()));
 
-    GetOutputAndPadSizeInMklOrder(input_tf_shape, filter_tf_shape,
-                                  strides, output_dims_tf_order,
-                                  output_dims_mkl_order, pad_l, pad_r);
+    GetOutputAndPadSizeInMklOrder(input_tf_shape, filter_tf_shape, strides,
+                                  output_dims_tf_order, output_dims_mkl_order,
+                                  pad_l, pad_r);
   }
 
   // Wrapper function to calculate input, filter, and output sizes of
@@ -302,15 +297,12 @@ class MklDnnConvUtil {
   // also calculates strides and paddings for 2D Convolution.
   //
   // Function does not return anything, but sets error in context status.
-  inline void GetConvFwdSizesInMklOrder(const TensorShape& input_shape,
-                                        const TensorShape& filter_shape,
-                                        memory::dims *input_dims,
-                                        memory::dims *filter_dims,
-                                        memory::dims *strides,
-                                        memory::dims *output_dims_tf_order,
-                                        memory::dims *output_dims_mkl_order,
-                                        memory::dims *pad_l,
-                                        memory::dims *pad_r) {
+  inline void GetConvFwdSizesInMklOrder(
+      const TensorShape& input_shape, const TensorShape& filter_shape,
+      memory::dims* input_dims, memory::dims* filter_dims,
+      memory::dims* strides, memory::dims* output_dims_tf_order,
+      memory::dims* output_dims_mkl_order, memory::dims* pad_l,
+      memory::dims* pad_r) {
     CHECK_NOTNULL(input_dims);
     CHECK_NOTNULL(filter_dims);
     CHECK_NOTNULL(strides);
@@ -325,8 +317,7 @@ class MklDnnConvUtil {
     if (!context_->status().ok()) return;
     GetStridesInMklOrder(strides);
     GetOutputAndPadSizeInMklOrder(input_shape, filter_shape, *strides,
-                                  output_dims_tf_order,
-                                  output_dims_mkl_order,
+                                  output_dims_tf_order, output_dims_mkl_order,
                                   pad_l, pad_r);
     if (!context_->status().ok()) return;
   }
@@ -337,7 +328,7 @@ class MklDnnConvUtil {
 /////////////////////////////////////////////////////////////////////
 
 template <typename Device, class T>
-class MklConv2DBackpropCommonOp :  public OpKernel {
+class MklConv2DBackpropCommonOp : public OpKernel {
  public:
   ~MklConv2DBackpropCommonOp() {}
   explicit MklConv2DBackpropCommonOp(OpKernelConstruction* context)
@@ -397,12 +388,11 @@ class MklConv2DBackpropCommonOp :  public OpKernel {
           outbprop_tf_shape.num_elements() == 0) {
         MklDnnShape output_mkl_shape;
         output_mkl_shape.SetMklTensor(false);
-        TensorShape output_tf_shape = GetOutputTfShape(input_tf_shape,
-                                                       filter_tf_shape,
-                                                       outbprop_tf_shape);
+        TensorShape output_tf_shape = GetOutputTfShape(
+            input_tf_shape, filter_tf_shape, outbprop_tf_shape);
         const int kOutputIdx = 0;
         AllocateOutputSetMklShape(context, kOutputIdx, &output_tensor,
-                                    output_tf_shape, output_mkl_shape);
+                                  output_tf_shape, output_mkl_shape);
         CHECK_NOTNULL(output_tensor);
 
         // if output tensor has more than 0 elements, we need to 0 them out.
@@ -421,12 +411,10 @@ class MklConv2DBackpropCommonOp :  public OpKernel {
 
       // Get forward convolution parameters.
       MklDnnConvUtil conv_utl(context, strides_, padding_, data_format_);
-      conv_utl.GetConvFwdSizesInMklOrder(input_tf_shape, filter_tf_shape,
-                                         &fwd_input_dims, &fwd_filter_dims,
-                                         &strides,
-                                         &fwd_output_dims_tf_order,
-                                         &fwd_output_dims,
-                                         &padding_l, &padding_r);
+      conv_utl.GetConvFwdSizesInMklOrder(
+          input_tf_shape, filter_tf_shape, &fwd_input_dims, &fwd_filter_dims,
+          &strides, &fwd_output_dims_tf_order, &fwd_output_dims, &padding_l,
+          &padding_r);
       if (!context->status().ok()) return;
 
       // Create Convolution forward descriptor since Convolution backward
@@ -437,20 +425,22 @@ class MklConv2DBackpropCommonOp :  public OpKernel {
       // construct input TF layout. For TF layout, although input shape
       // required is in MKL-DNN order, the layout is Tensorflow's layout
       // (NHWC or NCHW depending on data format).
-      auto fwd_input_md = input_mkl_shape.IsMklTensor() ?
-                          input_mkl_shape.GetMklLayout() :
-                       memory::desc(fwd_input_dims, MklDnnType<T>(), tf_fmt);
+      auto fwd_input_md =
+          input_mkl_shape.IsMklTensor()
+              ? input_mkl_shape.GetMklLayout()
+              : memory::desc(fwd_input_dims, MklDnnType<T>(), tf_fmt);
       // If filter is in MKL layout, then simply grab filter layout; otherwise
       // construct filter in TF layout. For TF layout, filter is in HWIO format.
-      auto fwd_filter_md = filter_mkl_shape.IsMklTensor() ?
-                          filter_mkl_shape.GetMklLayout() :
-                          memory::desc(fwd_filter_dims, MklDnnType<T>(),
-                                       memory::format::hwio);
+      auto fwd_filter_md = filter_mkl_shape.IsMklTensor()
+                               ? filter_mkl_shape.GetMklLayout()
+                               : memory::desc(fwd_filter_dims, MklDnnType<T>(),
+                                              memory::format::hwio);
       // Tensorflow Output of Conv2D is in data_format order.
       auto fwd_out_md = memory::desc(fwd_output_dims, MklDnnType<T>(), tf_fmt);
-      auto fwd_desc = convolution_forward::desc(prop_kind::forward,
-            convolution_direct, fwd_input_md, fwd_filter_md, fwd_out_md,
-            strides, padding_l, padding_r, TFPaddingToMklDnnPadding(padding_));
+      auto fwd_desc = convolution_forward::desc(
+          prop_kind::forward, convolution_direct, fwd_input_md, fwd_filter_md,
+          fwd_out_md, strides, padding_l, padding_r,
+          TFPaddingToMklDnnPadding(padding_));
       auto fwd_pd = convolution_forward::primitive_desc(fwd_desc, cpu_engine);
 
       // Create memory for user data. Describe how the inputs and outputs of
@@ -495,17 +485,16 @@ class MklConv2DBackpropCommonOp :  public OpKernel {
 
       // Operator-specific call to create and execute primitive.
       CreatePrimitive(context, cpu_engine, fwd_pd, &input, &filter,
-                      &outbackprop, &output, &output_tensor,
-                      strides, padding_l, padding_r,
-                      TFPaddingToMklDnnPadding(padding_),
+                      &outbackprop, &output, &output_tensor, strides, padding_l,
+                      padding_r, TFPaddingToMklDnnPadding(padding_),
                       bwd_output_dims, bwd_output_format);
-    } catch (mkldnn::error &e) {
-     string error_msg = "Status: " + std::to_string(e.status) +
-                       ", message: " + string(e.message) +
-                       ", in file " + string(__FILE__) + ":" +
-                       std::to_string(__LINE__);
-     OP_REQUIRES_OK(context, errors::Aborted("Operation received an exception:",
-                                            error_msg));
+    } catch (mkldnn::error& e) {
+      string error_msg = "Status: " + std::to_string(e.status) +
+                         ", message: " + string(e.message) + ", in file " +
+                         string(__FILE__) + ":" + std::to_string(__LINE__);
+      OP_REQUIRES_OK(
+          context,
+          errors::Aborted("Operation received an exception:", error_msg));
     }
   }
 
@@ -523,11 +512,11 @@ class MklConv2DBackpropCommonOp :  public OpKernel {
 
   /// Get TensorFlow shape of input tensor.
   virtual TensorShape MakeInputTfShape(OpKernelContext* context,
-                                      const Tensor& input_tensor) = 0;
+                                       const Tensor& input_tensor) = 0;
 
   /// Get TensorFlow shape of filter tensor.
   virtual TensorShape MakeFilterTfShape(OpKernelContext* context,
-                                       const Tensor& filter_tensor) = 0;
+                                        const Tensor& filter_tensor) = 0;
 
   /// Get the TensorFlow shape of output tensor.
   virtual TensorShape GetOutputTfShape(const TensorShape& input_shape,
@@ -536,9 +525,9 @@ class MklConv2DBackpropCommonOp :  public OpKernel {
 
   /// Get shape of output in MKL-DNN order. Computes shape of output from
   /// input shape (fwd_input_dims) and filter shape (fwd_filter_dims).
-  virtual
-  const memory::dims& GetOutputDims(const memory::dims& fwd_input_dims,
-                                    const memory::dims& fwd_filter_dims) = 0;
+  virtual const memory::dims& GetOutputDims(
+      const memory::dims& fwd_input_dims,
+      const memory::dims& fwd_filter_dims) = 0;
 
   /// Get data_format of output in MKL-DNN order. If output data format is
   /// same as input data format, then it simply returns value of data_format
@@ -546,24 +535,25 @@ class MklConv2DBackpropCommonOp :  public OpKernel {
   virtual memory::format GetOutputFormat(const memory::format data_format) = 0;
 
   /// Create and execute the primitive storing output in the output_tensor.
-  virtual void CreatePrimitive(OpKernelContext* context,
-    const engine& cpu_engine,
-    const convolution_forward::primitive_desc& conv_fwd_pd,
-    MklDnnData<T>* input, MklDnnData<T>* filter, MklDnnData<T>* outbackprop,
-    MklDnnData<T>* output, Tensor** output_tensor, const memory::dims& strides,
-    const memory::dims& padding_l, const memory::dims& padding_r,
-    padding_kind padding, const memory::dims& bwd_output_dims,
-    memory::format bwd_output_format) = 0;
+  virtual void CreatePrimitive(
+      OpKernelContext* context, const engine& cpu_engine,
+      const convolution_forward::primitive_desc& conv_fwd_pd,
+      MklDnnData<T>* input, MklDnnData<T>* filter, MklDnnData<T>* outbackprop,
+      MklDnnData<T>* output, Tensor** output_tensor,
+      const memory::dims& strides, const memory::dims& padding_l,
+      const memory::dims& padding_r, padding_kind padding,
+      const memory::dims& bwd_output_dims,
+      memory::format bwd_output_format) = 0;
 
   // Get the data_format {NCHW, NHWC}
-  TensorFormat GetTFDataFormat () { return data_format_; }
+  TensorFormat GetTFDataFormat() { return data_format_; }
 
  private:
   std::vector<int32> strides_;
   Padding padding_;
   TensorFormat data_format_;
 };
-#endif  // INTEL_MKL_DNN
+#endif  // INTEL_MKL_ML
 
 /////////////////////////////////////////////////////////////////////
 ///  Dummy Mkl op that is just used for operators that are intermediate
@@ -575,12 +565,12 @@ class MklDummyOp : public OpKernel {
  public:
   ~MklDummyOp() {}
 
-  explicit MklDummyOp(OpKernelConstruction* context) :
-    OpKernel(context) {}
+  explicit MklDummyOp(OpKernelConstruction* context) : OpKernel(context) {}
 
   void Compute(OpKernelContext* context) override {
-    TF_CHECK_OK(errors::Unimplemented("This is a dummy op."
-                                      "It should not have been invoked."));
+    TF_CHECK_OK(
+        errors::Unimplemented("This is a dummy op."
+                              "It should not have been invoked."));
   }
 };
 
diff --git a/tensorflow/core/kernels/mkl_cwise_ops_common.cc b/tensorflow/core/kernels/mkl_cwise_ops_common.cc
index c065724e0dbbe091d253eb2315c9a5f3c041d695..58f0c30f32b0eebd7ceff856b2e3bd881b28121c 100644
--- a/tensorflow/core/kernels/mkl_cwise_ops_common.cc
+++ b/tensorflow/core/kernels/mkl_cwise_ops_common.cc
@@ -1,4 +1,4 @@
-﻿/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0(the "License");
 you may not use this file except in compliance with the License.
diff --git a/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc b/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc
index 8340a91d059de16dfbabf53067f24fbca1bc1385..8313224d7fe3e2d307d3642ced5b277b95c85cdb 100644
--- a/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc
+++ b/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc
@@ -25,15 +25,15 @@ limitations under the License.
 #include "mkl_dnn_types.h"
 #include "tensorflow/core/util/mkl_util.h"
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 #include "mkldnn.hpp"
 
-using mkldnn::stream;
+using mkldnn::batch_normalization_backward;
+using mkldnn::batch_normalization_forward;
 using mkldnn::prop_kind;
-using mkldnn::use_scale_shift;
+using mkldnn::stream;
 using mkldnn::use_global_stats;
-using mkldnn::batch_normalization_forward;
-using mkldnn::batch_normalization_backward;
+using mkldnn::use_scale_shift;
 #endif
 
 // TODO(inteltf) Address comments from PR 8968.
@@ -41,7 +41,7 @@ using mkldnn::batch_normalization_backward;
 namespace tensorflow {
 using CPUDevice = Eigen::ThreadPoolDevice;
 
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
 
 template <typename Device, typename T>
 class MklFusedBatchNormOp : public OpKernel {
@@ -601,7 +601,7 @@ class MklFusedBatchNormGradOp : public OpKernel {
       mkl_res_batchnorm_bwd[dnnResourceSrc] =
           (mkl_convert_input) ? mkl_buf_converted_input : mkl_buf_input;
 
-     bool mkl_convert_out_backprop;
+      bool mkl_convert_out_backprop;
       dnnPrimitive_t mkl_prim_convert_out_backprop = nullptr;
       dnnLayout_t mkl_lt_internal_out_backprop = nullptr;
       void* mkl_buf_converted_out_backprop = nullptr;
@@ -683,7 +683,7 @@ class MklFusedBatchNormGradOp : public OpKernel {
 };
 #endif
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 
 template <typename Device, typename T>
 class MklFusedBatchNormOp : public OpKernel {
@@ -709,12 +709,11 @@ class MklFusedBatchNormOp : public OpKernel {
       const size_t kMeanIndex = 3;      // index of est_mean tensor
       const size_t kVarianceIndex = 4;  // index of est_variance tensor
 
-      const Tensor& src_tensor          = MklGetInput(context, kSrcIndex);
-      const Tensor& scale_tensor        = MklGetInput(context, kScaleIndex);
-      const Tensor& shift_tensor        = MklGetInput(context, kShiftIndex);
-      const Tensor& est_mean_tensor     = MklGetInput(context, kMeanIndex);
-      const Tensor& est_variance_tensor = MklGetInput(context,
-                                                      kVarianceIndex);
+      const Tensor& src_tensor = MklGetInput(context, kSrcIndex);
+      const Tensor& scale_tensor = MklGetInput(context, kScaleIndex);
+      const Tensor& shift_tensor = MklGetInput(context, kShiftIndex);
+      const Tensor& est_mean_tensor = MklGetInput(context, kMeanIndex);
+      const Tensor& est_variance_tensor = MklGetInput(context, kVarianceIndex);
 
       TensorShape tf_shape_src;
       MklDnnShape dnn_shape_src;
@@ -723,37 +722,34 @@ class MklFusedBatchNormOp : public OpKernel {
       if (dnn_shape_src.IsMklTensor()) {
         tf_shape_src = dnn_shape_src.GetTfShape();
         OP_REQUIRES(context, dnn_shape_src.GetDimension() == 4,
-                    errors::InvalidArgument(
-                        "input must be 4-dimensional",
-                        src_tensor.shape().DebugString()));
+                    errors::InvalidArgument("input must be 4-dimensional",
+                                            src_tensor.shape().DebugString()));
       } else {
         tf_shape_src = src_tensor.shape();
         OP_REQUIRES(context, src_tensor.dims() == 4,
-                    errors::InvalidArgument(
-                        "input must be 4-dimensional",
-                        src_tensor.shape().DebugString()));
+                    errors::InvalidArgument("input must be 4-dimensional",
+                                            src_tensor.shape().DebugString()));
       }
       OP_REQUIRES(context, scale_tensor.dims() == 1,
-                  errors::InvalidArgument(
-                      "scale must be 1-dimensional",
-                      scale_tensor.shape().DebugString()));
+                  errors::InvalidArgument("scale must be 1-dimensional",
+                                          scale_tensor.shape().DebugString()));
       OP_REQUIRES(context, shift_tensor.dims() == 1,
                   errors::InvalidArgument("offset must be 1-dimensional",
-                                        shift_tensor.shape().DebugString()));
-      OP_REQUIRES(context, est_mean_tensor.dims() == 1,
-                  errors::InvalidArgument(
-                      "estimated_mean must be 1-dimensional",
-                      est_mean_tensor.shape().DebugString()));
-      OP_REQUIRES(context, est_variance_tensor.dims() == 1,
-                  errors::InvalidArgument(
-                      "estimated_variance must be 1-dimensional",
-                      est_variance_tensor.shape().DebugString()));
+                                          shift_tensor.shape().DebugString()));
+      OP_REQUIRES(
+          context, est_mean_tensor.dims() == 1,
+          errors::InvalidArgument("estimated_mean must be 1-dimensional",
+                                  est_mean_tensor.shape().DebugString()));
+      OP_REQUIRES(
+          context, est_variance_tensor.dims() == 1,
+          errors::InvalidArgument("estimated_variance must be 1-dimensional",
+                                  est_variance_tensor.shape().DebugString()));
 
       if (is_training_) {
-        OP_REQUIRES(context, est_mean_tensor.dim_size(0) == 0,
-                    errors::InvalidArgument(
-                        "estimated_mean must be empty for training",
-                        est_mean_tensor.shape().DebugString()));
+        OP_REQUIRES(
+            context, est_mean_tensor.dim_size(0) == 0,
+            errors::InvalidArgument("estimated_mean must be empty for training",
+                                    est_mean_tensor.shape().DebugString()));
         OP_REQUIRES(context, est_variance_tensor.dim_size(0) == 0,
                     errors::InvalidArgument(
                         "estimated_variance must be empty for training",
@@ -763,11 +759,9 @@ class MklFusedBatchNormOp : public OpKernel {
       // special case: input with 0 element and 0 batch size
       Tensor* dst_tensor = nullptr;
       if (tf_shape_src.num_elements() == 0) {
-         HandleEmptyInput(context,
-                          tf_shape_src,
-                          scale_tensor.shape(),
-                          &dst_tensor);
-         return;
+        HandleEmptyInput(context, tf_shape_src, scale_tensor.shape(),
+                         &dst_tensor);
+        return;
       }
 
       if (dnn_shape_src.IsMklTensor())
@@ -783,11 +777,8 @@ class MklFusedBatchNormOp : public OpKernel {
       Tensor* batch_variance_tensor = nullptr;
       Tensor* saved_mean_tensor = nullptr;
       Tensor* saved_variance_tensor = nullptr;
-      AllocateTFOutputs(context,
-                        scale_tensor.shape(),
-                        &batch_mean_tensor,
-                        &batch_variance_tensor,
-                        &saved_mean_tensor,
+      AllocateTFOutputs(context, scale_tensor.shape(), &batch_mean_tensor,
+                        &batch_variance_tensor, &saved_mean_tensor,
                         &saved_variance_tensor);
 
       if (is_training_)
@@ -815,69 +806,63 @@ class MklFusedBatchNormOp : public OpKernel {
         src_dims = TFShapeToMklDnnDimsInNCHW(dnn_shape_src.GetTfShape(),
                                              tensor_format_);
       } else {
-        src_dims = TFShapeToMklDnnDimsInNCHW(src_tensor.shape(),
-                                             tensor_format_);
+        src_dims =
+            TFShapeToMklDnnDimsInNCHW(src_tensor.shape(), tensor_format_);
       }
 
       auto src_md = dnn_shape_src.IsMklTensor()
-                    ? dnn_shape_src.GetMklLayout()
-                    : memory::desc(src_dims, MklDnnType<T>(), format_m);
+                        ? dnn_shape_src.GetMklLayout()
+                        : memory::desc(src_dims, MklDnnType<T>(), format_m);
       src.SetUsrMem(src_md, &src_tensor);
 
       // set weights primitive
       // MKL-DNN packs scale & shift as "weights":
       // <scale>...<scale><shift>...<shift>
-      auto weights_desc = memory::desc({2, depth_},
-                                       MklDnnType<T>(),
-                                       memory::format::nc);
+      auto weights_desc =
+          memory::desc({2, depth_}, MklDnnType<T>(), memory::format::nc);
       auto weights_pd = memory::primitive_desc(weights_desc, cpu_engine);
       auto weights_m = memory(weights_pd);
-      T* weights_data = reinterpret_cast<T*>(
-                        weights_m.get_data_handle());
-      T* scale_tf = reinterpret_cast<T*>(
-                    const_cast<T*>(scale_tensor.flat<T>().data()));
-      T* shift_tf = reinterpret_cast<T*>(
-                    const_cast<T*>(shift_tensor.flat<T>().data()));
-
-      for (int k=0; k < depth_; k++) {
+      T* weights_data = reinterpret_cast<T*>(weights_m.get_data_handle());
+      T* scale_tf =
+          reinterpret_cast<T*>(const_cast<T*>(scale_tensor.flat<T>().data()));
+      T* shift_tf =
+          reinterpret_cast<T*>(const_cast<T*>(shift_tensor.flat<T>().data()));
+
+      for (int k = 0; k < depth_; k++) {
         weights_data[k] = scale_tf[k];
         weights_data[k + depth_] = shift_tf[k];
       }
 
       // set mean primitive
-      auto mean_desc = memory::desc({1, depth_},
-                                    MklDnnType<T>(),
-                                    memory::format::nc);
+      auto mean_desc =
+          memory::desc({1, depth_}, MklDnnType<T>(), memory::format::nc);
       auto mean_pd = memory::primitive_desc(mean_desc, cpu_engine);
-      char* saved_mean_data_tf = reinterpret_cast<char*>
-                                 (saved_mean_tensor->flat<T>().data());
-      std::memcpy(saved_mean_data_tf,
-                  reinterpret_cast<char*>(mean_values_),
-                  depth_*sizeof(T));
-      auto mean_m = memory(mean_pd,
-                           reinterpret_cast<void*>(saved_mean_data_tf));
+      char* saved_mean_data_tf =
+          reinterpret_cast<char*>(saved_mean_tensor->flat<T>().data());
+      std::memcpy(saved_mean_data_tf, reinterpret_cast<char*>(mean_values_),
+                  depth_ * sizeof(T));
+      auto mean_m =
+          memory(mean_pd, reinterpret_cast<void*>(saved_mean_data_tf));
 
       // set variance primitive
-      auto variance_desc = memory::desc({1, depth_},
-                                    MklDnnType<T>(),
-                                    memory::format::nc);
+      auto variance_desc =
+          memory::desc({1, depth_}, MklDnnType<T>(), memory::format::nc);
       auto variance_pd = memory::primitive_desc(variance_desc, cpu_engine);
-      char* saved_variance_data_tf = reinterpret_cast<char*>
-                  (saved_variance_tensor->flat<T>().data());
+      char* saved_variance_data_tf =
+          reinterpret_cast<char*>(saved_variance_tensor->flat<T>().data());
       std::memcpy(saved_variance_data_tf,
                   reinterpret_cast<char*>(variance_values_),
-                  depth_*sizeof(T));
+                  depth_ * sizeof(T));
       auto variance_m = memory(variance_pd, saved_variance_data_tf);
 
-      prop_kind pk = (is_training_) ?
-                     prop_kind::forward_training :
-                     prop_kind::forward_scoring;
+      prop_kind pk = (is_training_) ? prop_kind::forward_training
+                                    : prop_kind::forward_scoring;
       auto bnrm_fwd_desc = batch_normalization_forward::desc(
-                               pk, src.GetUsrMemDesc(), epsilon_,
-                               is_training_ ? use_scale_shift :
-                               (use_scale_shift | use_global_stats));
+          pk, src.GetUsrMemDesc(), epsilon_,
+          is_training_ ? use_scale_shift
+                       : (use_scale_shift | use_global_stats));
       auto bnrm_fwd_pd = batch_normalization_forward::primitive_desc(
-                             bnrm_fwd_desc, cpu_engine);
+          bnrm_fwd_desc, cpu_engine);
 
       // allocate dst tensor
       MklDnnShape dnn_shape_dst;
@@ -887,47 +872,39 @@ class MklFusedBatchNormOp : public OpKernel {
         auto dst_pd = bnrm_fwd_pd.dst_primitive_desc();
         dnn_shape_dst.SetMklLayout(&dst_pd);
         dnn_shape_dst.SetElemType(MklDnnType<T>());
-        dnn_shape_dst.SetTfLayout(dnn_shape_src.GetDimension(),
-                                  src_dims, format_m);
-        tf_shape_dst.AddDim(dst_pd.get_size()/sizeof(T));
+        dnn_shape_dst.SetTfLayout(dnn_shape_src.GetDimension(), src_dims,
+                                  format_m);
+        tf_shape_dst.AddDim(dst_pd.get_size() / sizeof(T));
       } else {
         dnn_shape_dst.SetMklTensor(false);
         tf_shape_dst = src_tensor.shape();
       }
-      AllocateOutputSetMklShape(context, kDstIndex, &dst_tensor,
-                                tf_shape_dst, dnn_shape_dst);
+      AllocateOutputSetMklShape(context, kDstIndex, &dst_tensor, tf_shape_dst,
+                                dnn_shape_dst);
 
       // Output of batchnorm has same shape as input.
       dst.SetUsrMem(src_md, dst_tensor);
 
       primitive bnrm_fwd_op;
       if (is_training_) {
-        bnrm_fwd_op = batch_normalization_forward(
-                          bnrm_fwd_pd,
-                          src.GetOpMem(),
-                          weights_m,
-                          dst.GetOpMem(),
-                          mean_m,
-                          variance_m);
+        bnrm_fwd_op =
+            batch_normalization_forward(bnrm_fwd_pd, src.GetOpMem(), weights_m,
+                                        dst.GetOpMem(), mean_m, variance_m);
       } else {
         bnrm_fwd_op = batch_normalization_forward(
-                          bnrm_fwd_pd,
-                          src.GetOpMem(),
-                          mean_m,
-                          variance_m,
-                          (const primitive::at) weights_m,
-                          dst.GetOpMem());
+            bnrm_fwd_pd, src.GetOpMem(), mean_m, variance_m,
+            (const primitive::at)weights_m, dst.GetOpMem());
       }
       std::vector<primitive> net;
       net.push_back(bnrm_fwd_op);
       stream(stream::kind::eager).submit(net).wait();
 
       // copy batch_mean data
-      T* batch_mean_data_tf = reinterpret_cast<T*>(
-                                batch_mean_tensor->flat<T>().data());
+      T* batch_mean_data_tf =
+          reinterpret_cast<T*>(batch_mean_tensor->flat<T>().data());
       std::memcpy(reinterpret_cast<char*>(batch_mean_data_tf),
                   reinterpret_cast<char*>(mean_m.get_data_handle()),
-                  depth_*sizeof(T));
+                  depth_ * sizeof(T));
 
       // copy batch_variance data with Bessel's correction
       // if training mode is on
@@ -937,18 +914,17 @@ class MklFusedBatchNormOp : public OpKernel {
         size_t adjust_size = orig_size - 1;
         adjust_factor = (static_cast<float>(orig_size)) / adjust_size;
       }
-      for (int k=0; k < depth_; k++)
+      for (int k = 0; k < depth_; k++)
         batch_variance_tensor->flat<T>().data()[k] =
-            (reinterpret_cast<T*>(variance_m.get_data_handle()))[k]
-            * adjust_factor;
-    } catch (mkldnn::error &e) {
+            (reinterpret_cast<T*>(variance_m.get_data_handle()))[k] *
+            adjust_factor;
+    } catch (mkldnn::error& e) {
       string error_msg = "Status: " + std::to_string(e.status) +
-                         ", message: " + string(e.message) +
-                         ", in file " + string(__FILE__) + ":" +
-                         std::to_string(__LINE__);
-      OP_REQUIRES_OK(context,
-                     errors::Aborted("Operation received an exception:",
-                     error_msg));
+                         ", message: " + string(e.message) + ", in file " +
+                         string(__FILE__) + ":" + std::to_string(__LINE__);
+      OP_REQUIRES_OK(
+          context,
+          errors::Aborted("Operation received an exception:", error_msg));
     }
   }
 
@@ -958,7 +934,7 @@ class MklFusedBatchNormOp : public OpKernel {
   bool is_training_;
   T* mean_values_;
   T* variance_values_;
-  size_t depth_;          // batch normalization is done for per channel.
+  size_t depth_;  // batch normalization is done for per channel.
 
   void ExtractParams(OpKernelContext* context) {
     const Tensor& input = MklGetInput(context, 0);
@@ -966,23 +942,20 @@ class MklFusedBatchNormOp : public OpKernel {
   }
 
   void SetMeanVariance(const Tensor& mean, const Tensor& variance) {
-    mean_values_ = reinterpret_cast<T*>(
-                       const_cast<T*>(mean.flat<T>().data()));
-    variance_values_ = reinterpret_cast<T*>(
-                       const_cast<T*>(variance.flat<T>().data()));
+    mean_values_ = reinterpret_cast<T*>(const_cast<T*>(mean.flat<T>().data()));
+    variance_values_ =
+        reinterpret_cast<T*>(const_cast<T*>(variance.flat<T>().data()));
   }
 
-  void HandleEmptyInput(OpKernelContext* context,
-                        TensorShape tf_shape_src,
-                        TensorShape tf_shape_scale,
-                        Tensor** dst_tensor) {
+  void HandleEmptyInput(OpKernelContext* context, TensorShape tf_shape_src,
+                        TensorShape tf_shape_scale, Tensor** dst_tensor) {
     CHECK_NOTNULL(dst_tensor);
 
     const size_t kDstIndex = 0;
     MklDnnShape dnn_shape_dst;
     dnn_shape_dst.SetMklTensor(false);
-    AllocateOutputSetMklShape(context, kDstIndex, dst_tensor,
-                              tf_shape_src, dnn_shape_dst);
+    AllocateOutputSetMklShape(context, kDstIndex, dst_tensor, tf_shape_src,
+                              dnn_shape_dst);
     CHECK_NOTNULL(*dst_tensor);
     memset(const_cast<char*>((*dst_tensor)->tensor_data().data()), 0,
            (*dst_tensor)->tensor_data().size());
@@ -991,15 +964,12 @@ class MklFusedBatchNormOp : public OpKernel {
     Tensor* batch_variance_tensor = nullptr;
     Tensor* saved_mean_tensor = nullptr;
     Tensor* saved_variance_tensor = nullptr;
-    AllocateTFOutputs(context, tf_shape_scale,
-                      &batch_mean_tensor,
-                      &batch_variance_tensor,
-                      &saved_mean_tensor,
+    AllocateTFOutputs(context, tf_shape_scale, &batch_mean_tensor,
+                      &batch_variance_tensor, &saved_mean_tensor,
                       &saved_variance_tensor);
   }
 
-  void AllocateTFOutputs(OpKernelContext* context,
-                         TensorShape tf_shape_scale,
+  void AllocateTFOutputs(OpKernelContext* context, TensorShape tf_shape_scale,
                          Tensor** batch_mean_tensor,
                          Tensor** batch_variance_tensor,
                          Tensor** saved_mean_tensor,
@@ -1017,51 +987,43 @@ class MklFusedBatchNormOp : public OpKernel {
     // allocate batch mean output tensor
     MklDnnShape mkl_shape_batch_mean;
     mkl_shape_batch_mean.SetMklTensor(false);
-    AllocateOutputSetMklShape(context,
-                              kBatchMeanIndex,
-                              batch_mean_tensor,
-                              tf_shape_scale,
-                              mkl_shape_batch_mean);
+    AllocateOutputSetMklShape(context, kBatchMeanIndex, batch_mean_tensor,
+                              tf_shape_scale, mkl_shape_batch_mean);
     CHECK_NOTNULL(*batch_mean_tensor);
     // set NAN mean value in case of empty input tensor
-    for (int k=0; k < tf_shape_scale.num_elements(); k++)
+    for (int k = 0; k < tf_shape_scale.num_elements(); k++)
       (*batch_mean_tensor)->flat<T>().data()[k] = NAN;
 
     // allocate batch variance output tensor
     MklDnnShape mkl_shape_batch_variance;
     mkl_shape_batch_variance.SetMklTensor(false);
-    AllocateOutputSetMklShape(context,
-                              kBatchVarianceIndex,
-                              batch_variance_tensor,
-                              tf_shape_scale,
+    AllocateOutputSetMklShape(context, kBatchVarianceIndex,
+                              batch_variance_tensor, tf_shape_scale,
                               mkl_shape_batch_variance);
     CHECK_NOTNULL(*batch_variance_tensor);
     // set NAN variance value in case of empty input tensor
-    for (int k=0; k < tf_shape_scale.num_elements(); k++)
+    for (int k = 0; k < tf_shape_scale.num_elements(); k++)
       (*batch_variance_tensor)->flat<T>().data()[k] = NAN;
 
     // Mean and variance (without Bessel's correction) saved for backward
     // computation to serve as pre-computed mean and variance.
     MklDnnShape mkl_shape_saved_mean;
     mkl_shape_saved_mean.SetMklTensor(false);
-    AllocateOutputSetMklShape(context, kSavedMeanIndex,
-                              saved_mean_tensor,
-                              tf_shape_scale,
-                              mkl_shape_saved_mean);
+    AllocateOutputSetMklShape(context, kSavedMeanIndex, saved_mean_tensor,
+                              tf_shape_scale, mkl_shape_saved_mean);
     CHECK_NOTNULL(*saved_mean_tensor);
     // set NAN mean value in case of empty input tensor
-    for (int k=0; k < tf_shape_scale.num_elements(); k++)
+    for (int k = 0; k < tf_shape_scale.num_elements(); k++)
       (*saved_mean_tensor)->flat<T>().data()[k] = NAN;
 
     MklDnnShape mkl_shape_saved_variance;
     mkl_shape_saved_variance.SetMklTensor(false);
     AllocateOutputSetMklShape(context, kSavedVarianceIndex,
-                              saved_variance_tensor,
-                              tf_shape_scale,
+                              saved_variance_tensor, tf_shape_scale,
                               mkl_shape_saved_variance);
     CHECK_NOTNULL(*saved_variance_tensor);
     // set NAN variance value in case of empty input tensor
-    for (int k=0; k < tf_shape_scale.num_elements(); k++)
+    for (int k = 0; k < tf_shape_scale.num_elements(); k++)
       (*saved_variance_tensor)->flat<T>().data()[k] = NAN;
   }
 };
@@ -1093,8 +1055,8 @@ class MklFusedBatchNormGradOp : public OpKernel {
       const Tensor& src_tensor = MklGetInput(context, kSrcIndex);
       const Tensor& scale_tensor = MklGetInput(context, kScaleIndex);
       const Tensor& saved_mean_tensor = MklGetInput(context, kMeanIndex);
-      const Tensor& saved_variance_tensor = MklGetInput(context,
-                                            kVarianceIndex);
+      const Tensor& saved_variance_tensor =
+          MklGetInput(context, kVarianceIndex);
 
       MklDnnShape dnn_shape_src, dnn_shape_diff_dst;
       GetMklShape(context, kSrcIndex, &dnn_shape_src);
@@ -1103,53 +1065,49 @@ class MklFusedBatchNormGradOp : public OpKernel {
 
       if (dnn_shape_diff_dst.IsMklTensor()) {
         tf_shape_diff_dst = dnn_shape_diff_dst.GetTfShape();
-        OP_REQUIRES(context, dnn_shape_diff_dst.GetDimension() == 4,
-                    errors::InvalidArgument(
-                        "input must be 4-dimensional",
-                        diff_dst_tensor.shape().DebugString()));
+        OP_REQUIRES(
+            context, dnn_shape_diff_dst.GetDimension() == 4,
+            errors::InvalidArgument("input must be 4-dimensional",
+                                    diff_dst_tensor.shape().DebugString()));
       } else {
         tf_shape_diff_dst = diff_dst_tensor.shape();
-        OP_REQUIRES(context, diff_dst_tensor.dims() == 4,
-                    errors::InvalidArgument(
-                        "input must be 4-dimensional",
-                        diff_dst_tensor.shape().DebugString()));
+        OP_REQUIRES(
+            context, diff_dst_tensor.dims() == 4,
+            errors::InvalidArgument("input must be 4-dimensional",
+                                    diff_dst_tensor.shape().DebugString()));
       }
 
       if (dnn_shape_src.IsMklTensor()) {
         tf_shape_src = dnn_shape_src.GetTfShape();
         OP_REQUIRES(context, dnn_shape_src.GetDimension() == 4,
-                    errors::InvalidArgument(
-                        "input must be 4-dimensional",
-                         src_tensor.shape().DebugString()));
+                    errors::InvalidArgument("input must be 4-dimensional",
+                                            src_tensor.shape().DebugString()));
       } else {
         tf_shape_src = src_tensor.shape();
         OP_REQUIRES(context, src_tensor.dims() == 4,
-                    errors::InvalidArgument(
-                        "input must be 4-dimensional",
-                        src_tensor.shape().DebugString()));
+                    errors::InvalidArgument("input must be 4-dimensional",
+                                            src_tensor.shape().DebugString()));
       }
 
       OP_REQUIRES(context, scale_tensor.dims() == 1,
-                  errors::InvalidArgument(
-                      "scale must be 1-dimensional",
-                      scale_tensor.shape().DebugString()));
-      OP_REQUIRES(context, saved_mean_tensor.dims() == 1,
-                  errors::InvalidArgument(
-                      "saved mean must be 1-dimensional",
-                       saved_mean_tensor.shape().DebugString()));
-
-      OP_REQUIRES(context, saved_variance_tensor.dims() == 1,
-                  errors::InvalidArgument(
-                      "saved variance must be 1-dimensional",
-                      saved_variance_tensor.shape().DebugString()));
+                  errors::InvalidArgument("scale must be 1-dimensional",
+                                          scale_tensor.shape().DebugString()));
+      OP_REQUIRES(
+          context, saved_mean_tensor.dims() == 1,
+          errors::InvalidArgument("saved mean must be 1-dimensional",
+                                  saved_mean_tensor.shape().DebugString()));
+
+      OP_REQUIRES(
+          context, saved_variance_tensor.dims() == 1,
+          errors::InvalidArgument("saved variance must be 1-dimensional",
+                                  saved_variance_tensor.shape().DebugString()));
 
       Tensor* diff_src_tensor = nullptr;
       if (tf_shape_src.num_elements() == 0 ||
           tf_shape_diff_dst.num_elements() == 0) {
-         HandleEmptyInput(context, tf_shape_src,
-                          scale_tensor.shape(),
-                          &diff_src_tensor);
-         return;
+        HandleEmptyInput(context, tf_shape_src, scale_tensor.shape(),
+                         &diff_src_tensor);
+        return;
       }
 
       if (dnn_shape_src.IsMklTensor())
@@ -1175,20 +1133,18 @@ class MklFusedBatchNormGradOp : public OpKernel {
 
       memory::dims src_dims, diff_dst_dims;
       if (dnn_shape_src.IsMklTensor())
-        src_dims = TFShapeToMklDnnDimsInNCHW(
-                       dnn_shape_src.GetTfShape(), tensor_format_);
+        src_dims = TFShapeToMklDnnDimsInNCHW(dnn_shape_src.GetTfShape(),
+                                             tensor_format_);
       else
-        src_dims = TFShapeToMklDnnDimsInNCHW(
-                       src_tensor.shape(), tensor_format_);
+        src_dims =
+            TFShapeToMklDnnDimsInNCHW(src_tensor.shape(), tensor_format_);
 
       if (dnn_shape_diff_dst.IsMklTensor())
         diff_dst_dims = TFShapeToMklDnnDimsInNCHW(
-                            dnn_shape_diff_dst.GetTfShape(),
-                            tensor_format_);
+            dnn_shape_diff_dst.GetTfShape(), tensor_format_);
       else
-        diff_dst_dims = TFShapeToMklDnnDimsInNCHW(
-                            diff_dst_tensor.shape(),
-                            tensor_format_);
+        diff_dst_dims =
+            TFShapeToMklDnnDimsInNCHW(diff_dst_tensor.shape(), tensor_format_);
 
       // set src and diff_dst primitives
       memory::desc src_md({}, memory::data_undef, memory::format_undef);
@@ -1202,7 +1158,7 @@ class MklFusedBatchNormGradOp : public OpKernel {
           src_md = diff_dst_md;
         }
       } else {
-        src_md =  memory::desc(src_dims, MklDnnType<T>(), format_m);
+        src_md = memory::desc(src_dims, MklDnnType<T>(), format_m);
         diff_dst_md = src_md;
       }
       src.SetUsrMem(src_md, &src_tensor);
@@ -1210,55 +1166,47 @@ class MklFusedBatchNormGradOp : public OpKernel {
 
       // weights -- DNN packs scales/shifts as weights in order of
       // scale, ..., scale, shift, ..., shift
-      auto weights_desc = memory::desc({2, depth_},
-                                       MklDnnType<T>(),
-                                       memory::format::nc);
+      auto weights_desc =
+          memory::desc({2, depth_}, MklDnnType<T>(), memory::format::nc);
       auto weights_pd = memory::primitive_desc(weights_desc, cpu_engine);
       auto weights_m = memory(weights_pd);
       T* weights_data = reinterpret_cast<T*>(weights_m.get_data_handle());
-      T* scale_tf = reinterpret_cast<T*>(const_cast<T*>
-                                        (scale_tensor.flat<T>().data()));
-      for (int k=0; k < depth_; k++) {
+      T* scale_tf =
+          reinterpret_cast<T*>(const_cast<T*>(scale_tensor.flat<T>().data()));
+      for (int k = 0; k < depth_; k++) {
         weights_data[k] = scale_tf[k];
         weights_data[k + depth_] = 0;
       }
 
       // set mean primitive
       memory::dims mv_dims = GetMeanVarianceDims();
-      mean.SetUsrMem(mv_dims,
-                     memory::format::nc,
-                     const_cast<void*>(static_cast<const void*>
-                     (saved_mean_tensor.flat<T>().data())));
+      mean.SetUsrMem(mv_dims, memory::format::nc,
+                     const_cast<void*>(static_cast<const void*>(
+                         saved_mean_tensor.flat<T>().data())));
       mean.SetOpMemDesc(mv_dims, memory::format::nc);
 
       // set variance primitive
-      variance.SetUsrMem(mv_dims,  memory::format::nc,
-                         const_cast<void*>(static_cast<const void*>
-                         (saved_variance_tensor.flat<T>().data())));
+      variance.SetUsrMem(mv_dims, memory::format::nc,
+                         const_cast<void*>(static_cast<const void*>(
+                             saved_variance_tensor.flat<T>().data())));
       variance.SetOpMemDesc(mv_dims, memory::format::nc);
 
       // set diff_weight primitive
-      auto diff_weights_desc = memory::desc(
-                                 {2, depth_},
-                                 MklDnnType<T>(),
-                                 memory::format::nc);
-      auto diff_weights_pd = memory::primitive_desc(
-                                diff_weights_desc,
-                                cpu_engine);
+      auto diff_weights_desc =
+          memory::desc({2, depth_}, MklDnnType<T>(), memory::format::nc);
+      auto diff_weights_pd =
+          memory::primitive_desc(diff_weights_desc, cpu_engine);
       auto diff_weights_m = memory(diff_weights_pd);
 
       auto bnrm_fwd_desc = batch_normalization_forward::desc(
-                               prop_kind::forward_training,
-                               src.GetUsrMemDesc(),
-                               epsilon_,
-                               is_training_ ? use_scale_shift :
-                               (use_scale_shift | use_global_stats));
+          prop_kind::forward_training, src.GetUsrMemDesc(), epsilon_,
+          is_training_ ? use_scale_shift
+                       : (use_scale_shift | use_global_stats));
       auto bnrm_fwd_pd = batch_normalization_forward::primitive_desc(
-                                bnrm_fwd_desc,
-                                cpu_engine);
+          bnrm_fwd_desc, cpu_engine);
 
       // Indices of output tensors
-      const size_t kDiffSrcIndex = 0;    // index of diff_src tensor
+      const size_t kDiffSrcIndex = 0;  // index of diff_src tensor
 
       // allocate diff_src tensor
       MklDnnShape dnn_shape_diff_src;
@@ -1268,14 +1216,11 @@ class MklFusedBatchNormGradOp : public OpKernel {
         auto diff_src_pd = bnrm_fwd_pd.dst_primitive_desc();
         dnn_shape_diff_src.SetMklLayout(&diff_src_pd);
         dnn_shape_diff_src.SetElemType(MklDnnType<T>());
-        dnn_shape_diff_src.SetTfLayout(
-                              dnn_shape_src.GetDimension(),
-                              src_dims,
-                              format_m);
-        dnn_shape_diff_src.SetTfDimOrder(
-                              dnn_shape_src.GetDimension(),
-                              tensor_format_);
-        tf_shape_diff_src.AddDim(diff_src_pd.get_size()/sizeof(T));
+        dnn_shape_diff_src.SetTfLayout(dnn_shape_src.GetDimension(), src_dims,
+                                       format_m);
+        dnn_shape_diff_src.SetTfDimOrder(dnn_shape_src.GetDimension(),
+                                         tensor_format_);
+        tf_shape_diff_src.AddDim(diff_src_pd.get_size() / sizeof(T));
       } else {
         dnn_shape_diff_src.SetMklTensor(false);
         tf_shape_diff_src = src_tensor.shape();
@@ -1287,33 +1232,22 @@ class MklFusedBatchNormGradOp : public OpKernel {
 
       prop_kind pk = prop_kind::backward;
       auto bnrm_bwd_desc = batch_normalization_backward::desc(
-                               pk,
-                               diff_src.GetUsrMemDesc(),
-                               src.GetUsrMemDesc(),
-                               epsilon_,
-                               /* for inference, specify use_global_stats
-                                  1. on fwd prop, use mean and variance
-                                     provided as inputs
-                                  2. on bwd prop, mean and variance are
-                                     considered as constants. Thus, 
-                                     reduce the amout of MKL computations
-                               */
-                               is_training_ ? use_scale_shift :
-                               (use_scale_shift | use_global_stats));
+          pk, diff_src.GetUsrMemDesc(), src.GetUsrMemDesc(), epsilon_,
+          /* for inference, specify use_global_stats
+             1. on fwd prop, use mean and variance
+                provided as inputs
+             2. on bwd prop, mean and variance are
+                considered as constants. Thus,
+                reduce the amout of MKL computations
+          */
+          is_training_ ? use_scale_shift
+                       : (use_scale_shift | use_global_stats));
       auto bnrm_bwd_pd = batch_normalization_backward::primitive_desc(
-                               bnrm_bwd_desc,
-                               cpu_engine,
-                               bnrm_fwd_pd);
+          bnrm_bwd_desc, cpu_engine, bnrm_fwd_pd);
 
       auto bnrm_bwd_op = batch_normalization_backward(
-                               bnrm_bwd_pd,
-                               src.GetOpMem(),
-                               mean.GetOpMem(),
-                               variance.GetOpMem(),
-                               diff_dst.GetOpMem(),
-                               weights_m,
-                               diff_src.GetOpMem(),
-                               diff_weights_m);
+          bnrm_bwd_pd, src.GetOpMem(), mean.GetOpMem(), variance.GetOpMem(),
+          diff_dst.GetOpMem(), weights_m, diff_src.GetOpMem(), diff_weights_m);
 
       std::vector<primitive> net;
       net.push_back(bnrm_bwd_op);
@@ -1322,43 +1256,39 @@ class MklFusedBatchNormGradOp : public OpKernel {
       // allocate 4 output TF tensors
       Tensor* diff_scale_tensor = nullptr;
       Tensor* diff_shift_tensor = nullptr;
-      AllocateTFOutputs(context, scale_tensor.shape(),
-                        &diff_scale_tensor,
+      AllocateTFOutputs(context, scale_tensor.shape(), &diff_scale_tensor,
                         &diff_shift_tensor);
 
       // copy data: diff_scale and diff_shift
-      T* diff_weights_data_dnn = reinterpret_cast<T*>
-                                 (diff_weights_m.get_data_handle());
+      T* diff_weights_data_dnn =
+          reinterpret_cast<T*>(diff_weights_m.get_data_handle());
       for (int i = 0; i < depth_; i++) {
-        diff_scale_tensor->flat<T>().data()[i] =
-                              diff_weights_data_dnn[i];
+        diff_scale_tensor->flat<T>().data()[i] = diff_weights_data_dnn[i];
         diff_shift_tensor->flat<T>().data()[i] =
-                              diff_weights_data_dnn[i + depth_];
+            diff_weights_data_dnn[i + depth_];
       }
-    } catch (mkldnn::error &e) {
+    } catch (mkldnn::error& e) {
       string error_msg = "Status: " + std::to_string(e.status) +
-                          ", message: " + string(e.message) +
-                          ", in file " + string(__FILE__) + ":" +
-                          std::to_string(__LINE__);
-      OP_REQUIRES_OK(context,
-                     errors::Aborted("Operation received an exception:",
-                     error_msg));
+                         ", message: " + string(e.message) + ", in file " +
+                         string(__FILE__) + ":" + std::to_string(__LINE__);
+      OP_REQUIRES_OK(
+          context,
+          errors::Aborted("Operation received an exception:", error_msg));
     }
   }
 
  private:
   T epsilon_;
   TensorFormat tensor_format_;
-  int depth_;             // batch normalization is done for per channel.
+  int depth_;  // batch normalization is done for per channel.
   bool is_training_;
 
   void ExtractParams(OpKernelContext* context) {
-      const Tensor& input = MklGetInput(context, 0);
-      depth_ = static_cast<int>(GetTensorDim(input, tensor_format_, 'C'));
+    const Tensor& input = MklGetInput(context, 0);
+    depth_ = static_cast<int>(GetTensorDim(input, tensor_format_, 'C'));
   }
 
-  void HandleEmptyInput(OpKernelContext* context,
-                        TensorShape tf_shape_src,
+  void HandleEmptyInput(OpKernelContext* context, TensorShape tf_shape_src,
                         TensorShape tf_shape_scale_shift,
                         Tensor** diff_src_tensor) {
     const size_t kDiffSrcIndex = 0;
@@ -1366,22 +1296,20 @@ class MklFusedBatchNormGradOp : public OpKernel {
     MklDnnShape dnn_shape_diff_src;
     dnn_shape_diff_src.SetMklTensor(false);
     AllocateOutputSetMklShape(context, kDiffSrcIndex, diff_src_tensor,
-                                tf_shape_src, dnn_shape_diff_src);
-    for (size_t i=0; i < (*diff_src_tensor)->shape().num_elements(); i++)
-       (*diff_src_tensor)->flat<T>().data()[i] = 0;
+                              tf_shape_src, dnn_shape_diff_src);
+    for (size_t i = 0; i < (*diff_src_tensor)->shape().num_elements(); i++)
+      (*diff_src_tensor)->flat<T>().data()[i] = 0;
 
     Tensor* diff_scale_tensor = nullptr;
     Tensor* diff_shift_tensor = nullptr;
-    AllocateTFOutputs(context,
-                      tf_shape_scale_shift,
-                      &diff_scale_tensor,
+    AllocateTFOutputs(context, tf_shape_scale_shift, &diff_scale_tensor,
                       &diff_shift_tensor);
   }
 
   void AllocateTFOutputs(OpKernelContext* context,
-                        TensorShape tf_shape_scale_shift,
-                        Tensor** diff_scale_tensor,
-                        Tensor** diff_shift_tensor) {
+                         TensorShape tf_shape_scale_shift,
+                         Tensor** diff_scale_tensor,
+                         Tensor** diff_shift_tensor) {
     CHECK_NOTNULL(diff_scale_tensor);
     CHECK_NOTNULL(diff_shift_tensor);
 
@@ -1396,31 +1324,29 @@ class MklFusedBatchNormGradOp : public OpKernel {
     AllocateOutputSetMklShape(context, kDiffScaleIndex, diff_scale_tensor,
                               tf_shape_scale_shift, mkl_shape_diff_scale);
     CHECK_NOTNULL(*diff_scale_tensor);
-    for (size_t i=0; i < (*diff_scale_tensor)->shape().num_elements(); i++)
-       (*diff_scale_tensor)->flat<T>().data()[i] = 0;
+    for (size_t i = 0; i < (*diff_scale_tensor)->shape().num_elements(); i++)
+      (*diff_scale_tensor)->flat<T>().data()[i] = 0;
 
     MklDnnShape mkl_shape_diff_shift;
     mkl_shape_diff_shift.SetMklTensor(false);
     AllocateOutputSetMklShape(context, kDiffShiftIndex, diff_shift_tensor,
                               tf_shape_scale_shift, mkl_shape_diff_shift);
     CHECK_NOTNULL(*diff_shift_tensor);
-    for (size_t i=0; i < (*diff_shift_tensor)->shape().num_elements(); i++)
-       (*diff_shift_tensor)->flat<T>().data()[i] = 0;
+    for (size_t i = 0; i < (*diff_shift_tensor)->shape().num_elements(); i++)
+      (*diff_shift_tensor)->flat<T>().data()[i] = 0;
 
     // Placeholders for estimated_mean and estimated_variance, which are
     // used for inference and thus not needed here for gradient computation.
-    Tensor* p1_tensor = nullptr, *p2_tensor = nullptr;
+    Tensor *p1_tensor = nullptr, *p2_tensor = nullptr;
     MklDnnShape mkl_shape_p;
     mkl_shape_p.SetMklTensor(false);
-    AllocateOutputSetMklShape(context, kP1Index, &p1_tensor,
-                              TensorShape({}), mkl_shape_p);
-    AllocateOutputSetMklShape(context, kP2Index, &p2_tensor,
-                              TensorShape({}), mkl_shape_p);
+    AllocateOutputSetMklShape(context, kP1Index, &p1_tensor, TensorShape({}),
+                              mkl_shape_p);
+    AllocateOutputSetMklShape(context, kP2Index, &p2_tensor, TensorShape({}),
+                              mkl_shape_p);
   }
 
-  memory::dims GetMeanVarianceDims() {
-    return memory::dims({1, depth_});
-  }
+  memory::dims GetMeanVarianceDims() { return memory::dims({1, depth_}); }
 };
 
 #endif
diff --git a/tensorflow/core/kernels/mkl_identity_op.cc b/tensorflow/core/kernels/mkl_identity_op.cc
index 9ee27ee21c8d23c8ce314a7687ac9b79a1d9ea30..6c027f8e728b8660d18a70ae58995fa104f0b375 100644
--- a/tensorflow/core/kernels/mkl_identity_op.cc
+++ b/tensorflow/core/kernels/mkl_identity_op.cc
@@ -28,14 +28,14 @@ limitations under the License.
 #include "mkl_dnn_types.h"
 #include "tensorflow/core/util/mkl_util.h"
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 #include "mkldnn.hpp"
 #endif
 
 namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
 
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
 
 template <typename Device, typename T>
 class MklIdentityOp : public OpKernel {
diff --git a/tensorflow/core/kernels/mkl_input_conversion_op.cc b/tensorflow/core/kernels/mkl_input_conversion_op.cc
index 4b5f7b831001458c222536be30bc40fcf5d2899a..acb0db57b38c08af345dc2b22a7822c0f0f202f0 100644
--- a/tensorflow/core/kernels/mkl_input_conversion_op.cc
+++ b/tensorflow/core/kernels/mkl_input_conversion_op.cc
@@ -31,7 +31,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/mkl_tfconv_op.h"
 #include "tensorflow/core/util/mkl_util.h"
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 #include "mkldnn.hpp"
 
 using mkldnn::stream;
@@ -59,7 +59,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 //     convert the TF format input to MKL format
 ///////////////////////////////////////////////////////////
 
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
 template <typename Device, typename T>
 class MklInputConversionOp : public OpKernel {
  public:
@@ -271,8 +271,8 @@ class MklInputConversionOp : public OpKernel {
     MklDnnShape input_shape_1;
     GetMklShape(context, 1, &input_shape_1);
 
-    bool tf_shapes_are_same = context->input(0).shape() ==
-                              context->input(1).shape();
+    bool tf_shapes_are_same =
+        context->input(0).shape() == context->input(1).shape();
 
     VLOG(1) << "MklInputConversionOp: Input shapes are "
             << (tf_shapes_are_same ? "*same*" : "*different*") << ": "
@@ -293,14 +293,56 @@ class MklInputConversionOp : public OpKernel {
     // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
     // If both inputs are in MKL format
     if (input_shape_0.IsMklTensor() && input_shape_1.IsMklTensor()) {
-      // If both have the same shape, pass them through
       if (tf_shapes_are_same) {
-        VLOG(1) << "MklInputConversionOp: No conversion needed, "
-                << "copying MKL inputs with identical shapes to output";
-
-        ForwardMklTensorInToOut(context, 0, 0);
-        ForwardMklTensorInToOut(context, 1, 1);
-        return;
+        auto input0_md = input_shape_0.GetMklLayout();
+        auto input1_md = input_shape_1.GetMklLayout();
+        
+        // If both have the same shape and same format, pass them through
+        if ( input0_md.data.format == input1_md.data.format) {
+          VLOG(1) << "MklInputConversionOp: No conversion needed, "
+                  << "copying MKL inputs with identical shapes to output";
+
+          ForwardMklTensorInToOut(context, 0, 0);
+          ForwardMklTensorInToOut(context, 1, 1);
+          return;
+        } else {
+          VLOG(1) << "MklInputConversionOp: Shape is same, but format is different, "
+                  << "need to convert to same format";
+          
+          // Convert input0, and keep input1 unchanged
+          // Create MklDnnShape for output mkl tensor based on input0
+          Tensor* tensor_out;
+          MklDnnShape mkl_output_mkl_shape;
+          mkl_output_mkl_shape.SetMklTensor(true);
+          mkl_output_mkl_shape.SetElemType(MklDnnType<T>());
+          mkl_output_mkl_shape.SetTfLayout(input_shape_0.GetDimension(),
+                                           input_shape_0.GetSizesAsMklDnnDims(),
+                                           input_shape_0.GetTfDataFormat());
+
+          // Get MKL layout from input1 as destination layout
+          mkl_output_mkl_shape.SetMklLayout(&input1_md);
+
+          // Create output Mkl tensor for index 0
+          AllocateOutputSetMklShape(context, 0, &tensor_out,
+                                    input_tensor_0.shape(), mkl_output_mkl_shape);
+
+          // Create MklDnnData object for input0 tesnsor
+          auto cpu_engine = engine(engine::cpu, 0);
+          MklDnnData<T> input(&cpu_engine);
+          input.SetUsrMem(input0_md, &input_tensor_0);
+
+          // Create reorder from input0's layout to input1's layout
+          std::vector<primitive> net;
+          CHECK_EQ(input.CheckReorderToOpMem(memory::primitive_desc(
+                                             input1_md, cpu_engine),
+                                             tensor_out, &net),
+                                             true);
+          stream(stream::kind::eager).submit(net).wait();
+
+          // Input1 will be passed through
+          ForwardMklTensorInToOut(context, 1, 1);
+          return;         
+        }
       }
 
       // Sanity check
@@ -400,9 +442,9 @@ class MklInputConversionOp : public OpKernel {
 
       // Create reorder between tensorflow layout and Mkl layout.
       std::vector<primitive> net;
-      CHECK_EQ(tf_input.CheckReorderToOpMem(memory::primitive_desc(
-                                            output_mkl_md, cpu_engine),
-                                            tensor_out, &net),
+      CHECK_EQ(tf_input.CheckReorderToOpMem(
+                   memory::primitive_desc(output_mkl_md, cpu_engine),
+                   tensor_out, &net),
                true);
       stream(stream::kind::eager).submit(net).wait();
 
diff --git a/tensorflow/core/kernels/mkl_lrn_op.cc b/tensorflow/core/kernels/mkl_lrn_op.cc
index 95e0404ba8ab7d305e530239be30c7a842edf16d..5f0a12a1fb9bff3086e05918e23b8396196eb389 100644
--- a/tensorflow/core/kernels/mkl_lrn_op.cc
+++ b/tensorflow/core/kernels/mkl_lrn_op.cc
@@ -22,6 +22,9 @@ limitations under the License.
 
 #define EIGEN_USE_THREADS
 #include <vector>
+#include "mkl_dnn.h"
+#include "mkl_dnn_types.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -30,20 +33,17 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/util/mkl_util.h"
 #include "tensorflow/core/util/tensor_format.h"
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
-#include "mkl_dnn.h"
-#include "mkl_dnn_types.h"
 
 #if !defined(IS_MOBILE_PLATFORM)
 #include "tensorflow/core/util/work_sharder.h"
 #endif
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 #include "mkldnn.hpp"
-using mkldnn::lrn_forward;
+using mkldnn::lrn_across_channels;
 using mkldnn::lrn_backward;
+using mkldnn::lrn_forward;
 using mkldnn::prop_kind;
-using mkldnn::lrn_across_channels;
 using mkldnn::stream;
 #endif
 
@@ -67,7 +67,7 @@ void GetBandMatrix(int depth, int depth_radius,
 
 }  // namespace
 
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
 
 template <typename T>
 class MklLRNOp : public OpKernel {
@@ -77,10 +77,11 @@ class MklLRNOp : public OpKernel {
   explicit MklLRNOp(OpKernelConstruction* context) : OpKernel(context) {
     int64 depth_radius64;
     OP_REQUIRES_OK(context, context->GetAttr("depth_radius", &depth_radius64));
-    OP_REQUIRES(context, FastBoundsCheck(depth_radius64,
-                                         std::numeric_limits<int>::max()),
-                errors::InvalidArgument("depth_radius = ", depth_radius64,
-                                        " larger than int max"));
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(depth_radius64, std::numeric_limits<int>::max()),
+        errors::InvalidArgument("depth_radius = ", depth_radius64,
+                                " larger than int max"));
     depth_radius_ = static_cast<size_t>(depth_radius64);
 
     OP_REQUIRES_OK(context, context->GetAttr("bias", &bias_));
@@ -103,9 +104,10 @@ class MklLRNOp : public OpKernel {
                               : input.dims();
     OP_REQUIRES(context, mkl_context.in_dims == 4,
                 errors::InvalidArgument("input must be 4-dimensional"));
-    OP_REQUIRES(context, FastBoundsCheck(input.NumElements(),
-                                         std::numeric_limits<int>::max()),
-                errors::InvalidArgument("argument to LRN too large"));
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(input.NumElements(), std::numeric_limits<int>::max()),
+        errors::InvalidArgument("argument to LRN too large"));
 
     if (!input_in_mkl_format) {
       mkl_context.MklDefaultToEigen(context, depth_radius_, bias_, alpha_,
@@ -339,17 +341,17 @@ class MklLRNOp : public OpKernel {
   float beta_;
 };
 
-
 template <typename T>
 class MklLRNGradOp : public OpKernel {
  public:
   explicit MklLRNGradOp(OpKernelConstruction* context) : OpKernel(context) {
     int64 depth_radius64;
     OP_REQUIRES_OK(context, context->GetAttr("depth_radius", &depth_radius64));
-    OP_REQUIRES(context, FastBoundsCheck(depth_radius64,
-                                         std::numeric_limits<int>::max()),
-                errors::InvalidArgument("depth_radius = ", depth_radius64,
-                                        " larger than int max"));
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(depth_radius64, std::numeric_limits<int>::max()),
+        errors::InvalidArgument("depth_radius = ", depth_radius64,
+                                " larger than int max"));
     depth_radius_ = static_cast<int>(depth_radius64);
     OP_REQUIRES_OK(context, context->GetAttr("bias", &bias_));
     OP_REQUIRES_OK(context, context->GetAttr("alpha", &alpha_));
@@ -740,10 +742,11 @@ class MklLRNOp : public OpKernel {
   explicit MklLRNOp(OpKernelConstruction* context) : OpKernel(context) {
     int64 depth_radius64;
     OP_REQUIRES_OK(context, context->GetAttr("depth_radius", &depth_radius64));
-    OP_REQUIRES(context, FastBoundsCheck(depth_radius64,
-                                         std::numeric_limits<int>::max()),
-                errors::InvalidArgument("depth_radius = ", depth_radius64,
-                                        " larger than int max"));
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(depth_radius64, std::numeric_limits<int>::max()),
+        errors::InvalidArgument("depth_radius = ", depth_radius64,
+                                " larger than int max"));
     depth_radius_ = static_cast<size_t>(depth_radius64);
 
     OP_REQUIRES_OK(context, context->GetAttr("bias", &bias_));
@@ -773,10 +776,10 @@ class MklLRNOp : public OpKernel {
       if (!src_dnn_shape.IsMklTensor()) {
         MklDefaultToEigen(context, src_tensor);
         return;
-      } else if (!src_dnn_shape.IsMklChannelDim(
-                  src_dnn_shape.GetDimension() - 1) ) {
+      } else if (!src_dnn_shape.IsMklChannelDim(src_dnn_shape.GetDimension() -
+                                                1)) {
         Tensor converted_tensor =
-          ConvertMklToTF<T>(context, src_tensor, src_dnn_shape);
+            ConvertMklToTF<T>(context, src_tensor, src_dnn_shape);
         MklDefaultToEigen(context, converted_tensor);
         return;
       }
@@ -807,18 +810,16 @@ class MklLRNOp : public OpKernel {
       // Create LRN primitive descriptor.
       // Tensorflow's normalization semantics is across channels.
       // MKL-DNN also supports normalization within channel.
-      auto lrn_desc = lrn_forward::desc(prop_kind::forward,
-                                        lrn_across_channels,
+      auto lrn_desc = lrn_forward::desc(prop_kind::forward, lrn_across_channels,
                                         src_dnn_data.GetUsrMemDesc(),
-                                        kernel_size,
-                                        new_alpha, beta_, bias_);
+                                        kernel_size, new_alpha, beta_, bias_);
       auto lrn_prim_desc = lrn_forward::primitive_desc(lrn_desc, cpu_engine);
 
       // Allocate output_dnn_data tensor.
       Tensor* output_tensor = nullptr;
       memory::format input_format = src_dnn_shape.GetTfDataFormat();
-      AllocateOutputTensor(context, lrn_prim_desc, input_dims,
-                    input_format, &output_tensor);
+      AllocateOutputTensor(context, lrn_prim_desc, input_dims, input_format,
+                           &output_tensor);
       OP_REQUIRES_OK(context, context->status());
       CHECK_NOTNULL(output_tensor);
       dst_dnn_data.SetUsrMemDataHandle(output_tensor);
@@ -827,25 +828,23 @@ class MklLRNOp : public OpKernel {
       AllocateWorkspaceTensor(context, lrn_prim_desc, &workspace_dnn_data);
       OP_REQUIRES_OK(context, context->status());
 
-      PrepareAndExecuteNet(lrn_prim_desc, &src_dnn_data,
-                          &dst_dnn_data, &workspace_dnn_data);
-    } catch (mkldnn::error &e) {
+      PrepareAndExecuteNet(lrn_prim_desc, &src_dnn_data, &dst_dnn_data,
+                           &workspace_dnn_data);
+    } catch (mkldnn::error& e) {
       string error_msg = "Status: " + std::to_string(e.status) +
-                       ", message: " + string(e.message) +
-                       ", in file " + string(__FILE__) + ":" +
-                       std::to_string(__LINE__);
-      OP_REQUIRES_OK(context,
-                     errors::Aborted("Operation received an exception:",
-                     error_msg));
+                         ", message: " + string(e.message) + ", in file " +
+                         string(__FILE__) + ":" + std::to_string(__LINE__);
+      OP_REQUIRES_OK(
+          context,
+          errors::Aborted("Operation received an exception:", error_msg));
     }
   }
 
  private:
-  void PrepareAndExecuteNet(
-                const lrn_forward::primitive_desc& lrn_fwd_desc,
-                MklDnnData<T>* src_dnn_data,
-                MklDnnData<T>* dst_dnn_data,
-                MklDnnData<uint8>* wksp_dnn_data = nullptr) {
+  void PrepareAndExecuteNet(const lrn_forward::primitive_desc& lrn_fwd_desc,
+                            MklDnnData<T>* src_dnn_data,
+                            MklDnnData<T>* dst_dnn_data,
+                            MklDnnData<uint8>* wksp_dnn_data = nullptr) {
     std::vector<primitive> net;
 
     // Check for input reorder
@@ -853,23 +852,21 @@ class MklLRNOp : public OpKernel {
 
     // Create pooling primitive and add it to net
     if (wksp_dnn_data != nullptr) {
-        net.push_back(lrn_forward(lrn_fwd_desc,
-                        src_dnn_data->GetOpMem(),
-                        wksp_dnn_data->GetOpMem(),
-                        dst_dnn_data->GetOpMem()));
+      net.push_back(lrn_forward(lrn_fwd_desc, src_dnn_data->GetOpMem(),
+                                wksp_dnn_data->GetOpMem(),
+                                dst_dnn_data->GetOpMem()));
     } else {
-        net.push_back(lrn_forward(lrn_fwd_desc,
-            src_dnn_data->GetOpMem(),
-            dst_dnn_data->GetOpMem()));
+      net.push_back(lrn_forward(lrn_fwd_desc, src_dnn_data->GetOpMem(),
+                                dst_dnn_data->GetOpMem()));
     }
     stream(stream::kind::eager).submit(net).wait();
   }
 
-  void AllocateOutputTensor(OpKernelContext* context,
-            const lrn_forward::primitive_desc& lrn_fwd_prim_desc,
-            const memory::dims output_dims_mkl_order,
-            const memory::format& output_tf_format,
-            Tensor** output_tensor) {
+  void AllocateOutputTensor(
+      OpKernelContext* context,
+      const lrn_forward::primitive_desc& lrn_fwd_prim_desc,
+      const memory::dims output_dims_mkl_order,
+      const memory::format& output_tf_format, Tensor** output_tensor) {
     CHECK_NOTNULL(output_tensor);
     memory::primitive_desc dst_pd = lrn_fwd_prim_desc.dst_primitive_desc();
 
@@ -880,111 +877,106 @@ class MklLRNOp : public OpKernel {
     output_mkl_shape.SetMklLayout(&dst_pd);
     output_mkl_shape.SetElemType(MklDnnType<T>());
     output_mkl_shape.SetTfLayout(output_dims_mkl_order.size(),
-                              output_dims_mkl_order,
-                              output_tf_format);
+                                 output_dims_mkl_order, output_tf_format);
     TensorShape output_tf_shape;
     // only allocate enough space for the elements we need.
     size_t num_bytes = dst_pd.get_size();
     CHECK_EQ(num_bytes % sizeof(T), 0);
     output_tf_shape.AddDim(num_bytes / sizeof(T));
-    AllocateOutputSetMklShape(context, kIdxOutput,
-                            output_tensor,
-                            output_tf_shape, output_mkl_shape);
-    }
-
-    // Fallback implementation - Taken from lrn_op.cc
-    // TODO(inteltf) Check if we can use EigenLRNOp directly instead of making a
-    // copy.
-    void MklDefaultToEigen(OpKernelContext* context,
-                           const Tensor& input) {
-      const int batch = static_cast<int>(input.dim_size(0));
-      const int rows = static_cast<int>(input.dim_size(1));
-      const int cols = static_cast<int>(input.dim_size(2));
-      const int depth = static_cast<int>(input.dim_size(3));
-      const int nodes = cols * rows;
-
-      auto in_shaped = input.shaped<T, 2>({nodes * batch, depth});
-      // Multiplying the input with the band matrix has the effect of reducing
-      // the
-      // correct patch along the depth.
-      Eigen::Tensor<T, 2, Eigen::RowMajor> multiplier(depth, depth);
-      GetBandMatrix<T>(depth, depth_radius_, &multiplier);
+    AllocateOutputSetMklShape(context, kIdxOutput, output_tensor,
+                              output_tf_shape, output_mkl_shape);
+  }
 
-      Tensor *output_dnn_data = nullptr;
-      MklDnnShape mkl_output_mkl_shape;
-      mkl_output_mkl_shape.SetMklTensor(false);
-      mkl_output_mkl_shape.SetDimensions(4);
-      AllocateOutputSetMklShape(context, kIdxOutput, &output_dnn_data,
-                                input.shape(), mkl_output_mkl_shape);
-      CHECK_NOTNULL(output_dnn_data);
-
-      Tensor* workspace_tensor = nullptr;
-      MklDnnShape workspace_mkl_shape;
-      workspace_mkl_shape.SetMklTensor(false);
-      TensorShape workspace_tf_shape;
-      workspace_tf_shape.AddDim(0);
-      AllocateOutputSetMklShape(context, kIdxWorkspace,
-                              &workspace_tensor,
+  // Fallback implementation - Taken from lrn_op.cc
+  // TODO(inteltf) Check if we can use EigenLRNOp directly instead of making a
+  // copy.
+  void MklDefaultToEigen(OpKernelContext* context, const Tensor& input) {
+    const int batch = static_cast<int>(input.dim_size(0));
+    const int rows = static_cast<int>(input.dim_size(1));
+    const int cols = static_cast<int>(input.dim_size(2));
+    const int depth = static_cast<int>(input.dim_size(3));
+    const int nodes = cols * rows;
+
+    auto in_shaped = input.shaped<T, 2>({nodes * batch, depth});
+    // Multiplying the input with the band matrix has the effect of reducing
+    // the
+    // correct patch along the depth.
+    Eigen::Tensor<T, 2, Eigen::RowMajor> multiplier(depth, depth);
+    GetBandMatrix<T>(depth, depth_radius_, &multiplier);
+
+    Tensor* output_dnn_data = nullptr;
+    MklDnnShape mkl_output_mkl_shape;
+    mkl_output_mkl_shape.SetMklTensor(false);
+    mkl_output_mkl_shape.SetDimensions(4);
+    AllocateOutputSetMklShape(context, kIdxOutput, &output_dnn_data,
+                              input.shape(), mkl_output_mkl_shape);
+    CHECK_NOTNULL(output_dnn_data);
+
+    Tensor* workspace_tensor = nullptr;
+    MklDnnShape workspace_mkl_shape;
+    workspace_mkl_shape.SetMklTensor(false);
+    TensorShape workspace_tf_shape;
+    workspace_tf_shape.AddDim(0);
+    AllocateOutputSetMklShape(context, kIdxWorkspace, &workspace_tensor,
                               workspace_tf_shape, workspace_mkl_shape);
-      CHECK_NOTNULL(workspace_tensor);
-
-      auto out_shaped = output_dnn_data->shaped<T, 2>({nodes * batch, depth});
-      Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}};
-      auto tmp = in_shaped.square().contract(multiplier, dims) * alpha_ + bias_;
-      if (beta_ == T(1)) {
-        out_shaped.device(context->eigen_cpu_device()) =
-            in_shaped * tmp.inverse();
-      } else if (beta_ == T(0.5)) {
-        out_shaped.device(context->eigen_cpu_device()) =
-            in_shaped * tmp.rsqrt();
-      } else {
-        out_shaped.device(context->eigen_cpu_device()) =
-            in_shaped * (tmp.log() * -beta_).exp();
-      }
+    CHECK_NOTNULL(workspace_tensor);
+
+    auto out_shaped = output_dnn_data->shaped<T, 2>({nodes * batch, depth});
+    Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}};
+    auto tmp = in_shaped.square().contract(multiplier, dims) * alpha_ + bias_;
+    if (beta_ == T(1)) {
+      out_shaped.device(context->eigen_cpu_device()) =
+          in_shaped * tmp.inverse();
+    } else if (beta_ == T(0.5)) {
+      out_shaped.device(context->eigen_cpu_device()) = in_shaped * tmp.rsqrt();
+    } else {
+      out_shaped.device(context->eigen_cpu_device()) =
+          in_shaped * (tmp.log() * -beta_).exp();
     }
+  }
 
-    void AllocateWorkspaceTensor(OpKernelContext* context,
-                const lrn_forward::primitive_desc& lrn_fwd_prim_desc,
-                MklDnnData<uint8>* dnn_data_wksp) {
-      CHECK_NOTNULL(dnn_data_wksp);
-      Tensor* workspace_tensor = nullptr;
-      memory::primitive_desc workspace_pd
-                  = lrn_fwd_prim_desc.workspace_primitive_desc();
-      size_t workspace_bytes = workspace_pd.get_size();
-      MklDnnShape workspace_mkl_shape;
-      // the workspace tensor is a uint8 tensor that has
-      // exactly the number of bytes necessary
-      workspace_mkl_shape.SetMklTensor(false);
-      TensorShape workspace_tf_shape;
-      workspace_tf_shape.AddDim(workspace_bytes);
-      AllocateOutputSetMklShape(context, kIdxWorkspace,
-                              &workspace_tensor,
+  void AllocateWorkspaceTensor(
+      OpKernelContext* context,
+      const lrn_forward::primitive_desc& lrn_fwd_prim_desc,
+      MklDnnData<uint8>* dnn_data_wksp) {
+    CHECK_NOTNULL(dnn_data_wksp);
+    Tensor* workspace_tensor = nullptr;
+    memory::primitive_desc workspace_pd =
+        lrn_fwd_prim_desc.workspace_primitive_desc();
+    size_t workspace_bytes = workspace_pd.get_size();
+    MklDnnShape workspace_mkl_shape;
+    // the workspace tensor is a uint8 tensor that has
+    // exactly the number of bytes necessary
+    workspace_mkl_shape.SetMklTensor(false);
+    TensorShape workspace_tf_shape;
+    workspace_tf_shape.AddDim(workspace_bytes);
+    AllocateOutputSetMklShape(context, kIdxWorkspace, &workspace_tensor,
                               workspace_tf_shape, workspace_mkl_shape);
-      CHECK_NOTNULL(workspace_tensor);
-      dnn_data_wksp->SetUsrMem(workspace_pd, workspace_tensor);
-    }
+    CHECK_NOTNULL(workspace_tensor);
+    dnn_data_wksp->SetUsrMem(workspace_pd, workspace_tensor);
+  }
 
   void SanityCheckInputs(OpKernelContext* context) {
     const Tensor& src_tensor = MklGetInput(context, kIdxInput);
     MklDnnShape src_dnn_shape;
     GetMklShape(context, kIdxInput, &src_dnn_shape);
     if (src_dnn_shape.IsMklTensor()) {
-        OP_REQUIRES(context, src_dnn_shape.GetDimension() == 4,
-                    errors::InvalidArgument("input must be 4-dimensional"));
-        OP_REQUIRES(context, FastBoundsCheck(src_tensor.NumElements(),
-                                            std::numeric_limits<int>::max()),
-                        errors::InvalidArgument("argument to LRN too large"));
+      OP_REQUIRES(context, src_dnn_shape.GetDimension() == 4,
+                  errors::InvalidArgument("input must be 4-dimensional"));
+      OP_REQUIRES(context,
+                  FastBoundsCheck(src_tensor.NumElements(),
+                                  std::numeric_limits<int>::max()),
+                  errors::InvalidArgument("argument to LRN too large"));
     } else {
-        OP_REQUIRES(context, src_tensor.dims() == 4,
-                    errors::InvalidArgument("input must be 4-dimensional"));
-        OP_REQUIRES(context, FastBoundsCheck(src_tensor.NumElements(),
-                                            std::numeric_limits<int>::max()),
-                        errors::InvalidArgument("argument to LRN too large"));
+      OP_REQUIRES(context, src_tensor.dims() == 4,
+                  errors::InvalidArgument("input must be 4-dimensional"));
+      OP_REQUIRES(context,
+                  FastBoundsCheck(src_tensor.NumElements(),
+                                  std::numeric_limits<int>::max()),
+                  errors::InvalidArgument("argument to LRN too large"));
     }
   }
-  const int kIdxInput = 0,
-            kIdxOutput = 0,
-            kIdxWorkspace = 1;
+  const int kIdxInput = 0, kIdxOutput = 0, kIdxWorkspace = 1;
 
   typedef typename Eigen::Tensor<T, 1, Eigen::RowMajor>::DimensionPair DimPair;
   bool workspace_enabled_;
@@ -994,17 +986,17 @@ class MklLRNOp : public OpKernel {
   float beta_;
 };
 
-
 template <typename T>
 class MklLRNGradOp : public OpKernel {
  public:
   explicit MklLRNGradOp(OpKernelConstruction* context) : OpKernel(context) {
     int64 depth_radius64;
     OP_REQUIRES_OK(context, context->GetAttr("depth_radius", &depth_radius64));
-    OP_REQUIRES(context, FastBoundsCheck(depth_radius64,
-                                         std::numeric_limits<int>::max()),
-                errors::InvalidArgument("depth_radius = ", depth_radius64,
-                                        " larger than int max"));
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(depth_radius64, std::numeric_limits<int>::max()),
+        errors::InvalidArgument("depth_radius = ", depth_radius64,
+                                " larger than int max"));
     depth_radius_ = static_cast<int>(depth_radius64);
     OP_REQUIRES_OK(context, context->GetAttr("bias", &bias_));
     OP_REQUIRES_OK(context, context->GetAttr("alpha", &alpha_));
@@ -1025,7 +1017,7 @@ class MklLRNGradOp : public OpKernel {
       MklDnnData<T> output_dnn_data(&cpu_engine);
 
       MklDnnShape input_grad_dnn_shape, orig_input_dnn_shape,
-                  orig_output_dnn_shape;
+          orig_output_dnn_shape;
       GetMklShape(context, kIdxGradient, &input_grad_dnn_shape);
       GetMklShape(context, kIdxOrigInput, &orig_input_dnn_shape);
       GetMklShape(context, kIdxOrigOutput, &orig_output_dnn_shape);
@@ -1037,16 +1029,16 @@ class MklLRNGradOp : public OpKernel {
                             orig_input_dnn_shape.IsMklTensor() &&
                             orig_output_dnn_shape.IsMklTensor() &&
                             input_grad_dnn_shape.IsMklChannelDim(
-                              input_grad_dnn_shape.GetDimension() - 1) &&
+                                input_grad_dnn_shape.GetDimension() - 1) &&
                             orig_input_dnn_shape.IsMklChannelDim(
-                              orig_input_dnn_shape.GetDimension() - 1) &&
+                                orig_input_dnn_shape.GetDimension() - 1) &&
                             orig_output_dnn_shape.IsMklChannelDim(
-                              orig_output_dnn_shape.GetDimension() - 1);
+                                orig_output_dnn_shape.GetDimension() - 1);
 
       if (!can_use_mkldnn) {
-          // Fallback to eigen
-          MklDefaultToEigen(context);
-          return;
+        // Fallback to eigen
+        MklDefaultToEigen(context);
+        return;
       }
       // At this point, we have the all clear to use MklDnn constructs
       // Naming: diff_dst is input_gradient_tensor; src is orig_input_tensor.
@@ -1059,13 +1051,11 @@ class MklLRNGradOp : public OpKernel {
       // NHWC format.
       memory::desc original_output_md = orig_output_dnn_shape.GetCurLayout();
       memory::desc target_diff_dst_md = ConfigureInputGradient(
-                                input_grad_tensor,
-                                input_grad_dnn_shape,
-                                &input_grad_dnn_data);
+          input_grad_tensor, input_grad_dnn_shape, &input_grad_dnn_data);
 
       memory::desc orig_input_md = orig_input_dnn_shape.GetCurLayout();
       memory::dims orig_input_dims =
-                orig_input_dnn_shape.GetSizesAsMklDnnDims();
+          orig_input_dnn_shape.GetSizesAsMklDnnDims();
       orig_input_dnn_data.SetUsrMem(orig_input_md, &orig_input_tensor);
       orig_input_dnn_data.SetOpMemDesc(orig_input_dims, memory::format::nhwc);
 
@@ -1079,27 +1069,21 @@ class MklLRNGradOp : public OpKernel {
 
       // Create LRN backward primitive descriptor. It requires LRN forward
       // primitive descriptor also.
-      auto lrn_fwd_desc = lrn_forward::desc(prop_kind::forward,
-                                        lrn_across_channels,
-                                        orig_input_md,
-                                        kernel_size,
-                                        new_alpha, beta_, bias_);
-      auto lrn_fwd_prim_desc = lrn_forward::primitive_desc(lrn_fwd_desc,
-                                                           cpu_engine);
-      auto lrn_bwd_desc = lrn_backward::desc(lrn_across_channels,
-                                        original_output_md,
-                                        target_diff_dst_md,
-                                        kernel_size,
-                                        new_alpha, beta_, bias_);
-      auto lrn_bwd_prim_desc = lrn_backward::primitive_desc(lrn_bwd_desc,
-                                                          cpu_engine,
-                                                          lrn_fwd_prim_desc);
+      auto lrn_fwd_desc = lrn_forward::desc(
+          prop_kind::forward, lrn_across_channels, orig_input_md, kernel_size,
+          new_alpha, beta_, bias_);
+      auto lrn_fwd_prim_desc =
+          lrn_forward::primitive_desc(lrn_fwd_desc, cpu_engine);
+      auto lrn_bwd_desc = lrn_backward::desc(
+          lrn_across_channels, original_output_md, target_diff_dst_md,
+          kernel_size, new_alpha, beta_, bias_);
+      auto lrn_bwd_prim_desc = lrn_backward::primitive_desc(
+          lrn_bwd_desc, cpu_engine, lrn_fwd_prim_desc);
 
       Tensor* output_tensor = nullptr;
-      memory::format orig_input_format
-                    = orig_input_dnn_shape.GetTfDataFormat();
-      AllocateOutputTensor(context, lrn_bwd_prim_desc,
-            orig_input_dims, orig_input_format, &output_tensor);
+      memory::format orig_input_format = orig_input_dnn_shape.GetTfDataFormat();
+      AllocateOutputTensor(context, lrn_bwd_prim_desc, orig_input_dims,
+                           orig_input_format, &output_tensor);
       OP_REQUIRES_OK(context, context->status());
       CHECK_NOTNULL(output_tensor);
       output_dnn_data.SetUsrMemDataHandle(output_tensor);
@@ -1110,35 +1094,32 @@ class MklLRNGradOp : public OpKernel {
       const Tensor& workspace_tensor = MklGetInput(context, kIdxWorkspace);
       MklDnnData<uint8> workspace_dnn_data(&cpu_engine);
       ConfigureWorkspace(workspace_tensor,
-                          lrn_fwd_prim_desc.workspace_primitive_desc(),
-                          &workspace_dnn_data);
-
-      PrepareAndExecuteNet(lrn_bwd_prim_desc,
-                lrn_fwd_prim_desc,
-                &orig_input_dnn_data,
-                &input_grad_dnn_data,
-                &output_dnn_data,
-                memory::primitive_desc(target_diff_dst_md, cpu_engine),
-                &workspace_dnn_data);
-    } catch (mkldnn::error &e) {
+                         lrn_fwd_prim_desc.workspace_primitive_desc(),
+                         &workspace_dnn_data);
+
+      PrepareAndExecuteNet(
+          lrn_bwd_prim_desc, lrn_fwd_prim_desc, &orig_input_dnn_data,
+          &input_grad_dnn_data, &output_dnn_data,
+          memory::primitive_desc(target_diff_dst_md, cpu_engine),
+          &workspace_dnn_data);
+    } catch (mkldnn::error& e) {
       string error_msg = "Status: " + std::to_string(e.status) +
-                       ", message: " + string(e.message) +
-                       ", in file " + string(__FILE__) + ":" +
-                       std::to_string(__LINE__);
-      OP_REQUIRES_OK(context,
-                     errors::Aborted("Operation received an exception:",
-                     error_msg));
+                         ", message: " + string(e.message) + ", in file " +
+                         string(__FILE__) + ":" + std::to_string(__LINE__);
+      OP_REQUIRES_OK(
+          context,
+          errors::Aborted("Operation received an exception:", error_msg));
     }
   }
 
-  void AllocateOutputTensor(OpKernelContext* context,
-            const lrn_backward::primitive_desc& lrn_bkwd_prim_desc,
-            const memory::dims output_dims_mkl_order,
-            const memory::format& output_tf_format,
-            Tensor** output_tensor) {
+  void AllocateOutputTensor(
+      OpKernelContext* context,
+      const lrn_backward::primitive_desc& lrn_bkwd_prim_desc,
+      const memory::dims output_dims_mkl_order,
+      const memory::format& output_tf_format, Tensor** output_tensor) {
     CHECK_NOTNULL(output_tensor);
-    memory::primitive_desc dst_pd
-                = lrn_bkwd_prim_desc.diff_src_primitive_desc();
+    memory::primitive_desc dst_pd =
+        lrn_bkwd_prim_desc.diff_src_primitive_desc();
     MklDnnShape output_mkl_shape;
 
     // We assume that all outputs at this point are MKL Tensors
@@ -1146,170 +1127,153 @@ class MklLRNGradOp : public OpKernel {
     output_mkl_shape.SetMklLayout(&dst_pd);
     output_mkl_shape.SetElemType(MklDnnType<T>());
     output_mkl_shape.SetTfLayout(output_dims_mkl_order.size(),
-                              output_dims_mkl_order,
-                              output_tf_format);
+                                 output_dims_mkl_order, output_tf_format);
 
     TensorShape output_tf_shape;
     size_t num_bytes = dst_pd.get_size();
     CHECK_EQ(num_bytes % sizeof(T), 0);
     output_tf_shape.AddDim(num_bytes / sizeof(T));
-    AllocateOutputSetMklShape(context, kIdxOutput,
-                            output_tensor,
-                            output_tf_shape, output_mkl_shape);
+    AllocateOutputSetMklShape(context, kIdxOutput, output_tensor,
+                              output_tf_shape, output_mkl_shape);
   }
 
   memory::desc ConfigureInputGradient(const Tensor& input_grad_tensor,
-                    const MklDnnShape& input_grad_dnn_shape,
-                    MklDnnData<T> *input_grad_dnn_data) {
+                                      const MklDnnShape& input_grad_dnn_shape,
+                                      MklDnnData<T>* input_grad_dnn_data) {
     CHECK_NOTNULL(input_grad_dnn_data);
     // This shouldn't be necessary at this point, but just in case
     CHECK_EQ(input_grad_dnn_shape.IsMklTensor(), true);
 
     memory::desc input_grad_md = input_grad_dnn_shape.GetCurLayout();
-    memory::dims orig_input_dims =
-              input_grad_dnn_shape.GetSizesAsMklDnnDims();
+    memory::dims orig_input_dims = input_grad_dnn_shape.GetSizesAsMklDnnDims();
     input_grad_dnn_data->SetUsrMem(input_grad_md, &input_grad_tensor);
     input_grad_dnn_data->SetOpMemDesc(orig_input_dims, memory::format::nhwc);
     return input_grad_md;
   }
 
   void PrepareAndExecuteNet(
-        const lrn_backward::primitive_desc& lrn_bkwd_desc,
-        const lrn_forward::primitive_desc& lrn_fwd_desc,
-        MklDnnData<T>* src_dnn_data,
-        MklDnnData<T>* input_gradient_diff_dst,
-        MklDnnData<T>* output_diff_src,
-        const memory::primitive_desc& target_diff_dst_pd,
-        const MklDnnData<uint8>* workspace_dnn_data = nullptr) {
+      const lrn_backward::primitive_desc& lrn_bkwd_desc,
+      const lrn_forward::primitive_desc& lrn_fwd_desc,
+      MklDnnData<T>* src_dnn_data, MklDnnData<T>* input_gradient_diff_dst,
+      MklDnnData<T>* output_diff_src,
+      const memory::primitive_desc& target_diff_dst_pd,
+      const MklDnnData<uint8>* workspace_dnn_data = nullptr) {
     std::vector<primitive> net;
 
     // Check for input reordering on the diff dst input
     input_gradient_diff_dst->CheckReorderToOpMem(
-                lrn_bkwd_desc.diff_dst_primitive_desc(), &net);
+        lrn_bkwd_desc.diff_dst_primitive_desc(), &net);
 
     // Check for input reordering on the original input
-    src_dnn_data->CheckReorderToOpMem(lrn_fwd_desc.src_primitive_desc(),
-                                      &net);
+    src_dnn_data->CheckReorderToOpMem(lrn_fwd_desc.src_primitive_desc(), &net);
     // Create pooling primitive and add it to net
     if (nullptr == workspace_dnn_data) {
-      net.push_back(lrn_backward(lrn_bkwd_desc,
-                              src_dnn_data->GetOpMem(),
-                              input_gradient_diff_dst->GetOpMem(),
-                              output_diff_src->GetOpMem()));
+      net.push_back(lrn_backward(lrn_bkwd_desc, src_dnn_data->GetOpMem(),
+                                 input_gradient_diff_dst->GetOpMem(),
+                                 output_diff_src->GetOpMem()));
     } else {
-      net.push_back(lrn_backward(lrn_bkwd_desc,
-                                src_dnn_data->GetOpMem(),
-                                  input_gradient_diff_dst->GetOpMem(),
-                                  workspace_dnn_data->GetOpMem(),
-                                  output_diff_src->GetOpMem()));
+      net.push_back(lrn_backward(lrn_bkwd_desc, src_dnn_data->GetOpMem(),
+                                 input_gradient_diff_dst->GetOpMem(),
+                                 workspace_dnn_data->GetOpMem(),
+                                 output_diff_src->GetOpMem()));
     }
     stream(stream::kind::eager).submit(net).wait();
   }
 
   void ConfigureWorkspace(const Tensor& workspace_tensor,
-                    memory::primitive_desc workspace_pd,
-                    MklDnnData<uint8> *workspace_dnn_data) {
+                          memory::primitive_desc workspace_pd,
+                          MklDnnData<uint8>* workspace_dnn_data) {
     CHECK_NOTNULL(workspace_dnn_data);
 
     workspace_dnn_data->SetUsrMem(workspace_pd, &workspace_tensor);
   }
 
-    // Fallback implementation - Taken from lrn_op.cc
-    // TODO(intelft) Check if we can use EigenLRNOp directly instead of making a
-    // copy.
-    void MklDefaultToEigen(OpKernelContext* context) {
-      Tensor input_gradient_tensor;
-      Tensor orig_input_tensor;
-      Tensor orig_output_tensor;
-
-      MklDnnShape input_grad_dnn_shape, orig_input_dnn_shape,
-                  orig_output_dnn_shape;
-      GetMklShape(context, kIdxGradient, &input_grad_dnn_shape);
-      GetMklShape(context, kIdxOrigInput, &orig_input_dnn_shape);
-      GetMklShape(context, kIdxOrigOutput, &orig_output_dnn_shape);
-
-      if (input_grad_dnn_shape.IsMklTensor()) {
-        input_gradient_tensor =
-            ConvertMklToTF<T>(context,
-                              MklGetInput(context, kIdxGradient),
-                              input_grad_dnn_shape);
-      } else {
-        input_gradient_tensor = MklGetInput(context, kIdxGradient);
-      }
-
-      if (orig_input_dnn_shape.IsMklTensor()) {
-        orig_input_tensor =
-            ConvertMklToTF<T>(context,
-                              MklGetInput(context, kIdxOrigInput),
-                              orig_input_dnn_shape);
-      } else {
-        orig_input_tensor = MklGetInput(context, kIdxOrigInput);
-      }
+  // Fallback implementation - Taken from lrn_op.cc
+  // TODO(intelft) Check if we can use EigenLRNOp directly instead of making a
+  // copy.
+  void MklDefaultToEigen(OpKernelContext* context) {
+    Tensor input_gradient_tensor;
+    Tensor orig_input_tensor;
+    Tensor orig_output_tensor;
+
+    MklDnnShape input_grad_dnn_shape, orig_input_dnn_shape,
+        orig_output_dnn_shape;
+    GetMklShape(context, kIdxGradient, &input_grad_dnn_shape);
+    GetMklShape(context, kIdxOrigInput, &orig_input_dnn_shape);
+    GetMklShape(context, kIdxOrigOutput, &orig_output_dnn_shape);
+
+    if (input_grad_dnn_shape.IsMklTensor()) {
+      input_gradient_tensor = ConvertMklToTF<T>(
+          context, MklGetInput(context, kIdxGradient), input_grad_dnn_shape);
+    } else {
+      input_gradient_tensor = MklGetInput(context, kIdxGradient);
+    }
 
-      if (orig_output_dnn_shape.IsMklTensor()) {
-        orig_output_tensor =
-            ConvertMklToTF<T>(context,
-                              MklGetInput(context, kIdxOrigOutput),
-                              orig_output_dnn_shape);
-      } else {
-        orig_output_tensor = MklGetInput(context, kIdxOrigOutput);
-      }
+    if (orig_input_dnn_shape.IsMklTensor()) {
+      orig_input_tensor = ConvertMklToTF<T>(
+          context, MklGetInput(context, kIdxOrigInput), orig_input_dnn_shape);
+    } else {
+      orig_input_tensor = MklGetInput(context, kIdxOrigInput);
+    }
 
-      const int64 batch = static_cast<int64>(input_gradient_tensor.dim_size(0));
-      const int64 rows = static_cast<int64>(input_gradient_tensor.dim_size(1));
-      const int64 cols = static_cast<int64>(input_gradient_tensor.dim_size(2));
-      const int64 depth = static_cast<int64>(input_gradient_tensor.dim_size(3));
-      const auto nodes = cols * rows;
+    if (orig_output_dnn_shape.IsMklTensor()) {
+      orig_output_tensor = ConvertMklToTF<T>(
+          context, MklGetInput(context, kIdxOrigOutput), orig_output_dnn_shape);
+    } else {
+      orig_output_tensor = MklGetInput(context, kIdxOrigOutput);
+    }
 
-      auto grads_shaped =
-          input_gradient_tensor.shaped<T, 2>({nodes * batch, depth});
+    const int64 batch = static_cast<int64>(input_gradient_tensor.dim_size(0));
+    const int64 rows = static_cast<int64>(input_gradient_tensor.dim_size(1));
+    const int64 cols = static_cast<int64>(input_gradient_tensor.dim_size(2));
+    const int64 depth = static_cast<int64>(input_gradient_tensor.dim_size(3));
+    const auto nodes = cols * rows;
 
-      auto in_shaped = orig_input_tensor.shaped<T, 2>({nodes * batch, depth});
-      auto activations =
-              orig_output_tensor.shaped<T, 2>({nodes * batch, depth});
+    auto grads_shaped =
+        input_gradient_tensor.shaped<T, 2>({nodes * batch, depth});
 
-      Tensor* output_dnn_data;
-      MklShape mkl_output_mkl_shape;
-      mkl_output_mkl_shape.SetMklTensor(false);
-      mkl_output_mkl_shape.SetDimensions(4);
-      AllocateOutputSetMklShape(context, kIdxOutput,
-                      &output_dnn_data,
-                      input_gradient_tensor.shape(),
-                      mkl_output_mkl_shape);
+    auto in_shaped = orig_input_tensor.shaped<T, 2>({nodes * batch, depth});
+    auto activations = orig_output_tensor.shaped<T, 2>({nodes * batch, depth});
 
-      auto out_shaped = output_dnn_data->shaped<T, 2>({nodes * batch, depth});
-      out_shaped.setZero();
-      auto shard = [this, activations, in_shaped, grads_shaped, out_shaped,
-                    depth](int64 begin, int64 end) {
-        for (int64 i = begin; i < end; ++i) {
-          for (int64 j = 0; j < depth; ++j) {
-            int64 depth_begin = std::max<int64>(0, j - depth_radius_);
-            int64 depth_end = std::min<int64>(depth, j + depth_radius_ + 1);
+    Tensor* output_dnn_data;
+    MklShape mkl_output_mkl_shape;
+    mkl_output_mkl_shape.SetMklTensor(false);
+    mkl_output_mkl_shape.SetDimensions(4);
+    AllocateOutputSetMklShape(context, kIdxOutput, &output_dnn_data,
+                              input_gradient_tensor.shape(),
+                              mkl_output_mkl_shape);
 
-            T norm(0);
-            for (int64 k = depth_begin; k < depth_end; ++k) {
-              norm += in_shaped(i, k) * in_shaped(i, k);
-            }
-            norm = alpha_ * norm + bias_;
-            DCHECK_GT(norm, T(1e-6));
-            for (int64 k = depth_begin; k < depth_end; ++k) {
-              T dyi = T(-2) * alpha_ * beta_ * in_shaped(i, k) *
-                      activations(i, j) / norm;
-              if (k == j) {
-                dyi += Eigen::numext::pow(norm, -beta_);
-              }
-              dyi *= grads_shaped(i, j);
-              const_cast<typename TTypes<T, 2>::Tensor&>(out_shaped)(i, k) +=
-                  dyi;
+    auto out_shaped = output_dnn_data->shaped<T, 2>({nodes * batch, depth});
+    out_shaped.setZero();
+    auto shard = [this, activations, in_shaped, grads_shaped, out_shaped,
+                  depth](int64 begin, int64 end) {
+      for (int64 i = begin; i < end; ++i) {
+        for (int64 j = 0; j < depth; ++j) {
+          int64 depth_begin = std::max<int64>(0, j - depth_radius_);
+          int64 depth_end = std::min<int64>(depth, j + depth_radius_ + 1);
+
+          T norm(0);
+          for (int64 k = depth_begin; k < depth_end; ++k) {
+            norm += in_shaped(i, k) * in_shaped(i, k);
+          }
+          norm = alpha_ * norm + bias_;
+          DCHECK_GT(norm, T(1e-6));
+          for (int64 k = depth_begin; k < depth_end; ++k) {
+            T dyi = T(-2) * alpha_ * beta_ * in_shaped(i, k) *
+                    activations(i, j) / norm;
+            if (k == j) {
+              dyi += Eigen::numext::pow(norm, -beta_);
             }
+            dyi *= grads_shaped(i, j);
+            const_cast<typename TTypes<T, 2>::Tensor&>(out_shaped)(i, k) += dyi;
           }
         }
-      };
-      auto worker_threads =
-          *(context->device()->tensorflow_cpu_worker_threads());
-      Shard(worker_threads.num_threads, worker_threads.workers, nodes * batch,
-            depth * depth, shard);
-    }
+      }
+    };
+    auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads());
+    Shard(worker_threads.num_threads, worker_threads.workers, nodes * batch,
+          depth * depth, shard);
+  }
 
   void SanityCheckInputs(OpKernelContext* context) {
     const Tensor& input_gradient_tensor = MklGetInput(context, kIdxGradient);
@@ -1317,59 +1281,59 @@ class MklLRNGradOp : public OpKernel {
     const Tensor& orig_output_tensor = MklGetInput(context, kIdxOrigOutput);
     const Tensor& workspace_tensor = MklGetInput(context, kIdxWorkspace);
     MklDnnShape in_grads_dnn_shape, in_image_dnn_shape, out_image_dnn_shape,
-                workspace_dnn_shape;
+        workspace_dnn_shape;
     GetMklShape(context, kIdxGradient, &in_grads_dnn_shape);
     GetMklShape(context, kIdxOrigInput, &in_image_dnn_shape);
     GetMklShape(context, kIdxOrigOutput, &out_image_dnn_shape);
     GetMklShape(context, kIdxWorkspace, &workspace_dnn_shape);
     if (in_grads_dnn_shape.IsMklTensor()) {
       OP_REQUIRES(context, in_grads_dnn_shape.GetDimension() == 4,
-                errors::InvalidArgument("Input gradient must be "
-                "4-dimensional"));
+                  errors::InvalidArgument("Input gradient must be "
+                                          "4-dimensional"));
     } else {
-      OP_REQUIRES(context, input_gradient_tensor.dims() == 4,
-            errors::InvalidArgument("input gradient must be 4-dimensional"));
+      OP_REQUIRES(
+          context, input_gradient_tensor.dims() == 4,
+          errors::InvalidArgument("input gradient must be 4-dimensional"));
     }
 
     if (in_image_dnn_shape.IsMklTensor()) {
       OP_REQUIRES(context, in_image_dnn_shape.GetDimension() == 4,
-                errors::InvalidArgument("input images must be "
-                "4-dimensional"));
+                  errors::InvalidArgument("input images must be "
+                                          "4-dimensional"));
     } else {
       OP_REQUIRES(context, orig_input_tensor.dims() == 4,
                   errors::InvalidArgument("input images must be "
-                  "4-dimensional"));
+                                          "4-dimensional"));
     }
 
     if (out_image_dnn_shape.IsMklTensor()) {
       OP_REQUIRES(context, out_image_dnn_shape.GetDimension() == 4,
-                errors::InvalidArgument("Output image must be "
-                "4-dimensional"));
+                  errors::InvalidArgument("Output image must be "
+                                          "4-dimensional"));
     } else {
-      OP_REQUIRES(context, orig_output_tensor.dims() == 4,
-            errors::InvalidArgument("Output image must be 4-dimensional"));
+      OP_REQUIRES(
+          context, orig_output_tensor.dims() == 4,
+          errors::InvalidArgument("Output image must be 4-dimensional"));
     }
 
     if (workspace_enabled_) {
       if (workspace_dnn_shape.IsMklTensor()) {
-        OP_REQUIRES(context, workspace_dnn_shape.IsMklTensor() == false,
-              errors::InvalidArgument("Workspace should not be MKL Tensor."));
+        OP_REQUIRES(
+            context, workspace_dnn_shape.IsMklTensor() == false,
+            errors::InvalidArgument("Workspace should not be MKL Tensor."));
       } else {
         OP_REQUIRES(context, workspace_tensor.dims() == 1,
-              errors::InvalidArgument("Workspace must be 1-dimensional"));
+                    errors::InvalidArgument("Workspace must be 1-dimensional"));
       }
     }
   }
 
-// Input("input_grads: T")
-// Input("input_image: T")
-// Input("output_image: T")
-// Input("workspace: uint8")
-  const int kIdxGradient = 0,
-            kIdxOrigInput = 1,
-            kIdxOrigOutput = 2,
-            kIdxWorkspace = 3,
-            kIdxOutput = 0;
+  // Input("input_grads: T")
+  // Input("input_image: T")
+  // Input("output_image: T")
+  // Input("workspace: uint8")
+  const int kIdxGradient = 0, kIdxOrigInput = 1, kIdxOrigOutput = 2,
+            kIdxWorkspace = 3, kIdxOutput = 0;
 
   typedef typename Eigen::Tensor<T, 1, Eigen::RowMajor>::DimensionPair DimPair;
   bool workspace_enabled_;
@@ -1379,7 +1343,7 @@ class MklLRNGradOp : public OpKernel {
   float beta_;
 };
 
-#endif  // INTEL_MKL_DNN
+#endif  // INTEL_MKL_ML
 
 #define REGISTER_MKL_LRN_CPU(T)                                     \
   REGISTER_KERNEL_BUILDER(Name("_MklLRN")                           \
@@ -1393,7 +1357,6 @@ class MklLRNGradOp : public OpKernel {
                               .Label(mkl_op_registry::kMklOpLabel), \
                           MklLRNGradOp<T>);
 
-
 TF_CALL_float(REGISTER_MKL_LRN_CPU);
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/mkl_maxpooling_op.cc b/tensorflow/core/kernels/mkl_maxpooling_op.cc
index 82c5229bab0cfef51799d521d6ced6fab804176c..14607f26e0ccd1028dd62343000d90ac8451d7bb 100644
--- a/tensorflow/core/kernels/mkl_maxpooling_op.cc
+++ b/tensorflow/core/kernels/mkl_maxpooling_op.cc
@@ -22,25 +22,25 @@ limitations under the License.
 #include "tensorflow/core/util/mkl_util.h"
 #include "tensorflow/core/util/padding.h"
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 #include <algorithm>
 #include "mkldnn.hpp"
-using mkldnn::memory;
+using mkldnn::algorithm;
+using mkldnn::engine;
 using mkldnn::error;
-using mkldnn::pooling_forward;
-using mkldnn::pooling_backward;
+using mkldnn::memory;
 using mkldnn::padding_kind;
-using mkldnn::engine;
+using mkldnn::pooling_backward;
+using mkldnn::pooling_forward;
 using mkldnn::prop_kind;
-using mkldnn::algorithm;
 #endif
 
 namespace tensorflow {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
 
-// For now, MKL-ML is default. So making MKL-DNN not a default choice.
-#ifndef INTEL_MKL_DNN
+// MKL-DNN is now default. MKL-ML must be specified explicitly.
+#ifdef INTEL_MKL_ML
 
 // An implementation of MaxPooling (forward).
 template <typename Device, typename T>
@@ -397,18 +397,19 @@ class MklMaxPoolingGradOp : public OpKernel {
       if (workspace_enabled == false) {
         if (convert_input != nullptr) {
           if (input_in_mkl_format == false) {
-            CHECK_EQ(
-                dnnConversionExecute_F32(
-                    convert_input, const_cast<void*>(static_cast<const void*>(
-                                       tensor_in.flat<T>().data())),
-                    input_buf),
-                E_SUCCESS);
+            CHECK_EQ(dnnConversionExecute_F32(
+                         convert_input,
+                         const_cast<void*>(static_cast<const void*>(
+                             tensor_in.flat<T>().data())),
+                         input_buf),
+                     E_SUCCESS);
             CHECK_EQ(dnnDelete_F32(convert_input), E_SUCCESS);
             convert_input = nullptr;
           } else {
             input_shape.GetConvertedFlatData(
-                lt_input_prim, const_cast<void*>(static_cast<const void*>(
-                                   tensor_in.flat<T>().data())),
+                lt_input_prim,
+                const_cast<void*>(
+                    static_cast<const void*>(tensor_in.flat<T>().data())),
                 input_buf);
           }
           pooling_resfwd[dnnResourceSrc] = input_buf;
@@ -453,8 +454,9 @@ class MklMaxPoolingGradOp : public OpKernel {
           CHECK_EQ(dnnDelete_F32(convert_outbackprop), E_SUCCESS);
         } else {
           output_backprop_shape.GetConvertedFlatData(
-              lt_outbackprop_prim, const_cast<void*>(static_cast<const void*>(
-                                       out_backprop.flat<T>().data())),
+              lt_outbackprop_prim,
+              const_cast<void*>(
+                  static_cast<const void*>(out_backprop.flat<T>().data())),
               outbackprop_buf);
         }
         pooling_res[dnnResourceDiffDst] = outbackprop_buf;
@@ -492,14 +494,14 @@ class MklMaxPoolingGradOp : public OpKernel {
   bool workspace_enabled_;
 };  // MklMaxPoolingGradOp
 
-#else  // INTEL_MKL_DNN is defined
+#else
 
 // An implementation of MaxPooling (forward).
 template <typename Device, typename T>
 class MklMaxPoolingOp : public MklPoolingForwardOpBase<T> {
  public:
   explicit MklMaxPoolingOp(OpKernelConstruction* context)
-            : MklPoolingForwardOpBase<T>(context) {
+      : MklPoolingForwardOpBase<T>(context) {
     // In Max Pooling, MKLDNN does not allow passing workspace as NULL.
     // So we set workspace_enabled_ to true.
     this->workspace_enabled_ = true;
@@ -508,8 +510,8 @@ class MklMaxPoolingOp : public MklPoolingForwardOpBase<T> {
   void Compute(OpKernelContext* context) override {
     try {
       auto cpu_engine = engine(engine::cpu, 0);
-      const Tensor& input_tensor = MklGetInput(context,
-                this->kInputTensorIndexInput);
+      const Tensor& input_tensor =
+          MklGetInput(context, this->kInputTensorIndexInput);
       MklDnnShape dnn_shape_input;
       GetMklShape(context, this->kInputTensorIndexInput, &dnn_shape_input);
       this->SanityCheckInput(context, input_tensor, dnn_shape_input);
@@ -522,9 +524,8 @@ class MklMaxPoolingOp : public MklPoolingForwardOpBase<T> {
       // initialize variables for the pooling op
       MklPoolParameters pool_params;
       // Get the input tensor and initialize the pooling parameters
-      this->ConfigureInput(context, dnn_shape_input,
-                        input_tensor, &pool_params,
-                        &dnn_data_input);
+      this->ConfigureInput(context, dnn_shape_input, input_tensor, &pool_params,
+                           &dnn_data_input);
       OP_REQUIRES_OK(context, context->status());
 
       // Declare output tensor
@@ -535,9 +536,10 @@ class MklMaxPoolingOp : public MklPoolingForwardOpBase<T> {
       // If input is in Mkl layout, then just get the memory format from it
       // directly, instead of using input data_format to MaxPool.
       if (dnn_shape_input.IsMklTensor()) {
-        dnn_data_output.SetUsrMem(output_dims_mkl_order,
-                                  static_cast<memory::format>(
-              dnn_data_input.GetUsrMemDesc().data.format));
+        dnn_data_output.SetUsrMem(
+            output_dims_mkl_order,
+            static_cast<memory::format>(
+                dnn_data_input.GetUsrMemDesc().data.format));
       } else {
         dnn_data_output.SetUsrMem(output_dims_mkl_order,
                                   this->data_format_mkldnn_);
@@ -546,24 +548,21 @@ class MklMaxPoolingOp : public MklPoolingForwardOpBase<T> {
       // describe the memory layout; let mkl-dnn choose the best for the op
       dnn_data_output.SetOpMemDesc(output_dims_mkl_order, memory::format::any);
 
-      auto pool_desc = pooling_forward::desc(prop_kind::forward,
-            algorithm::pooling_max,
-            dnn_data_input.GetUsrMemDesc(),
-            dnn_data_output.GetUsrMemDesc(),
-            memory::dims({  pool_params.row_stride,
-                            pool_params.col_stride}),
-            memory::dims({  pool_params.window_rows,
-                            pool_params.window_cols}),
-            memory::dims({  static_cast<int>(pool_params.pad_top),
-                            static_cast<int>(pool_params.pad_left)}),
-            memory::dims({  static_cast<int>(pool_params.pad_bottom),
-                            static_cast<int>(pool_params.pad_right)}),
-            TFPaddingToMklDnnPadding(this->padding_));
-        auto pool_fwd_desc = pooling_forward::primitive_desc(pool_desc,
-            cpu_engine);
+      auto pool_desc = pooling_forward::desc(
+          prop_kind::forward, algorithm::pooling_max,
+          dnn_data_input.GetUsrMemDesc(), dnn_data_output.GetUsrMemDesc(),
+          memory::dims({pool_params.row_stride, pool_params.col_stride}),
+          memory::dims({pool_params.window_rows, pool_params.window_cols}),
+          memory::dims({static_cast<int>(pool_params.pad_top),
+                        static_cast<int>(pool_params.pad_left)}),
+          memory::dims({static_cast<int>(pool_params.pad_bottom),
+                        static_cast<int>(pool_params.pad_right)}),
+          TFPaddingToMklDnnPadding(this->padding_));
+      auto pool_fwd_desc =
+          pooling_forward::primitive_desc(pool_desc, cpu_engine);
 
       this->AllocateOutputTensor(context, pool_fwd_desc, output_dims_mkl_order,
-                            this->data_format_mkldnn_, &output_tensor);
+                                 this->data_format_mkldnn_, &output_tensor);
       OP_REQUIRES_OK(context, context->status());
       dnn_data_output.SetUsrMemDataHandle(output_tensor);
 
@@ -571,39 +570,38 @@ class MklMaxPoolingOp : public MklPoolingForwardOpBase<T> {
       OP_REQUIRES_OK(context, context->status());
 
       this->PrepareAndExecuteNet(pool_fwd_desc, &dnn_data_input,
-                        &dnn_data_output, &dnn_data_wksp);
-    } catch (mkldnn::error &e) {
-        string error_msg = "Status: " + std::to_string(e.status) +
-                        ", message: " + string(e.message) +
-                        ", in file " + string(__FILE__) + ":" +
-                        std::to_string(__LINE__);
-        OP_REQUIRES_OK(context,
-                        errors::Aborted("Compute received an exception:",
-                                         error_msg));
+                                 &dnn_data_output, &dnn_data_wksp);
+    } catch (mkldnn::error& e) {
+      string error_msg = "Status: " + std::to_string(e.status) +
+                         ", message: " + string(e.message) + ", in file " +
+                         string(__FILE__) + ":" + std::to_string(__LINE__);
+      OP_REQUIRES_OK(context, errors::Aborted("Compute received an exception:",
+                                              error_msg));
     }
   }  // Compute
 
  private:
-    const int kOutputTensorIndexWorkspace = 1;
-
-    void AllocateWorkspaceTensor(OpKernelContext* context,
-                const pooling_forward::primitive_desc& pool_fwd_prim_desc,
-                MklDnnData<uint8>* dnn_data_wksp) {
-        CHECK_NOTNULL(dnn_data_wksp);
-        Tensor* workspace_tensor = nullptr;
-        memory::primitive_desc workspace_pd
-                    = pool_fwd_prim_desc.workspace_primitive_desc();
-        size_t workspace_bytes = workspace_pd.get_size();
-        MklDnnShape workspace_mkl_shape;
-        workspace_mkl_shape.SetMklTensor(false);
-        TensorShape workspace_tf_shape;
-        workspace_tf_shape.AddDim(workspace_bytes);
-        AllocateOutputSetMklShape(context, kOutputTensorIndexWorkspace,
-                                &workspace_tensor,
-                                workspace_tf_shape, workspace_mkl_shape);
-        CHECK_NOTNULL(workspace_tensor);
-        dnn_data_wksp->SetUsrMem(workspace_pd, workspace_tensor);
-    }
+  const int kOutputTensorIndexWorkspace = 1;
+
+  void AllocateWorkspaceTensor(
+      OpKernelContext* context,
+      const pooling_forward::primitive_desc& pool_fwd_prim_desc,
+      MklDnnData<uint8>* dnn_data_wksp) {
+    CHECK_NOTNULL(dnn_data_wksp);
+    Tensor* workspace_tensor = nullptr;
+    memory::primitive_desc workspace_pd =
+        pool_fwd_prim_desc.workspace_primitive_desc();
+    size_t workspace_bytes = workspace_pd.get_size();
+    MklDnnShape workspace_mkl_shape;
+    workspace_mkl_shape.SetMklTensor(false);
+    TensorShape workspace_tf_shape;
+    workspace_tf_shape.AddDim(workspace_bytes);
+    AllocateOutputSetMklShape(context, kOutputTensorIndexWorkspace,
+                              &workspace_tensor, workspace_tf_shape,
+                              workspace_mkl_shape);
+    CHECK_NOTNULL(workspace_tensor);
+    dnn_data_wksp->SetUsrMem(workspace_pd, workspace_tensor);
+  }
 };
 
 // The operation to compute MaxPool gradients.
@@ -616,221 +614,186 @@ template <class Device, class T>
 class MklMaxPoolingGradOp : public MklPoolingBackwardOpBase<T> {
  public:
   explicit MklMaxPoolingGradOp(OpKernelConstruction* context)
-      : MklPoolingBackwardOpBase<T>(context) {
-  }
+      : MklPoolingBackwardOpBase<T>(context) {}
 
   void Compute(OpKernelContext* context) override {
     try {
-        auto cpu_engine = engine(engine::cpu, 0);
-        const Tensor& orig_input_tensor = MklGetInput(context,
-            kInputTensorIndexOrigInput);
-        const Tensor& orig_output_tensor = MklGetInput(context,
-            kInputTensorIndexOrigOutput);
-        const Tensor& grad_tensor = MklGetInput(context,
-            kInputTensorIndexGradient);
-        const Tensor& workspace_tensor = MklGetInput(context,
-            kInputTensorIndexWorkspace);
-        MklDnnShape orig_input_mkl_shape,
-                    orig_output_mkl_shape,
-                    grad_mkl_shape,
-                    workspace_mkl_shape;
-        GetMklShape(context, kInputTensorIndexOrigInput,
-            &orig_input_mkl_shape);
-        GetMklShape(context, kInputTensorIndexOrigOutput,
-            &orig_output_mkl_shape);
-        GetMklShape(context, kInputTensorIndexGradient,
-            &grad_mkl_shape);
-        GetMklShape(context, kInputTensorIndexWorkspace,
-            &workspace_mkl_shape);
-
-        SanityCheckInputs(context,
-                            orig_input_tensor, orig_output_tensor,
-                            grad_tensor, workspace_tensor,
-                            orig_input_mkl_shape, orig_output_mkl_shape,
-                            grad_mkl_shape, workspace_mkl_shape);
-        if (!context->status().ok()) return;
-
-        MklDnnData<T> grad_dnn_data(&cpu_engine);
-        MklDnnData<uint8> workspace_dnn_data(&cpu_engine);
-        MklDnnData<T> output_dnn_data(&cpu_engine);
-        Tensor* output_tensor = nullptr;
-        MklPoolParameters pool_params;
-        TensorShape orig_input_shape;
-        memory::dims output_dims_mkl_order, orig_input_dims_mkl_order;
-        memory::desc original_input_md = ConfigureOriginalInput(context,
-                                orig_input_tensor,
-                                orig_input_mkl_shape,
-                                &orig_input_dims_mkl_order,
-                                &pool_params,
-                                &orig_input_shape);
-
-        memory::desc original_output_md = this->ConfigureOriginalOutput(
-                                pool_params,
-                                orig_output_mkl_shape,
-                                output_dims_mkl_order);
-
-        memory::desc target_diff_dst_md =  this->ConfigureInputGradient(
-                                        grad_mkl_shape,
-                                        grad_tensor,
-                                        &grad_dnn_data,
-                                        original_output_md);
-
-        output_dnn_data.SetUsrMem(original_input_md);
-
-        // Create the forward pooling primitive descriptor so we can
-        // pass it as a hint to the backward pooling primitive descriptor
-        auto pool_fwd_desc = pooling_forward::desc(prop_kind::forward,
-                algorithm::pooling_max,
-                original_input_md,
-                original_output_md,
-                memory::dims({  pool_params.row_stride,
-                                pool_params.col_stride}),
-                memory::dims({  pool_params.window_rows,
-                                pool_params.window_cols}),
-                memory::dims({  static_cast<int>(pool_params.pad_top),
-                                static_cast<int>(pool_params.pad_left)}),
-                memory::dims({  static_cast<int>(pool_params.pad_bottom),
-                                static_cast<int>(pool_params.pad_right)}),
-                TFPaddingToMklDnnPadding(this->padding_));
-        auto pool_fwd_prim_desc
-                = pooling_forward::primitive_desc(pool_fwd_desc,
-                                                    cpu_engine);
-
-        auto pool_bkwd_desc = pooling_backward::desc(
-                algorithm::pooling_max,
-                output_dnn_data.GetUsrMemDesc(),
-                target_diff_dst_md,
-                memory::dims({  pool_params.row_stride,
-                                pool_params.col_stride}),
-                memory::dims({  pool_params.window_rows,
-                                pool_params.window_cols}),
-                memory::dims({  static_cast<int>(pool_params.pad_top),
-                                static_cast<int>(pool_params.pad_left)}),
-                memory::dims({  static_cast<int>(pool_params.pad_bottom),
-                                static_cast<int>(pool_params.pad_right)}),
-                TFPaddingToMklDnnPadding(this->padding_));
-        auto pool_bkwd_prim_desc
-            = pooling_backward::primitive_desc(pool_bkwd_desc,
-                                                cpu_engine,
-                                                pool_fwd_prim_desc);
-
-        this->AllocateOutputTensor(context, pool_bkwd_prim_desc,
-            orig_input_dims_mkl_order,
-            this->data_format_mkldnn_,
-            &output_tensor);
-        output_dnn_data.SetUsrMemDataHandle(output_tensor);
-
-        ConfigureWorkspace(workspace_tensor,
-                pool_fwd_prim_desc.workspace_primitive_desc(),
-                &workspace_dnn_data);
-        this->PrepareAndExecuteNet(pool_bkwd_prim_desc,
-                            &grad_dnn_data,
-                            &output_dnn_data,
-                            memory::primitive_desc(
-                                target_diff_dst_md,
-                                cpu_engine),
-                            &workspace_dnn_data);
-    } catch (mkldnn::error &e) {
-        string error_msg = "Status: " + std::to_string(e.status) +
-                        ", message: " + string(e.message) +
-                        ", in file " + string(__FILE__) + ":" +
-                        std::to_string(__LINE__);
-        OP_REQUIRES_OK(context,
-                        errors::Aborted("Compute received an exception:",
-                                         error_msg));
+      auto cpu_engine = engine(engine::cpu, 0);
+      const Tensor& orig_input_tensor =
+          MklGetInput(context, kInputTensorIndexOrigInput);
+      const Tensor& orig_output_tensor =
+          MklGetInput(context, kInputTensorIndexOrigOutput);
+      const Tensor& grad_tensor =
+          MklGetInput(context, kInputTensorIndexGradient);
+      const Tensor& workspace_tensor =
+          MklGetInput(context, kInputTensorIndexWorkspace);
+      MklDnnShape orig_input_mkl_shape, orig_output_mkl_shape, grad_mkl_shape,
+          workspace_mkl_shape;
+      GetMklShape(context, kInputTensorIndexOrigInput, &orig_input_mkl_shape);
+      GetMklShape(context, kInputTensorIndexOrigOutput, &orig_output_mkl_shape);
+      GetMklShape(context, kInputTensorIndexGradient, &grad_mkl_shape);
+      GetMklShape(context, kInputTensorIndexWorkspace, &workspace_mkl_shape);
+
+      SanityCheckInputs(context, orig_input_tensor, orig_output_tensor,
+                        grad_tensor, workspace_tensor, orig_input_mkl_shape,
+                        orig_output_mkl_shape, grad_mkl_shape,
+                        workspace_mkl_shape);
+      if (!context->status().ok()) return;
+
+      MklDnnData<T> grad_dnn_data(&cpu_engine);
+      MklDnnData<uint8> workspace_dnn_data(&cpu_engine);
+      MklDnnData<T> output_dnn_data(&cpu_engine);
+      Tensor* output_tensor = nullptr;
+      MklPoolParameters pool_params;
+      TensorShape orig_input_shape;
+      memory::dims output_dims_mkl_order, orig_input_dims_mkl_order;
+      memory::desc original_input_md = ConfigureOriginalInput(
+          context, orig_input_tensor, orig_input_mkl_shape,
+          &orig_input_dims_mkl_order, &pool_params, &orig_input_shape);
+
+      memory::desc original_output_md = this->ConfigureOriginalOutput(
+          pool_params, orig_output_mkl_shape, output_dims_mkl_order);
+
+      memory::desc target_diff_dst_md = this->ConfigureInputGradient(
+          grad_mkl_shape, grad_tensor, &grad_dnn_data, original_output_md);
+
+      output_dnn_data.SetUsrMem(original_input_md);
+
+      // Create the forward pooling primitive descriptor so we can
+      // pass it as a hint to the backward pooling primitive descriptor
+      auto pool_fwd_desc = pooling_forward::desc(
+          prop_kind::forward, algorithm::pooling_max, original_input_md,
+          original_output_md,
+          memory::dims({pool_params.row_stride, pool_params.col_stride}),
+          memory::dims({pool_params.window_rows, pool_params.window_cols}),
+          memory::dims({static_cast<int>(pool_params.pad_top),
+                        static_cast<int>(pool_params.pad_left)}),
+          memory::dims({static_cast<int>(pool_params.pad_bottom),
+                        static_cast<int>(pool_params.pad_right)}),
+          TFPaddingToMklDnnPadding(this->padding_));
+      auto pool_fwd_prim_desc =
+          pooling_forward::primitive_desc(pool_fwd_desc, cpu_engine);
+
+      auto pool_bkwd_desc = pooling_backward::desc(
+          algorithm::pooling_max, output_dnn_data.GetUsrMemDesc(),
+          target_diff_dst_md,
+          memory::dims({pool_params.row_stride, pool_params.col_stride}),
+          memory::dims({pool_params.window_rows, pool_params.window_cols}),
+          memory::dims({static_cast<int>(pool_params.pad_top),
+                        static_cast<int>(pool_params.pad_left)}),
+          memory::dims({static_cast<int>(pool_params.pad_bottom),
+                        static_cast<int>(pool_params.pad_right)}),
+          TFPaddingToMklDnnPadding(this->padding_));
+      auto pool_bkwd_prim_desc = pooling_backward::primitive_desc(
+          pool_bkwd_desc, cpu_engine, pool_fwd_prim_desc);
+
+      this->AllocateOutputTensor(context, pool_bkwd_prim_desc,
+                                 orig_input_dims_mkl_order,
+                                 this->data_format_mkldnn_, &output_tensor);
+      output_dnn_data.SetUsrMemDataHandle(output_tensor);
+
+      ConfigureWorkspace(workspace_tensor,
+                         pool_fwd_prim_desc.workspace_primitive_desc(),
+                         &workspace_dnn_data);
+      this->PrepareAndExecuteNet(
+          pool_bkwd_prim_desc, &grad_dnn_data, &output_dnn_data,
+          memory::primitive_desc(target_diff_dst_md, cpu_engine),
+          &workspace_dnn_data);
+    } catch (mkldnn::error& e) {
+      string error_msg = "Status: " + std::to_string(e.status) +
+                         ", message: " + string(e.message) + ", in file " +
+                         string(__FILE__) + ":" + std::to_string(__LINE__);
+      OP_REQUIRES_OK(context, errors::Aborted("Compute received an exception:",
+                                              error_msg));
     }
   }  // Compute
 
  private:
-    // .Input("orig_input: T")
-    // .Input("orig_output: T")
-    // .Input("grad: T")
-    // .Input("workspace: T")
-    const int kInputTensorIndexOrigInput = 0;
-    const int kInputTensorIndexOrigOutput = 1;
-    const int kInputTensorIndexGradient = 2;
-    const int kInputTensorIndexWorkspace = 3;
-    //  Output("output: T") in Base Class
-
-    memory::desc ConfigureOriginalInput(OpKernelContext* context,
-                                const Tensor& tensor_original_input,
-                                const MklDnnShape& original_input_mkl_shape,
-                                memory::dims* original_input_dims_mkl_order,
-                                MklPoolParameters* pool_params,
-                                TensorShape* input_tensor_shape) {
-        *input_tensor_shape = tensor_original_input.shape();
-        return MklPoolingBackwardOpBase<T>::ConfigureOriginalInput(
-                                        context,
-                                        tensor_original_input,
-                                        original_input_mkl_shape,
-                                        original_input_dims_mkl_order,
-                                        pool_params,
-                                        *input_tensor_shape);
-    }
+  // .Input("orig_input: T")
+  // .Input("orig_output: T")
+  // .Input("grad: T")
+  // .Input("workspace: T")
+  const int kInputTensorIndexOrigInput = 0;
+  const int kInputTensorIndexOrigOutput = 1;
+  const int kInputTensorIndexGradient = 2;
+  const int kInputTensorIndexWorkspace = 3;
+  //  Output("output: T") in Base Class
+
+  memory::desc ConfigureOriginalInput(
+      OpKernelContext* context, const Tensor& tensor_original_input,
+      const MklDnnShape& original_input_mkl_shape,
+      memory::dims* original_input_dims_mkl_order,
+      MklPoolParameters* pool_params, TensorShape* input_tensor_shape) {
+    *input_tensor_shape = tensor_original_input.shape();
+    return MklPoolingBackwardOpBase<T>::ConfigureOriginalInput(
+        context, tensor_original_input, original_input_mkl_shape,
+        original_input_dims_mkl_order, pool_params, *input_tensor_shape);
+  }
 
-    void ConfigureWorkspace(const Tensor& workspace_tensor,
-                        memory::primitive_desc workspace_pd,
-                        MklDnnData<uint8> *workspace_dnn_data) {
-        CHECK_NOTNULL(workspace_dnn_data);
+  void ConfigureWorkspace(const Tensor& workspace_tensor,
+                          memory::primitive_desc workspace_pd,
+                          MklDnnData<uint8>* workspace_dnn_data) {
+    CHECK_NOTNULL(workspace_dnn_data);
 
-        workspace_dnn_data->SetUsrMem(workspace_pd, &workspace_tensor);
-    }
+    workspace_dnn_data->SetUsrMem(workspace_pd, &workspace_tensor);
+  }
 
-    void SanityCheckInputs(OpKernelContext* context,
-                            const Tensor& orig_input_tensor,
-                            const Tensor& orig_output_tensor,
-                            const Tensor& grad_tensor,
-                            const Tensor& workspace_tensor,
-                            const MklDnnShape& orig_input_mkl_shape,
-                            const MklDnnShape& orig_output_mkl_shape,
-                            const MklDnnShape& grad_mkl_shape,
-                            const MklDnnShape& workspace_mkl_shape) {
-        if (!orig_input_mkl_shape.IsMklTensor()) {
-            OP_REQUIRES(context, orig_input_tensor.dims() == 4,
-                errors::InvalidArgument("Original input shape must be "
-                "4-dimensional"));
-        } else {
-            OP_REQUIRES(context, orig_input_mkl_shape.GetDimension() == 4,
-                    errors::InvalidArgument("Original input shape must be "
-                    "4-dimensional"));
-        }
-        if (!orig_output_mkl_shape.IsMklTensor()) {
-            OP_REQUIRES(context, orig_output_tensor.dims() == 4,
-                errors::InvalidArgument("Original output must be "
-                        "4-dimensional"));
-        } else {
-            OP_REQUIRES(context, orig_output_mkl_shape.GetDimension() == 4,
-                    errors::InvalidArgument("Original output must be "
-                    "4-dimensional"));
-        }
-        if (!grad_mkl_shape.IsMklTensor()) {
-            OP_REQUIRES(context, grad_tensor.dims() == 4,
-                errors::InvalidArgument("Gradient must be 4-dimensional"));
-        } else {
-            OP_REQUIRES(context, grad_mkl_shape.GetDimension() == 4,
-                    errors::InvalidArgument("Gradient must be "
-                    "4-dimensional"));
-        }
-        if (this->workspace_enabled_) {
-            // The workspace should not be an MKL tensor
-            OP_REQUIRES(context, workspace_mkl_shape.IsMklTensor() == false,
-                    errors::InvalidArgument("Workspace tensor should not"
-                                            " be an MKL Tensor."));
-            // It should only have one dimension
-            OP_REQUIRES(context, workspace_tensor.dims() == 1,
-                    errors::InvalidArgument("Workspace tensor must be "
-                                "1-dimensional"));
-        } else {
-            OP_REQUIRES(context, this->workspace_enabled_,
-                    errors::Unimplemented("MKL-DNN Max Pooling does not "
+  void SanityCheckInputs(OpKernelContext* context,
+                         const Tensor& orig_input_tensor,
+                         const Tensor& orig_output_tensor,
+                         const Tensor& grad_tensor,
+                         const Tensor& workspace_tensor,
+                         const MklDnnShape& orig_input_mkl_shape,
+                         const MklDnnShape& orig_output_mkl_shape,
+                         const MklDnnShape& grad_mkl_shape,
+                         const MklDnnShape& workspace_mkl_shape) {
+    if (!orig_input_mkl_shape.IsMklTensor()) {
+      OP_REQUIRES(context, orig_input_tensor.dims() == 4,
+                  errors::InvalidArgument("Original input shape must be "
+                                          "4-dimensional"));
+    } else {
+      OP_REQUIRES(context, orig_input_mkl_shape.GetDimension() == 4,
+                  errors::InvalidArgument("Original input shape must be "
+                                          "4-dimensional"));
+    }
+    if (!orig_output_mkl_shape.IsMklTensor()) {
+      OP_REQUIRES(context, orig_output_tensor.dims() == 4,
+                  errors::InvalidArgument("Original output must be "
+                                          "4-dimensional"));
+    } else {
+      OP_REQUIRES(context, orig_output_mkl_shape.GetDimension() == 4,
+                  errors::InvalidArgument("Original output must be "
+                                          "4-dimensional"));
+    }
+    if (!grad_mkl_shape.IsMklTensor()) {
+      OP_REQUIRES(context, grad_tensor.dims() == 4,
+                  errors::InvalidArgument("Gradient must be 4-dimensional"));
+    } else {
+      OP_REQUIRES(context, grad_mkl_shape.GetDimension() == 4,
+                  errors::InvalidArgument("Gradient must be "
+                                          "4-dimensional"));
+    }
+    if (this->workspace_enabled_) {
+      // The workspace should not be an MKL tensor
+      OP_REQUIRES(context, workspace_mkl_shape.IsMklTensor() == false,
+                  errors::InvalidArgument("Workspace tensor should not"
+                                          " be an MKL Tensor."));
+      // It should only have one dimension
+      OP_REQUIRES(context, workspace_tensor.dims() == 1,
+                  errors::InvalidArgument("Workspace tensor must be "
+                                          "1-dimensional"));
+    } else {
+      OP_REQUIRES(
+          context, this->workspace_enabled_,
+          errors::Unimplemented("MKL-DNN Max Pooling does not "
                                 "yet support the use case "
                                 "where MaxPoolGrad is called without first"
                                 " calling MaxPool."));
-        }
     }
+  }
 };  // MklMaxPoolingGradOp
 
-#endif  // INTEL_MKL_DNN
+#endif  // INTEL_MKL_ML
 
 REGISTER_KERNEL_BUILDER(Name("_MklMaxPool")
                             .Device(DEVICE_CPU)
diff --git a/tensorflow/core/kernels/mkl_pooling_ops_common.cc b/tensorflow/core/kernels/mkl_pooling_ops_common.cc
index f7cadffd39c11bdedaca6a07e48f222e7ac5e0cb..5ef6ce2a5789034b338fe7308a6eca02f135befa 100644
--- a/tensorflow/core/kernels/mkl_pooling_ops_common.cc
+++ b/tensorflow/core/kernels/mkl_pooling_ops_common.cc
@@ -15,9 +15,9 @@ limitations under the License.
 
 #ifdef INTEL_MKL
 
-#include <vector>
-#include <limits>
 #include "tensorflow/core/kernels/mkl_pooling_ops_common.h"
+#include <limits>
+#include <vector>
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/framework/common_shape_fns.h"
 #include "tensorflow/core/kernels/bounds_check.h"
@@ -42,7 +42,7 @@ void MklPoolParameters::Init(OpKernelContext* context,
   Init(context, ksize, stride, padding, data_format);
 }
 
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
 // Initialization for MKL format
 void MklPoolParameters::Init(OpKernelContext* context,
                              const std::vector<int32>& ksize,
@@ -72,7 +72,7 @@ void MklPoolParameters::Init(OpKernelContext* context,
 
   Init(context, ksize, stride, padding, data_format);
 }
-#endif  // INTEL_MKL_DNN
+#endif  // INTEL_MKL_ML
 // Common Initialization for TensorFlow and MKL formats
 void MklPoolParameters::Init(OpKernelContext* context,
                              const std::vector<int32>& ksize,
@@ -107,21 +107,21 @@ void MklPoolParameters::Init(OpKernelContext* context,
     OP_REQUIRES_OK(context, GetWindowedOutputSizeVerbose(
                                 tensor_in_cols, window_cols, col_stride,
                                 padding, &out_width, &pad_left, &pad_right));
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
     // TF can work with int64, but mkldnn only supports int32
     // Fail if the height or width are greater than MAX_INT
 
-    OP_REQUIRES(context, FastBoundsCheck(out_height,
-                                         std::numeric_limits<int>::max()),
+    OP_REQUIRES(context,
+                FastBoundsCheck(out_height, std::numeric_limits<int>::max()),
                 errors::InvalidArgument("output height is too large"));
 
-    OP_REQUIRES(context, FastBoundsCheck(out_width,
-                                         std::numeric_limits<int>::max()),
+    OP_REQUIRES(context,
+                FastBoundsCheck(out_width, std::numeric_limits<int>::max()),
                 errors::InvalidArgument("output width is too large"));
 
 #endif
     out_depth = depth;  // output will have the same depth as the input
-  } else {  // we are pooling in the depth dimension
+  } else {              // we are pooling in the depth dimension
     // Our current version of depthwise max pooling does not support
     // any padding, and expects the depth_window to equal the depth
     // stride (no overlapping).
diff --git a/tensorflow/core/kernels/mkl_pooling_ops_common.h b/tensorflow/core/kernels/mkl_pooling_ops_common.h
index b974b2c59afe91b955af45f3851c7371d8a86610..279167aba24863441774b0665e9793e52d84ccfa 100644
--- a/tensorflow/core/kernels/mkl_pooling_ops_common.h
+++ b/tensorflow/core/kernels/mkl_pooling_ops_common.h
@@ -17,16 +17,16 @@ limitations under the License.
 #define TENSORFLOW_CORE_KERNELS_MKL_POOLING_OPS_COMMON_H_
 
 #ifdef INTEL_MKL
-#include <vector>
 #include <string>
+#include <vector>
 #include "tensorflow/core/util/mkl_util.h"
 #include "tensorflow/core/util/padding.h"
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 #include "mkldnn.hpp"
 using mkldnn::memory;
-using mkldnn::pooling_forward;
 using mkldnn::pooling_backward;
+using mkldnn::pooling_forward;
 using mkldnn::stream;
 #endif
 
@@ -61,19 +61,31 @@ struct MklPoolParameters {
 
   TensorFormat data_format;
   MklPoolParameters()
-    : depth(0)
-    , tensor_in_cols(0), tensor_in_rows(0), tensor_in_batch(0)
-    , window_rows(0), window_cols(0), depth_window(0)
-    , row_stride(0), col_stride(0), depth_stride(0)
-    , out_height(0), out_width(0), out_depth(0)
-    , pad_left(0), pad_right(0), pad_top(0), pad_bottom(0), pad_depth(0)
-    , data_format(TensorFormat::FORMAT_NCHW) {}
+      : depth(0),
+        tensor_in_cols(0),
+        tensor_in_rows(0),
+        tensor_in_batch(0),
+        window_rows(0),
+        window_cols(0),
+        depth_window(0),
+        row_stride(0),
+        col_stride(0),
+        depth_stride(0),
+        out_height(0),
+        out_width(0),
+        out_depth(0),
+        pad_left(0),
+        pad_right(0),
+        pad_top(0),
+        pad_bottom(0),
+        pad_depth(0),
+        data_format(TensorFormat::FORMAT_NCHW) {}
 
   // Updates context->status if there is an invalid input.
   void Init(OpKernelContext* context, const std::vector<int32>& ksize,
             const std::vector<int32>& stride, Padding padding,
             TensorFormat data_format, const TensorShape& tensor_in_shape);
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
   void Init(OpKernelContext* context, const std::vector<int32>& ksize,
             const std::vector<int32>& stride, Padding padding,
             TensorFormat data_format, const MklShape* mkl_in_shape);
@@ -90,39 +102,37 @@ struct MklPoolParameters {
             TensorFormat data_format);
 };
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 
 template <class T>
 class MklPoolingOpBase : public OpKernel {
  public:
   explicit MklPoolingOpBase(OpKernelConstruction* context)
-            : OpKernel(context)
-            , workspace_enabled_(false) {
-      string data_format;
-      OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
-      OP_REQUIRES(context,
-            FormatFromString(data_format, &this->data_format_tf_),
-            errors::InvalidArgument("Invalid data format"));
-      this->data_format_mkldnn_
-                = TFDataFormatToMklDnnDataFormat(this->data_format_tf_);
-      OP_REQUIRES_OK(context, context->GetAttr("ksize", &this->ksize_));
-      OP_REQUIRES(context, this->ksize_.size() == 4,
-                  errors::InvalidArgument("Sliding window ksize field must "
-                                          "specify 4 dimensions"));
-      OP_REQUIRES_OK(context, context->GetAttr("strides", &this->stride_));
-      OP_REQUIRES(context, this->stride_.size() == 4,
-                  errors::InvalidArgument("Sliding window strides field must "
-                                          "specify 4 dimensions"));
-      OP_REQUIRES_OK(context, context->GetAttr("padding", &this->padding_));
-      OP_REQUIRES(context, this->ksize_[0] == 1 && this->stride_[0] == 1,
-                  errors::Unimplemented("Pooling is not yet supported on the "
-                                        "batch dimension."));
-
-      // We may not get this attribute for this node if it does not go through
-      // graph rewrite pass. So we do not check for error while retrieving this
-      // attribute value.
-      context->GetAttr("workspace_enabled", &this->workspace_enabled_);
-    }
+      : OpKernel(context), workspace_enabled_(false) {
+    string data_format;
+    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
+    OP_REQUIRES(context, FormatFromString(data_format, &this->data_format_tf_),
+                errors::InvalidArgument("Invalid data format"));
+    this->data_format_mkldnn_ =
+        TFDataFormatToMklDnnDataFormat(this->data_format_tf_);
+    OP_REQUIRES_OK(context, context->GetAttr("ksize", &this->ksize_));
+    OP_REQUIRES(context, this->ksize_.size() == 4,
+                errors::InvalidArgument("Sliding window ksize field must "
+                                        "specify 4 dimensions"));
+    OP_REQUIRES_OK(context, context->GetAttr("strides", &this->stride_));
+    OP_REQUIRES(context, this->stride_.size() == 4,
+                errors::InvalidArgument("Sliding window strides field must "
+                                        "specify 4 dimensions"));
+    OP_REQUIRES_OK(context, context->GetAttr("padding", &this->padding_));
+    OP_REQUIRES(context, this->ksize_[0] == 1 && this->stride_[0] == 1,
+                errors::Unimplemented("Pooling is not yet supported on the "
+                                      "batch dimension."));
+
+    // We may not get this attribute for this node if it does not go through
+    // graph rewrite pass. So we do not check for error while retrieving this
+    // attribute value.
+    context->GetAttr("workspace_enabled", &this->workspace_enabled_);
+  }
   void Compute(OpKernelContext* context) override = 0;
 
  protected:
@@ -132,24 +142,24 @@ class MklPoolingOpBase : public OpKernel {
   // output height and output width to have already been int32
   // bounds-checked
   void GetOutputDims(const MklPoolParameters& mkl_pool_params,
-                    memory::dims* output_dims_mkl_order) {
+                     memory::dims* output_dims_mkl_order) {
     // MKL-DNN always needs output in NCHW format.
-    *output_dims_mkl_order = { mkl_pool_params.tensor_in_batch,
+    *output_dims_mkl_order = {mkl_pool_params.tensor_in_batch,
                               mkl_pool_params.out_depth,
                               static_cast<int>(mkl_pool_params.out_height),
                               static_cast<int>(mkl_pool_params.out_width)};
   }
 
   void InitMklPoolParameters(OpKernelContext* context,
-                      MklPoolParameters* pool_params,
-                      const MklDnnShape& original_input_mkl_shape,
-                      const TensorShape& input_tensor_shape) {
+                             MklPoolParameters* pool_params,
+                             const MklDnnShape& original_input_mkl_shape,
+                             const TensorShape& input_tensor_shape) {
     if (!original_input_mkl_shape.IsMklTensor()) {
       pool_params->Init(context, this->ksize_, this->stride_, this->padding_,
-          this->data_format_tf_, input_tensor_shape);
+                        this->data_format_tf_, input_tensor_shape);
     } else {
       pool_params->Init(context, this->ksize_, this->stride_, this->padding_,
-          this->data_format_tf_, &original_input_mkl_shape);
+                        this->data_format_tf_, &original_input_mkl_shape);
     }
   }
 
@@ -159,13 +169,12 @@ class MklPoolingOpBase : public OpKernel {
   size_t GetNumTElements(const memory::primitive_desc& pd) {
     size_t num_bytes = pd.get_size();
     size_t ret_val = num_bytes / sizeof(T);
-    if ( num_bytes % sizeof(T) != 0 ) {
-        ret_val++;
+    if (num_bytes % sizeof(T) != 0) {
+      ret_val++;
     }
     return ret_val;
   }
 
-
   std::vector<int32> ksize_;
   std::vector<int32> stride_;
   Padding padding_;
@@ -183,30 +192,29 @@ class MklPoolingForwardOpBase : public MklPoolingOpBase<T> {
 
  protected:
   void ConfigureInput(OpKernelContext* context,
-                    const MklDnnShape& input_mkl_shape,
-                    const Tensor& input_tensor,
-                    MklPoolParameters* pool_params,
-                    MklDnnData<T>* dnn_data_input) {
+                      const MklDnnShape& input_mkl_shape,
+                      const Tensor& input_tensor,
+                      MklPoolParameters* pool_params,
+                      MklDnnData<T>* dnn_data_input) {
     CHECK_NOTNULL(pool_params);
     CHECK_NOTNULL(dnn_data_input);
     TensorShape input_tensor_shape = input_tensor.shape();
-    memory::desc input_md = input_mkl_shape.IsMklTensor()
-                        ? input_mkl_shape.GetMklLayout()
-                        : memory::desc(
-                              TFShapeToMklDnnDimsInNCHW(
-                                  input_tensor_shape, this->data_format_tf_),
-                              MklDnnType<T>(),
-                              this->data_format_mkldnn_);
+    memory::desc input_md =
+        input_mkl_shape.IsMklTensor()
+            ? input_mkl_shape.GetMklLayout()
+            : memory::desc(TFShapeToMklDnnDimsInNCHW(input_tensor_shape,
+                                                     this->data_format_tf_),
+                           MklDnnType<T>(), this->data_format_mkldnn_);
     dnn_data_input->SetUsrMem(input_md, &input_tensor);
-    this->InitMklPoolParameters(context, pool_params,
-                      input_mkl_shape, input_tensor_shape);
+    this->InitMklPoolParameters(context, pool_params, input_mkl_shape,
+                                input_tensor_shape);
   }
 
-  void AllocateOutputTensor(OpKernelContext* context,
-            const pooling_forward::primitive_desc& pool_fwd_prim_desc,
-            const memory::dims output_dims_mkl_order,
-            const memory::format& output_tf_format,
-            Tensor** output_tensor) {
+  void AllocateOutputTensor(
+      OpKernelContext* context,
+      const pooling_forward::primitive_desc& pool_fwd_prim_desc,
+      const memory::dims output_dims_mkl_order,
+      const memory::format& output_tf_format, Tensor** output_tensor) {
     CHECK_NOTNULL(output_tensor);
     memory::primitive_desc dst_pd = pool_fwd_prim_desc.dst_primitive_desc();
 
@@ -215,50 +223,42 @@ class MklPoolingForwardOpBase : public MklPoolingOpBase<T> {
     output_mkl_shape.SetMklLayout(&dst_pd);
     output_mkl_shape.SetElemType(MklDnnType<T>());
     output_mkl_shape.SetTfLayout(output_dims_mkl_order.size(),
-                              output_dims_mkl_order,
-                              output_tf_format);
+                                 output_dims_mkl_order, output_tf_format);
     TensorShape output_tf_shape;
 
     // only allocate enough space for the elements we need.
     output_tf_shape.AddDim(this->GetNumTElements(dst_pd));
-    AllocateOutputSetMklShape(context, kOutputTensorIndexOutput,
-                            output_tensor,
-                            output_tf_shape, output_mkl_shape);
+    AllocateOutputSetMklShape(context, kOutputTensorIndexOutput, output_tensor,
+                              output_tf_shape, output_mkl_shape);
     CHECK_NOTNULL(*output_tensor);
   }
 
   void PrepareAndExecuteNet(
-                  const pooling_forward::primitive_desc& pool_fwd_desc,
-                  const MklDnnData<T>* src,
-                  MklDnnData<T>* dst,
-                  MklDnnData<uint8>* wksp = nullptr) {
+      const pooling_forward::primitive_desc& pool_fwd_desc,
+      const MklDnnData<T>* src, MklDnnData<T>* dst,
+      MklDnnData<uint8>* wksp = nullptr) {
     std::vector<primitive> net;
 
     // Create pooling primitive and add it to net
     if (wksp != nullptr) {
-        net.push_back(pooling_forward(pool_fwd_desc,
-                        src->GetOpMem(),
-                        dst->GetOpMem(),
-                        wksp->GetOpMem()));
+      net.push_back(pooling_forward(pool_fwd_desc, src->GetOpMem(),
+                                    dst->GetOpMem(), wksp->GetOpMem()));
     } else {
-        net.push_back(pooling_forward(pool_fwd_desc,
-            src->GetOpMem(),
-            dst->GetOpMem()));
+      net.push_back(
+          pooling_forward(pool_fwd_desc, src->GetOpMem(), dst->GetOpMem()));
     }
     stream(stream::kind::eager).submit(net).wait();
   }
 
-
-  void SanityCheckInput(OpKernelContext* context,
-                  const Tensor& input_tensor,
-                  const MklDnnShape& input_mkl_shape) {
+  void SanityCheckInput(OpKernelContext* context, const Tensor& input_tensor,
+                        const MklDnnShape& input_mkl_shape) {
     if (!input_mkl_shape.IsMklTensor()) {
       OP_REQUIRES(context, input_tensor.dims() == 4,
-          errors::InvalidArgument("Input must be 4-dimensional"));
+                  errors::InvalidArgument("Input must be 4-dimensional"));
     } else {
-        OP_REQUIRES(context, input_mkl_shape.GetDimension() == 4,
-                errors::InvalidArgument("Input shape must be "
-                "4-dimensional"));
+      OP_REQUIRES(context, input_mkl_shape.GetDimension() == 4,
+                  errors::InvalidArgument("Input shape must be "
+                                          "4-dimensional"));
     }
   }
   // .Input("value: T")
@@ -267,66 +267,58 @@ class MklPoolingForwardOpBase : public MklPoolingOpBase<T> {
   const int kOutputTensorIndexOutput = 0;
 };  // MklPoolingForwardBaseOp
 
-
 template <class T>
 class MklPoolingBackwardOpBase : public MklPoolingOpBase<T> {
  public:
   explicit MklPoolingBackwardOpBase<T>(OpKernelConstruction* context)
-          : MklPoolingOpBase<T>(context) { }
+      : MklPoolingOpBase<T>(context) {}
   void Compute(OpKernelContext* context) override = 0;
 
  protected:
   const int kOutputTensorIndexOutput = 0;
 
-  void AllocateOutputTensor(OpKernelContext* context,
-            const pooling_backward::primitive_desc& pool_bkwd_prim_desc,
-            const memory::dims output_dims_mkl_order,
-            const memory::format& output_tf_format,
-            Tensor** output_tensor) {
+  void AllocateOutputTensor(
+      OpKernelContext* context,
+      const pooling_backward::primitive_desc& pool_bkwd_prim_desc,
+      const memory::dims output_dims_mkl_order,
+      const memory::format& output_tf_format, Tensor** output_tensor) {
     CHECK_NOTNULL(output_tensor);
-    memory::primitive_desc dst_pd
-                = pool_bkwd_prim_desc.diff_src_primitive_desc();
+    memory::primitive_desc dst_pd =
+        pool_bkwd_prim_desc.diff_src_primitive_desc();
     MklDnnShape output_mkl_shape;
     output_mkl_shape.SetMklTensor(true);
     output_mkl_shape.SetMklLayout(&dst_pd);
     output_mkl_shape.SetElemType(MklDnnType<T>());
     output_mkl_shape.SetTfLayout(output_dims_mkl_order.size(),
-                              output_dims_mkl_order,
-                              output_tf_format);
+                                 output_dims_mkl_order, output_tf_format);
 
     TensorShape output_tf_shape;
     output_tf_shape.AddDim(this->GetNumTElements(dst_pd));
-    AllocateOutputSetMklShape(context, kOutputTensorIndexOutput,
-                            output_tensor,
-                            output_tf_shape, output_mkl_shape);
+    AllocateOutputSetMklShape(context, kOutputTensorIndexOutput, output_tensor,
+                              output_tf_shape, output_mkl_shape);
     CHECK_NOTNULL(*output_tensor);
   }
 
   void PrepareAndExecuteNet(
-    const pooling_backward::primitive_desc& pool_bkwd_desc,
-    MklDnnData<T>* input_gradient_diff_dst,
-    MklDnnData<T>* output_diff_src,
-    const memory::primitive_desc& target_diff_dst_pd,
-    const MklDnnData<uint8>* workspace = nullptr) {
-
+      const pooling_backward::primitive_desc& pool_bkwd_desc,
+      MklDnnData<T>* input_gradient_diff_dst, MklDnnData<T>* output_diff_src,
+      const memory::primitive_desc& target_diff_dst_pd,
+      const MklDnnData<uint8>* workspace = nullptr) {
     std::vector<primitive> net;
 
     // If the input gradient isn't in the same format as the output
     // reorder it to the same format as the output
-    input_gradient_diff_dst->CheckReorderToOpMem(
-            target_diff_dst_pd,
-            &net);
+    input_gradient_diff_dst->CheckReorderToOpMem(target_diff_dst_pd, &net);
 
     // Create pooling primitive and add it to net
     if (nullptr == workspace) {
       net.push_back(pooling_backward(pool_bkwd_desc,
-                              input_gradient_diff_dst->GetOpMem(),
-                              output_diff_src->GetOpMem()));
+                                     input_gradient_diff_dst->GetOpMem(),
+                                     output_diff_src->GetOpMem()));
     } else {
-      net.push_back(pooling_backward(pool_bkwd_desc,
-                                  input_gradient_diff_dst->GetOpMem(),
-                                  workspace->GetOpMem(),
-                                  output_diff_src->GetOpMem()));
+      net.push_back(
+          pooling_backward(pool_bkwd_desc, input_gradient_diff_dst->GetOpMem(),
+                           workspace->GetOpMem(), output_diff_src->GetOpMem()));
     }
     stream(stream::kind::eager).submit(net).wait();
   }
@@ -334,80 +326,76 @@ class MklPoolingBackwardOpBase : public MklPoolingOpBase<T> {
   // Max Pooling and Avg Pooling have slightly different implementations
   // Takes the Tensor containing original input data and the original
   // mkl Dnn Shape and populates other data
-  memory::desc ConfigureOriginalInput(OpKernelContext* context,
-                              const Tensor& tensor_original_input_shape,
-                              const MklDnnShape& original_input_mkl_shape,
-                              memory::dims* original_input_dims_nchw,
-                              MklPoolParameters* pool_params,
-                              const TensorShape& input_tensor_shape) {
+  memory::desc ConfigureOriginalInput(
+      OpKernelContext* context, const Tensor& tensor_original_input_shape,
+      const MklDnnShape& original_input_mkl_shape,
+      memory::dims* original_input_dims_nchw, MklPoolParameters* pool_params,
+      const TensorShape& input_tensor_shape) {
     CHECK_NOTNULL(original_input_dims_nchw);
     CHECK_NOTNULL(pool_params);
-    this->InitMklPoolParameters(context, pool_params,
-                          original_input_mkl_shape,
-                          input_tensor_shape);
-
-    *original_input_dims_nchw
-          = original_input_mkl_shape.IsMklTensor()
-          ? original_input_mkl_shape.GetSizesAsMklDnnDims()
-          : TFShapeToMklDnnDimsInNCHW(input_tensor_shape,
-        this->data_format_tf_);
-
-    return  original_input_mkl_shape.IsMklTensor()
-      ? original_input_mkl_shape.GetMklLayout()
-      : memory::desc(*original_input_dims_nchw,
-                      MklDnnType<T>(),
-                      this->data_format_mkldnn_);
+    this->InitMklPoolParameters(context, pool_params, original_input_mkl_shape,
+                                input_tensor_shape);
+
+    *original_input_dims_nchw =
+        original_input_mkl_shape.IsMklTensor()
+            ? original_input_mkl_shape.GetSizesAsMklDnnDims()
+            : TFShapeToMklDnnDimsInNCHW(input_tensor_shape,
+                                        this->data_format_tf_);
+
+    return original_input_mkl_shape.IsMklTensor()
+               ? original_input_mkl_shape.GetMklLayout()
+               : memory::desc(*original_input_dims_nchw, MklDnnType<T>(),
+                              this->data_format_mkldnn_);
   }
 
-  memory::desc ConfigureOriginalOutput(const MklPoolParameters& pool_params,
-                                const MklDnnShape& original_output_mkl_shape,
-                                      memory::dims output_dims_mkl_order) {
+  memory::desc ConfigureOriginalOutput(
+      const MklPoolParameters& pool_params,
+      const MklDnnShape& original_output_mkl_shape,
+      memory::dims output_dims_mkl_order) {
     this->GetOutputDims(pool_params, &output_dims_mkl_order);
 
     return original_output_mkl_shape.IsMklTensor()
-            ? original_output_mkl_shape.GetMklLayout()
-            : memory::desc(output_dims_mkl_order,
-                         MklDnnType<T>(),
-                         this->data_format_mkldnn_);
+               ? original_output_mkl_shape.GetMklLayout()
+               : memory::desc(output_dims_mkl_order, MklDnnType<T>(),
+                              this->data_format_mkldnn_);
   }
 
   memory::desc ConfigureInputGradient(
-        const MklDnnShape& input_gradient_mkl_shape,
-        const Tensor& input_gradient_tensor,
-        MklDnnData<T>* input_gradient_dnn_data,
-        const memory::desc& original_output_md) {
+      const MklDnnShape& input_gradient_mkl_shape,
+      const Tensor& input_gradient_tensor,
+      MklDnnData<T>* input_gradient_dnn_data,
+      const memory::desc& original_output_md) {
     // Configure the gradient as is
-    memory::desc original_input_grad_md
-          = input_gradient_mkl_shape.IsMklTensor()
-          ? input_gradient_mkl_shape.GetMklLayout()
-          : memory::desc(TFShapeToMklDnnDimsInNCHW(
-                    input_gradient_tensor.shape(),
-                    this->data_format_tf_),
-                    MklDnnType<T>(), this->data_format_mkldnn_);
+    memory::desc original_input_grad_md =
+        input_gradient_mkl_shape.IsMklTensor()
+            ? input_gradient_mkl_shape.GetMklLayout()
+            : memory::desc(
+                  TFShapeToMklDnnDimsInNCHW(input_gradient_tensor.shape(),
+                                            this->data_format_tf_),
+                  MklDnnType<T>(), this->data_format_mkldnn_);
 
     input_gradient_dnn_data->SetUsrMem(original_input_grad_md,
-                &input_gradient_tensor);
+                                       &input_gradient_tensor);
 
     // Check to see if input grad diff dst is in the right format
     // Create a new memory descriptor with the same shape as the
     // original, but the format of the other tensors.
     memory::format original_output_format =
-            static_cast<memory::format>(original_output_md.data.format);
-    bool grad_reorder_needed = input_gradient_dnn_data->IsReorderNeeded(
-                                    original_output_format);
-    memory::dims diff_dst_dims = input_gradient_mkl_shape.IsMklTensor()
-        ? input_gradient_mkl_shape.GetSizesAsMklDnnDims()
-        : TFShapeToMklDnnDimsInNCHW(input_gradient_tensor.shape(),
-                    this->data_format_tf_);
-    memory::desc target_diff_dst_md = memory::desc(diff_dst_dims,
-        MklDnnType<T>(), original_output_format);
-
-    return grad_reorder_needed
-            ? target_diff_dst_md
-            : original_input_grad_md;
+        static_cast<memory::format>(original_output_md.data.format);
+    bool grad_reorder_needed =
+        input_gradient_dnn_data->IsReorderNeeded(original_output_format);
+    memory::dims diff_dst_dims =
+        input_gradient_mkl_shape.IsMklTensor()
+            ? input_gradient_mkl_shape.GetSizesAsMklDnnDims()
+            : TFShapeToMklDnnDimsInNCHW(input_gradient_tensor.shape(),
+                                        this->data_format_tf_);
+    memory::desc target_diff_dst_md =
+        memory::desc(diff_dst_dims, MklDnnType<T>(), original_output_format);
+
+    return grad_reorder_needed ? target_diff_dst_md : original_input_grad_md;
   }
 };
-#endif  // INTEL_MKL_DNN
+#endif  // INTEL_MKL_ML
 
 //-------------------------------------------------------------------
 // Utility functions
diff --git a/tensorflow/core/kernels/mkl_relu_op.cc b/tensorflow/core/kernels/mkl_relu_op.cc
index dc899d8c7ee231af403e6ca98ca60d94f78d0a81..0be8355afa470249d69c759faa735bafee31fb6d 100644
--- a/tensorflow/core/kernels/mkl_relu_op.cc
+++ b/tensorflow/core/kernels/mkl_relu_op.cc
@@ -16,29 +16,29 @@ limitations under the License.
 // See docs in ../ops/nn_ops.cc.
 #ifdef INTEL_MKL
 
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/lib/core/errors.h"
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
-#include "tensorflow/core/platform/default/logging.h"
-#include "tensorflow/core/util/mkl_util.h"
 #include "mkl_dnn.h"
 #include "mkl_dnn_types.h"
+#include "tensorflow/core/platform/default/logging.h"
+#include "tensorflow/core/util/mkl_util.h"
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 #include "mkldnn.hpp"
 
-using mkldnn::stream;
-using mkldnn::prop_kind;
 using mkldnn::algorithm;
-using mkldnn::relu_forward;
-using mkldnn::relu_backward;
-using mkldnn::eltwise_relu;
 using mkldnn::eltwise_elu;
+using mkldnn::eltwise_relu;
 using mkldnn::eltwise_tanh;
+using mkldnn::prop_kind;
+using mkldnn::relu_backward;
+using mkldnn::relu_forward;
+using mkldnn::stream;
 #endif
 
 namespace tensorflow {
@@ -58,7 +58,7 @@ struct MklReluHelpers {
   }
 };
 
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
 
 template <typename Device, typename T>
 class MklReluOp : public OpKernel {
@@ -180,7 +180,6 @@ class MklReluOp : public OpKernel {
   } MklReluOpContext;
 };
 
-
 template <typename Device, typename T>
 class MklReluGradOp : public OpKernel {
  public:
@@ -214,10 +213,11 @@ class MklReluGradOp : public OpKernel {
       if (!dnnLayoutCompare_F32(lt_input, lt_grad)) {
         AllocTmpBuffer(context, mkl_tmp_input_buf_tensor, lt_grad,
                        &mkl_buffer_convert);
-        CHECK_EQ(dnnConversionCreate_F32(&cv_input_to_grad, lt_input,
-                   lt_grad), E_SUCCESS);
+        CHECK_EQ(dnnConversionCreate_F32(&cv_input_to_grad, lt_input, lt_grad),
+                 E_SUCCESS);
         CHECK_EQ(dnnConversionExecute_F32(cv_input_to_grad, buf_input,
-                                          mkl_buffer_convert), E_SUCCESS);
+                                          mkl_buffer_convert),
+                 E_SUCCESS);
         relu_res[dnnResourceSrc] = mkl_buffer_convert;
         dnnDelete_F32(cv_input_to_grad);
       } else {
@@ -325,7 +325,8 @@ void MklReluGradOp<Device, T>::Compute(OpKernelContext* context) {
   float negative_slope = 0.0;
   CHECK_EQ(dnnReLUCreateBackward_F32(&mkl_context.prim_relu_bwd, NULL,
                                      mkl_context.lt_grad, mkl_context.lt_grad,
-                                     negative_slope), E_SUCCESS);
+                                     negative_slope),
+           E_SUCCESS);
   Tensor mkl_tmp_input_buf_tensor;
   mkl_context.MklPrepareReluGradInputs(context, &mkl_tmp_input_buf_tensor);
 
@@ -348,7 +349,8 @@ void MklReluGradOp<Device, T>::Compute(OpKernelContext* context) {
     }
 
     tf_shape.AddDim(dnnLayoutGetMemorySize_F32(static_cast<dnnLayout_t>(
-                    mkl_context.output_shape.GetMklLayout())) / sizeof(T));
+                        mkl_context.output_shape.GetMklLayout())) /
+                    sizeof(T));
     AllocateOutputSetMklShape(context, 0, &output, tf_shape,
                               mkl_context.output_shape);
   } else {
@@ -361,22 +363,22 @@ void MklReluGradOp<Device, T>::Compute(OpKernelContext* context) {
   mkl_context.relu_res[dnnResourceDiffSrc] =
       static_cast<void*>(output->flat<T>().data());
 
-  CHECK_EQ(dnnExecute_F32(mkl_context.prim_relu_bwd,
-                          mkl_context.relu_res),
-                          E_SUCCESS);
+  CHECK_EQ(dnnExecute_F32(mkl_context.prim_relu_bwd, mkl_context.relu_res),
+           E_SUCCESS);
   mkl_context.MklCleanup();
 }
 
 
-#else  // INTEL_MKL_DNN
+
+#else  // INTEL_MKL_ML
+
 
 template <typename Device, typename T, algorithm alg_kind>
 class MklReluOpBase : public OpKernel {
  public:
   ~MklReluOpBase() {}
 
-  explicit MklReluOpBase(OpKernelConstruction* context) : OpKernel(context) {
-  }
+  explicit MklReluOpBase(OpKernelConstruction* context) : OpKernel(context) {}
 
   virtual void Compute_Scalar(OpKernelContext* context) = 0;
 
@@ -413,12 +415,12 @@ class MklReluOpBase : public OpKernel {
 
       T alpha = 0, beta = 0;
       std::shared_ptr<relu_forward::primitive_desc> relu_fwd_pd;
-      auto relu_fwd_desc = relu_forward::desc(prop_kind::forward_training,
+      auto relu_fwd_desc = relu_forward::desc(
+          prop_kind::forward_training,
           // Operator memory descriptor is same as user memory descriptor.
-                                              alg_kind, src.GetUsrMemDesc(),
-                                              alpha, beta);
-      relu_fwd_pd.reset(new relu_forward::primitive_desc(relu_fwd_desc,
-                                                         cpu_engine));
+          alg_kind, src.GetUsrMemDesc(), alpha, beta);
+      relu_fwd_pd.reset(
+          new relu_forward::primitive_desc(relu_fwd_desc, cpu_engine));
 
       // allocate dst tensor
       MklDnnShape dnn_shape_dst;
@@ -431,7 +433,7 @@ class MklReluOpBase : public OpKernel {
         dnn_shape_dst.SetTfLayout(dnn_shape_src.GetDimension(),
                                   dnn_shape_src.GetSizesAsMklDnnDims(),
                                   dnn_shape_src.GetTfDataFormat());
-        tf_shape_dst.AddDim(dst_pd.get_size()/sizeof(T));
+        tf_shape_dst.AddDim(dst_pd.get_size() / sizeof(T));
       } else {
         dnn_shape_dst.SetMklTensor(false);
         tf_shape_dst = src_tensor.shape();
@@ -445,34 +447,32 @@ class MklReluOpBase : public OpKernel {
 
       // execute net
       std::vector<primitive> net;
-      auto relu_fwd = relu_forward(*relu_fwd_pd, src.GetOpMem(),
-                                   dst.GetOpMem());
+      auto relu_fwd =
+          relu_forward(*relu_fwd_pd, src.GetOpMem(), dst.GetOpMem());
       net.push_back(relu_fwd);
       stream(stream::kind::eager).submit(net).wait();
-    } catch (mkldnn::error &e) {
+    } catch (mkldnn::error& e) {
       string error_msg = "Status: " + std::to_string(e.status) +
-                         ", message: " + string(e.message) +
-                         ", in file " + string(__FILE__) + ":" +
-                         std::to_string(__LINE__);
-      OP_REQUIRES_OK(context,
-                     errors::Aborted("Operation received an exception:",
-                        error_msg));
+                         ", message: " + string(e.message) + ", in file " +
+                         string(__FILE__) + ":" + std::to_string(__LINE__);
+      OP_REQUIRES_OK(
+          context,
+          errors::Aborted("Operation received an exception:", error_msg));
     }
   }
 };
 
-
 template <typename Device, typename T, algorithm alg_kind>
 class MklReluGradOpBase : public OpKernel {
  public:
   ~MklReluGradOpBase() {}
 
-  explicit MklReluGradOpBase(OpKernelConstruction* context) :
-    OpKernel(context) {}
+  explicit MklReluGradOpBase(OpKernelConstruction* context)
+      : OpKernel(context) {}
 
   virtual void Compute_Scalar(OpKernelContext* context) = 0;
 
-  void Compute(OpKernelContext* context)  {
+  void Compute(OpKernelContext* context) {
     try {
       auto cpu_engine = engine(engine::cpu, 0);
       MklDnnData<T> src(&cpu_engine);
@@ -483,9 +483,9 @@ class MklReluGradOpBase : public OpKernel {
       const size_t src_index = 1;       // index of src input tensor
       const size_t diff_src_index = 0;  // index of diff_src output tensor
 
-      const Tensor& src_tensor      = MklGetInput(context, src_index);
+      const Tensor& src_tensor = MklGetInput(context, src_index);
       const Tensor& diff_dst_tensor = MklGetInput(context, diff_dst_index);
-      Tensor* diff_src_tensor       = nullptr;
+      Tensor* diff_src_tensor = nullptr;
 
       MklDnnShape dnn_shape_src, dnn_shape_diff_dst;
       GetMklShape(context, src_index, &dnn_shape_src);
@@ -526,25 +526,25 @@ class MklReluGradOpBase : public OpKernel {
         src_md = dnn_shape_src.GetMklLayout();
 
         memory::format src_mkl_data_format = dnn_shape_src.GetTfDataFormat();
-        auto src_tf_data_format = MklDnnDataFormatToTFDataFormat(
-                                    src_mkl_data_format);
+        auto src_tf_data_format =
+            MklDnnDataFormatToTFDataFormat(src_mkl_data_format);
         auto diff_dst_dims = TFShapeToMklDnnDimsInNCHW(diff_dst_tensor.shape(),
                                                        src_tf_data_format);
-        diff_dst_md = memory::desc(diff_dst_dims, MklDnnType<T>(),
-                                   src_mkl_data_format);
+        diff_dst_md =
+            memory::desc(diff_dst_dims, MklDnnType<T>(), src_mkl_data_format);
       } else if (!dnn_shape_src.IsMklTensor() &&
-                  dnn_shape_diff_dst.IsMklTensor()) {
+                 dnn_shape_diff_dst.IsMklTensor()) {
         // Same comment as above.
         diff_dst_md = dnn_shape_diff_dst.GetMklLayout();
 
         memory::format diff_dst_mkl_data_format =
-          dnn_shape_diff_dst.GetTfDataFormat();
-        auto diff_dst_tf_data_format = MklDnnDataFormatToTFDataFormat(
-                                          diff_dst_mkl_data_format);
+            dnn_shape_diff_dst.GetTfDataFormat();
+        auto diff_dst_tf_data_format =
+            MklDnnDataFormatToTFDataFormat(diff_dst_mkl_data_format);
         auto src_dims = TFShapeToMklDnnDimsInNCHW(src_tensor.shape(),
                                                   diff_dst_tf_data_format);
-        src_md = memory::desc(src_dims, MklDnnType<T>(),
-                              diff_dst_mkl_data_format);
+        src_md =
+            memory::desc(src_dims, MklDnnType<T>(), diff_dst_mkl_data_format);
       } else {
         // If both the inputs are in MKL format, we use Mkl layout of the input
         // tensors.
@@ -572,12 +572,12 @@ class MklReluGradOpBase : public OpKernel {
       std::shared_ptr<relu_forward::primitive_desc> relu_fwd_pd;
       auto relu_fwd_desc = relu_forward::desc(prop_kind::forward_training,
                                               alg_kind, src_md, alpha, beta);
-      relu_fwd_pd.reset(new relu_forward::primitive_desc(relu_fwd_desc,
-                                                         cpu_engine));
-      auto relu_bwd_desc = relu_backward::desc(alg_kind, common_md, common_md,
-                                                alpha, beta);
-      auto relu_bwd_pd  = relu_backward::primitive_desc(relu_bwd_desc,
-                                                cpu_engine, *relu_fwd_pd);
+      relu_fwd_pd.reset(
+          new relu_forward::primitive_desc(relu_fwd_desc, cpu_engine));
+      auto relu_bwd_desc =
+          relu_backward::desc(alg_kind, common_md, common_md, alpha, beta);
+      auto relu_bwd_pd = relu_backward::primitive_desc(
+          relu_bwd_desc, cpu_engine, *relu_fwd_pd);
 
       // allocate diff_src tensor
       MklDnnShape dnn_shape_diff_src;
@@ -590,33 +590,32 @@ class MklReluGradOpBase : public OpKernel {
         dnn_shape_diff_src.SetTfLayout(dnn_shape_src.GetDimension(),
                                        dnn_shape_src.GetSizesAsMklDnnDims(),
                                        dnn_shape_src.GetTfDataFormat());
-        tf_shape_diff_src.AddDim(diff_src_pd.get_size()/sizeof(T));
+        tf_shape_diff_src.AddDim(diff_src_pd.get_size() / sizeof(T));
       } else {
         dnn_shape_diff_src.SetMklTensor(false);
         tf_shape_diff_src = src_tensor.shape();
       }
       AllocateOutputSetMklShape(context, diff_src_index, &diff_src_tensor,
-                                 tf_shape_diff_src, dnn_shape_diff_src);
+                                tf_shape_diff_src, dnn_shape_diff_src);
 
       // diff_src memory descriptor is same as memory descriptor for both
       // inputs.
       diff_src.SetUsrMem(common_md, diff_src_tensor);
 
       PrepareAndExecuteNet(relu_bwd_pd, &src, &diff_src, &diff_dst);
-     } catch (mkldnn::error &e) {
-       string error_msg = "Status: " + std::to_string(e.status) +
-                          ", message: " + string(e.message) +
-                          ", in file " + string(__FILE__) + ":" +
-                          std::to_string(__LINE__);
-       OP_REQUIRES_OK(context,
-                      errors::Aborted("Operation received an exception:",
-                                      error_msg));
+    } catch (mkldnn::error& e) {
+      string error_msg = "Status: " + std::to_string(e.status) +
+                         ", message: " + string(e.message) + ", in file " +
+                         string(__FILE__) + ":" + std::to_string(__LINE__);
+      OP_REQUIRES_OK(
+          context,
+          errors::Aborted("Operation received an exception:", error_msg));
     }
   }
 
   void PrepareAndExecuteNet(const relu_backward::primitive_desc& relu_prim_desc,
-                  MklDnnData<T>* src, MklDnnData<T>* diff_src, MklDnnData<T>*
-                  diff_dst) {
+                            MklDnnData<T>* src, MklDnnData<T>* diff_src,
+                            MklDnnData<T>* diff_dst) {
     std::vector<primitive> net;
 
     // Check if we need to reorder original input tensors into common_md layout
@@ -632,14 +631,13 @@ class MklReluGradOpBase : public OpKernel {
   }
 };
 
-
 template <typename Device, typename T>
 class MklReluOp : public MklReluOpBase<Device, T, eltwise_relu> {
  public:
   ~MklReluOp() {}
 
-  explicit MklReluOp(OpKernelConstruction* context) :
-  MklReluOpBase<Device, T, eltwise_relu>(context) {}
+  explicit MklReluOp(OpKernelConstruction* context)
+      : MklReluOpBase<Device, T, eltwise_relu>(context) {}
 
   virtual void Compute_Scalar(OpKernelContext* context) {
     const size_t src_index = 0;  // index of src input tensor
@@ -649,15 +647,15 @@ class MklReluOp : public MklReluOpBase<Device, T, eltwise_relu> {
     GetMklShape(context, src_index, &dnn_shape_src);
 
     Tensor* dst_tensor = nullptr;
-    void* user_i = static_cast<void*>(const_cast<T*>(
-                         src_tensor.flat<T>().data()));
+    void* user_i =
+        static_cast<void*>(const_cast<T*>(src_tensor.flat<T>().data()));
     MklDnnShape dnn_shape_dst;
     dnn_shape_dst.SetMklTensor(false);
     AllocateOutputSetMklShape(context, dst_index, &dst_tensor,
                               src_tensor.shape(), dnn_shape_dst);
     void* out_o = static_cast<void*>(dst_tensor->flat<T>().data());
     (static_cast<T*>(out_o))[0] =
-              std::max((static_cast<T*>(user_i))[0], static_cast<T>(0));
+        std::max((static_cast<T*>(user_i))[0], static_cast<T>(0));
     return;
   }
 };
@@ -667,14 +665,14 @@ class MklReluGradOp : public MklReluGradOpBase<Device, T, eltwise_relu> {
  public:
   ~MklReluGradOp() {}
 
-  explicit MklReluGradOp(OpKernelConstruction* context) :
-  MklReluGradOpBase<Device, T, eltwise_relu>(context) {}
+  explicit MklReluGradOp(OpKernelConstruction* context)
+      : MklReluGradOpBase<Device, T, eltwise_relu>(context) {}
 
   virtual void Compute_Scalar(OpKernelContext* context) {
     const size_t diff_dst_index = 0;  // index of diff_dst input tensor
     const size_t src_index = 1;       // index of src input tensor
     const size_t diff_src_index = 0;  // index of diff_src output tensor
-    const Tensor& src_tensor    = MklGetInput(context, src_index);
+    const Tensor& src_tensor = MklGetInput(context, src_index);
     const Tensor& diff_dst_tensor = MklGetInput(context, diff_dst_index);
     Tensor* diff_src_tensor = nullptr;
 
@@ -687,11 +685,11 @@ class MklReluGradOp : public MklReluGradOpBase<Device, T, eltwise_relu> {
                               diff_dst_tensor.shape(), dnn_shape_diff_src);
     void* out_o = static_cast<void*>(diff_src_tensor->flat<T>().data());
     void* user_i =
-          static_cast<void*>(const_cast<T*>(src_tensor.flat<T>().data()));
+        static_cast<void*>(const_cast<T*>(src_tensor.flat<T>().data()));
     void* user_g =
-          static_cast<void*>(const_cast<T*>(diff_dst_tensor.flat<T>().data()));
-    (static_cast<T*>(out_o))[0] = (static_cast<T*>(user_g))[0] *
-                                  ((static_cast<T*>(user_i))[0] > 0);
+        static_cast<void*>(const_cast<T*>(diff_dst_tensor.flat<T>().data()));
+    (static_cast<T*>(out_o))[0] =
+        (static_cast<T*>(user_g))[0] * ((static_cast<T*>(user_i))[0] > 0);
     return;
   }
 };
@@ -701,8 +699,8 @@ class MklEluOp : public MklReluOpBase<Device, T, eltwise_elu> {
  public:
   ~MklEluOp() {}
 
-  explicit MklEluOp(OpKernelConstruction* context) :
-  MklReluOpBase<Device, T, eltwise_elu>(context) {}
+  explicit MklEluOp(OpKernelConstruction* context)
+      : MklReluOpBase<Device, T, eltwise_elu>(context) {}
 
   virtual void Compute_Scalar(OpKernelContext* context) {
     const size_t src_index = 0;  // index of src input tensor
@@ -712,8 +710,8 @@ class MklEluOp : public MklReluOpBase<Device, T, eltwise_elu> {
     GetMklShape(context, src_index, &dnn_shape_src);
 
     Tensor* dst_tensor = nullptr;
-    void* user_i = static_cast<void*>(const_cast<T*>(
-                         src_tensor.flat<T>().data()));
+    void* user_i =
+        static_cast<void*>(const_cast<T*>(src_tensor.flat<T>().data()));
     MklDnnShape dnn_shape_dst;
     dnn_shape_dst.SetMklTensor(false);
     AllocateOutputSetMklShape(context, dst_index, &dst_tensor,
@@ -734,14 +732,14 @@ class MklEluGradOp : public MklReluGradOpBase<Device, T, eltwise_elu> {
  public:
   ~MklEluGradOp() {}
 
-  explicit MklEluGradOp(OpKernelConstruction* context) :
-  MklReluGradOpBase<Device, T, eltwise_elu>(context) {}
+  explicit MklEluGradOp(OpKernelConstruction* context)
+      : MklReluGradOpBase<Device, T, eltwise_elu>(context) {}
 
   virtual void Compute_Scalar(OpKernelContext* context) {
     const size_t diff_dst_index = 0;  // index of diff_dst input tensor
     const size_t src_index = 1;       // index of src input tensor
     const size_t diff_src_index = 0;  // index of diff_src output tensor
-    const Tensor& src_tensor    = MklGetInput(context, src_index);
+    const Tensor& src_tensor = MklGetInput(context, src_index);
     const Tensor& diff_dst_tensor = MklGetInput(context, diff_dst_index);
     Tensor* diff_src_tensor = nullptr;
 
@@ -754,9 +752,9 @@ class MklEluGradOp : public MklReluGradOpBase<Device, T, eltwise_elu> {
                               diff_dst_tensor.shape(), dnn_shape_diff_src);
     void* out_o = static_cast<void*>(diff_src_tensor->flat<T>().data());
     void* user_i =
-          static_cast<void*>(const_cast<T*>(src_tensor.flat<T>().data()));
+        static_cast<void*>(const_cast<T*>(src_tensor.flat<T>().data()));
     void* user_g =
-          static_cast<void*>(const_cast<T*>(diff_dst_tensor.flat<T>().data()));
+        static_cast<void*>(const_cast<T*>(diff_dst_tensor.flat<T>().data()));
     // gradient of elu(x) = 1 if x > 0; elu(x) + 1 otherwise
     T feature = (static_cast<T*>(user_i))[0];
     if (feature > 0) {
@@ -773,8 +771,8 @@ class MklTanhOp : public MklReluOpBase<Device, T, eltwise_tanh> {
  public:
   ~MklTanhOp() {}
 
-  explicit MklTanhOp(OpKernelConstruction* context) :
-  MklReluOpBase<Device, T, eltwise_tanh>(context) {}
+  explicit MklTanhOp(OpKernelConstruction* context)
+      : MklReluOpBase<Device, T, eltwise_tanh>(context) {}
 
   virtual void Compute_Scalar(OpKernelContext* context) {
     const size_t src_index = 0;  // index of src input tensor
@@ -784,8 +782,8 @@ class MklTanhOp : public MklReluOpBase<Device, T, eltwise_tanh> {
     GetMklShape(context, src_index, &dnn_shape_src);
 
     Tensor* dst_tensor = nullptr;
-    void* user_i = static_cast<void*>(const_cast<T*>(
-                         src_tensor.flat<T>().data()));
+    void* user_i =
+        static_cast<void*>(const_cast<T*>(src_tensor.flat<T>().data()));
     MklDnnShape dnn_shape_dst;
     dnn_shape_dst.SetMklTensor(false);
     AllocateOutputSetMklShape(context, dst_index, &dst_tensor,
@@ -795,7 +793,7 @@ class MklTanhOp : public MklReluOpBase<Device, T, eltwise_tanh> {
     T feature = (static_cast<T*>(user_i))[0];
     T e1 = std::exp(feature);
     T e2 = std::exp(-feature);
-    (static_cast<T*>(out_o))[0] = (e1 - e2)/(e1 + e2);
+    (static_cast<T*>(out_o))[0] = (e1 - e2) / (e1 + e2);
     return;
   }
 };
@@ -805,14 +803,14 @@ class MklTanhGradOp : public MklReluGradOpBase<Device, T, eltwise_tanh> {
  public:
   ~MklTanhGradOp() {}
 
-  explicit MklTanhGradOp(OpKernelConstruction* context) :
-  MklReluGradOpBase<Device, T, eltwise_tanh>(context) {}
+  explicit MklTanhGradOp(OpKernelConstruction* context)
+      : MklReluGradOpBase<Device, T, eltwise_tanh>(context) {}
 
   virtual void Compute_Scalar(OpKernelContext* context) {
     const size_t diff_dst_index = 0;  // index of diff_dst input tensor
     const size_t src_index = 1;       // index of src input tensor
     const size_t diff_src_index = 0;  // index of diff_src output tensor
-    const Tensor& src_tensor    = MklGetInput(context, src_index);
+    const Tensor& src_tensor = MklGetInput(context, src_index);
     const Tensor& diff_dst_tensor = MklGetInput(context, diff_dst_index);
     Tensor* diff_src_tensor = nullptr;
 
@@ -825,16 +823,16 @@ class MklTanhGradOp : public MklReluGradOpBase<Device, T, eltwise_tanh> {
                               diff_dst_tensor.shape(), dnn_shape_diff_src);
     void* out_o = static_cast<void*>(diff_src_tensor->flat<T>().data());
     void* user_i =
-          static_cast<void*>(const_cast<T*>(src_tensor.flat<T>().data()));
+        static_cast<void*>(const_cast<T*>(src_tensor.flat<T>().data()));
     // gradient of tanh(x) = 1 - tanh(x)^2
     T feature = (static_cast<T*>(user_i))[0];
     T e1 = std::exp(feature);
     T e2 = std::exp(-feature);
-    T tanh = (e1 - e2)/(e1 + e2);
+    T tanh = (e1 - e2) / (e1 + e2);
     void* user_g =
-          static_cast<void*>(const_cast<T*>(diff_dst_tensor.flat<T>().data()));
-    (static_cast<T*>(out_o))[0] = (static_cast<T*>(user_g))[0] *
-                                  (1 - tanh * tanh);
+        static_cast<void*>(const_cast<T*>(diff_dst_tensor.flat<T>().data()));
+    (static_cast<T*>(out_o))[0] =
+        (static_cast<T*>(user_g))[0] * (1 - tanh * tanh);
   }
 };
 
@@ -854,16 +852,16 @@ class MklTanhGradOp : public MklReluGradOpBase<Device, T, eltwise_tanh> {
                           MklReluGradOp<CPUDevice, type>);
 TF_CALL_float(REGISTER_RELU_MKL_SUPPORTED_KERNELS_TYPES);
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 
 // register dnn kernels for supported operations and supported types
-#define REGISTER_ELU_MKL_SUPPORTED_KERNELS_TYPES(type)             \
-  REGISTER_KERNEL_BUILDER(Name("_MklElu")                          \
+#define REGISTER_ELU_MKL_SUPPORTED_KERNELS_TYPES(type)              \
+  REGISTER_KERNEL_BUILDER(Name("_MklElu")                           \
                               .Device(DEVICE_CPU)                   \
                               .TypeConstraint<type>("T")            \
                               .Label(mkl_op_registry::kMklOpLabel), \
-                          MklEluOp<CPUDevice, type>);              \
-  REGISTER_KERNEL_BUILDER(Name("_MklEluGrad")                      \
+                          MklEluOp<CPUDevice, type>);               \
+  REGISTER_KERNEL_BUILDER(Name("_MklEluGrad")                       \
                               .Device(DEVICE_CPU)                   \
                               .TypeConstraint<type>("T")            \
                               .Label(mkl_op_registry::kMklOpLabel), \
@@ -888,4 +886,3 @@ TF_CALL_float(REGISTER_TANH_MKL_SUPPORTED_KERNELS_TYPES);
 }  // namespace tensorflow
 
 #endif  // INTEL_MKL
-
diff --git a/tensorflow/core/kernels/mkl_reshape_op.cc b/tensorflow/core/kernels/mkl_reshape_op.cc
index b41e529357b2e93570377aaf350c99e0c8f2bd3c..5dbc4a2709e2bc379ae3b9aa68ed14f3d6893e7c 100644
--- a/tensorflow/core/kernels/mkl_reshape_op.cc
+++ b/tensorflow/core/kernels/mkl_reshape_op.cc
@@ -28,7 +28,7 @@ limitations under the License.
 #include "mkl_dnn_types.h"
 #include "tensorflow/core/util/mkl_util.h"
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 #include "mkldnn.hpp"
 using mkldnn::stream;
 #endif
@@ -40,7 +40,7 @@ class MklReshapeOp : public OpKernel {
  public:
   explicit MklReshapeOp(OpKernelConstruction* context) : OpKernel(context) {}
 
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
   void Compute(OpKernelContext* context) override {
     const Tensor& input = MklGetInput(context, 0);
     const Tensor& sizes = MklGetInput(context, 1);
@@ -166,9 +166,9 @@ class MklReshapeOp : public OpKernel {
     MklDnnShape mkl_shape_input;
     GetMklShape(context, kInputSlotIdx, &mkl_shape_input);
     bool input_in_mkl_format = mkl_shape_input.IsMklTensor();
-    const int64 nelems = input_in_mkl_format ?
-                         mkl_shape_input.GetTfShape().num_elements()
-                         : input_tensor.NumElements();
+    const int64 nelems = input_in_mkl_format
+                             ? mkl_shape_input.GetTfShape().num_elements()
+                             : input_tensor.NumElements();
 
     // Preliminary validation of sizes.
     OP_REQUIRES(context, IsLegacyVector(sizes.shape()),
@@ -210,11 +210,11 @@ class MklReshapeOp : public OpKernel {
               product));
       shape.set_dim(unknown_index, missing);
     }
-    OP_REQUIRES(context, shape.num_elements() == nelems,
-                errors::InvalidArgument("Input to reshape is a tensor with ",
-                                        nelems,
-                                        " values, but the requested shape has ",
-                                        shape.num_elements()));
+    OP_REQUIRES(
+        context, shape.num_elements() == nelems,
+        errors::InvalidArgument("Input to reshape is a tensor with ", nelems,
+                                " values, but the requested shape has ",
+                                shape.num_elements()));
 
     if (input_in_mkl_format) {
       TensorShape& shape_to = shape;
@@ -237,38 +237,38 @@ class MklReshapeOp : public OpKernel {
           // need to update MklDnnShape object associated with the input
           // tensor to reflect the shape change expected by reshape.
           if (!SkipReorder(mkl_shape_input, shape_to)) {
-              // If dimensions that are being expanded or collapsed are not
-              // maintained contiguously by MKLDNN, then we use reorder.
-
-              // Get Mkl layout of input tensor.
-              auto input_mkl_md = mkl_shape_input.GetMklLayout();
-              // Set input Mkl layout as the user layout.
-              dnn_data_input.SetUsrMem(input_mkl_md, &input_tensor);
-              // Get expected Tensorflow layout of input tensor.
-              auto output_tf_md = mkl_shape_input.GetTfLayout();
-              auto output_tf_pd = memory::primitive_desc(output_tf_md,
-                                                         cpu_engine);
-
-              Tensor* output_tensor = nullptr;
-              MklShape mkl_shape_output;
-              mkl_shape_output.SetMklTensor(false);
-              // We allocate output tensor in the shape expected by Reshape.
-              AllocateOutputSetMklShape(context, kOutputSlotIdx, &output_tensor,
-                                        shape_to, mkl_shape_output);
-
-              // Insert reorder between Mkl layout and TensorFlow layout if
-              // needed. If reorder is not needed but reshape is needed (since
-              // shape_from != shape_to), then we just copy input tensor to
-              // output tensor with target shape (we cannot forward Mkl layout
-              // in such case because shape has changed.)
-              std::vector<primitive> net;
-              if (dnn_data_input.CheckReorderToOpMem(output_tf_pd,
-                       output_tensor, &net)) {
-                stream(stream::kind::eager).submit(net).wait();
-              } else {
-                output_tensor->CopyFrom(input_tensor, shape_to);
-              }
-              return;
+            // If dimensions that are being expanded or collapsed are not
+            // maintained contiguously by MKLDNN, then we use reorder.
+
+            // Get Mkl layout of input tensor.
+            auto input_mkl_md = mkl_shape_input.GetMklLayout();
+            // Set input Mkl layout as the user layout.
+            dnn_data_input.SetUsrMem(input_mkl_md, &input_tensor);
+            // Get expected Tensorflow layout of input tensor.
+            auto output_tf_md = mkl_shape_input.GetTfLayout();
+            auto output_tf_pd =
+                memory::primitive_desc(output_tf_md, cpu_engine);
+
+            Tensor* output_tensor = nullptr;
+            MklShape mkl_shape_output;
+            mkl_shape_output.SetMklTensor(false);
+            // We allocate output tensor in the shape expected by Reshape.
+            AllocateOutputSetMklShape(context, kOutputSlotIdx, &output_tensor,
+                                      shape_to, mkl_shape_output);
+
+            // Insert reorder between Mkl layout and TensorFlow layout if
+            // needed. If reorder is not needed but reshape is needed (since
+            // shape_from != shape_to), then we just copy input tensor to
+            // output tensor with target shape (we cannot forward Mkl layout
+            // in such case because shape has changed.)
+            std::vector<primitive> net;
+            if (dnn_data_input.CheckReorderToOpMem(output_tf_pd, output_tensor,
+                                                   &net)) {
+              stream(stream::kind::eager).submit(net).wait();
+            } else {
+              output_tensor->CopyFrom(input_tensor, shape_to);
+            }
+            return;
           } else {
             // If dimensions that are being expanded or collapsed are
             // maintained contiguously by MKLDNN, then we skip reorder, just
@@ -276,10 +276,10 @@ class MklReshapeOp : public OpKernel {
             // Tensorflow tensor as it is to the output.
             auto output_dims = TFShapeToMklDnnDims(shape_to);
             auto output_strides = CalculateTFStrides(output_dims);
-            auto output_tf_md = MklDnnData<T>::CreateBlockedMemDesc(output_dims,
-                                                               output_strides);
-            auto output_tf_pd = memory::primitive_desc(output_tf_md,
-                                                       cpu_engine);
+            auto output_tf_md = MklDnnData<T>::CreateBlockedMemDesc(
+                output_dims, output_strides);
+            auto output_tf_pd =
+                memory::primitive_desc(output_tf_md, cpu_engine);
 
             // Set MklDnnShape
             MklDnnShape mkl_shape_output;
@@ -291,18 +291,17 @@ class MklReshapeOp : public OpKernel {
 
             // We now simply forward input Mkl tensor to output and change its
             // output MklDnnShape object.
-            ForwardMklTensorInToOutWithMklShape(context, kInputSlotIdx,
-                                              kOutputSlotIdx, mkl_shape_output);
+            ForwardMklTensorInToOutWithMklShape(
+                context, kInputSlotIdx, kOutputSlotIdx, mkl_shape_output);
             return;
           }
-        } catch (mkldnn::error &e) {
+        } catch (mkldnn::error& e) {
           string error_msg = "Status: " + std::to_string(e.status) +
-                       ", message: " + string(e.message) +
-                       ", in file " + string(__FILE__) + ":" +
-                       std::to_string(__LINE__);
-          OP_REQUIRES_OK(context,
-                   errors::Aborted("Operation received an exception:",
-                      error_msg));
+                             ", message: " + string(e.message) + ", in file " +
+                             string(__FILE__) + ":" + std::to_string(__LINE__);
+          OP_REQUIRES_OK(
+              context,
+              errors::Aborted("Operation received an exception:", error_msg));
         }
       }
     } else {
@@ -313,7 +312,7 @@ class MklReshapeOp : public OpKernel {
     }
   }
 
-#endif  // INTEL_MKL_DNN
+#endif  // INTEL_MKL_ML
 
  private:
   const int kInputSlotIdx = 0;
diff --git a/tensorflow/core/kernels/mkl_softmax_op.cc b/tensorflow/core/kernels/mkl_softmax_op.cc
index c46eabdde103913a712c3d058aa23a627d19f5ea..aceef1e234eff3660b33f5a091a2cd10e25ea2f9 100644
--- a/tensorflow/core/kernels/mkl_softmax_op.cc
+++ b/tensorflow/core/kernels/mkl_softmax_op.cc
@@ -15,7 +15,7 @@ limitations under the License.
 
 // See docs in ../ops/nn_ops.cc.
 #ifdef INTEL_MKL
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/numeric_op.h"
@@ -156,5 +156,5 @@ TF_CALL_float(REGISTER_SOFTMAX_MKL_SUPPORTED_KERNELS_TYPES);
 
 }  // namespace tensorflow
 
-#endif  // INTEL_MKL_DNN
+#endif  // INTEL_MKL_ML
 #endif  // INTEL_MKL
diff --git a/tensorflow/core/kernels/mkl_tfconv_op.h b/tensorflow/core/kernels/mkl_tfconv_op.h
index c4d5a45d3caff0f59b1ecc61f95dd26fe16fd06b..5fafa14b5dbf49d0c9902af4e38653b48d1f179b 100644
--- a/tensorflow/core/kernels/mkl_tfconv_op.h
+++ b/tensorflow/core/kernels/mkl_tfconv_op.h
@@ -35,7 +35,7 @@ limitations under the License.
 #include "mkl_dnn_types.h"
 #include "tensorflow/core/util/mkl_util.h"
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 using mkldnn::stream;
 #endif
 
@@ -61,7 +61,7 @@ class MklToTfOp : public OpKernel {
     VLOG(1) << "MKLToTFConversion complete successfully.";
   }
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
   static void ConvertMklToTf(OpKernel* op_kernel, OpKernelContext* context,
                              string data_format_str, DataType op_data_type,
                              bool has_avx512f, uint input_number) {
diff --git a/tensorflow/core/kernels/neon/neon_depthwise_conv_op.cc b/tensorflow/core/kernels/neon/neon_depthwise_conv_op.cc
index 17f2af550f248a6924bb3d1e7546eca84d4c1e51..0e820bbb6208ae9c13ac2fb33f67590b9e66ba7e 100644
--- a/tensorflow/core/kernels/neon/neon_depthwise_conv_op.cc
+++ b/tensorflow/core/kernels/neon/neon_depthwise_conv_op.cc
@@ -71,10 +71,10 @@ class NeonDepthwiseConv2dNativeOp : public BinaryOp<float> {
                                         filter.shape().DebugString()));
 
     const int32 in_depth = input.dim_size(3);
-    OP_REQUIRES(
-        context, in_depth == filter.dim_size(2),
-        errors::InvalidArgument("input and filter must have the same depth: ",
-                                in_depth, " vs ", filter.dim_size(2)));
+    OP_REQUIRES(context, in_depth == filter.dim_size(2),
+                errors::InvalidArgument(
+                    "input and filter must have the same depth: ", in_depth,
+                    " vs ", filter.dim_size(2)));
     const int32 batch = input.dim_size(0);
     const int32 input_rows = input.dim_size(1);
     const int32 input_cols = input.dim_size(2);
diff --git a/tensorflow/core/kernels/non_max_suppression_op.cc b/tensorflow/core/kernels/non_max_suppression_op.cc
index 64bdef0008f20f3947e990e30e2af7b93a69d50c..5d28b87e6bb8c0f51653fc005a2f62734a44d321 100644
--- a/tensorflow/core/kernels/non_max_suppression_op.cc
+++ b/tensorflow/core/kernels/non_max_suppression_op.cc
@@ -92,13 +92,11 @@ static inline bool IOUGreaterThanThreshold(
   return iou > iou_threshold;
 }
 
-void DoNonMaxSuppressionOp(OpKernelContext* context,
-                           const Tensor& boxes,
-                           const Tensor& scores,
-                           const Tensor& max_output_size,
+void DoNonMaxSuppressionOp(OpKernelContext* context, const Tensor& boxes,
+                           const Tensor& scores, const Tensor& max_output_size,
                            const float iou_threshold) {
   OP_REQUIRES(context, iou_threshold >= 0 && iou_threshold <= 1,
-      errors::InvalidArgument("iou_threshold must be in [0, 1]"));
+              errors::InvalidArgument("iou_threshold must be in [0, 1]"));
 
   int num_boxes = 0;
   ParseAndCheckBoxSizes(context, boxes, scores, &num_boxes);
@@ -106,10 +104,8 @@ void DoNonMaxSuppressionOp(OpKernelContext* context,
     return;
   }
 
-  const int output_size =
-      std::min(max_output_size.scalar<int>()(), num_boxes);
-  typename TTypes<float, 2>::ConstTensor boxes_data =
-      boxes.tensor<float, 2>();
+  const int output_size = std::min(max_output_size.scalar<int>()(), num_boxes);
+  typename TTypes<float, 2>::ConstTensor boxes_data = boxes.tensor<float, 2>();
 
   std::vector<float> scores_data(num_boxes);
   std::copy_n(scores.flat<float>().data(), num_boxes, scores_data.begin());
@@ -181,8 +177,7 @@ template <typename Device>
 class NonMaxSuppressionV2Op : public OpKernel {
  public:
   explicit NonMaxSuppressionV2Op(OpKernelConstruction* context)
-      : OpKernel(context) {
-  }
+      : OpKernel(context) {}
 
   void Compute(OpKernelContext* context) override {
     // boxes: [num_boxes, 4]
@@ -197,10 +192,9 @@ class NonMaxSuppressionV2Op : public OpKernel {
                                 max_output_size.shape().DebugString()));
     // iou_threshold: scalar
     const Tensor& iou_threshold = context->input(3);
-    OP_REQUIRES(
-        context, TensorShapeUtils::IsScalar(iou_threshold.shape()),
-        errors::InvalidArgument("iou_threshold must be 0-D, got shape ",
-                                iou_threshold.shape().DebugString()));
+    OP_REQUIRES(context, TensorShapeUtils::IsScalar(iou_threshold.shape()),
+                errors::InvalidArgument("iou_threshold must be 0-D, got shape ",
+                                        iou_threshold.shape().DebugString()));
 
     const float iou_threshold_val = iou_threshold.scalar<float>()();
 
diff --git a/tensorflow/core/kernels/non_max_suppression_op_test.cc b/tensorflow/core/kernels/non_max_suppression_op_test.cc
index fdbcf05b89ddf122eee9e0133651355edbb1ba5a..67d9217b9502a30f5727b6a91fbf36da872ab972 100644
--- a/tensorflow/core/kernels/non_max_suppression_op_test.cc
+++ b/tensorflow/core/kernels/non_max_suppression_op_test.cc
@@ -43,9 +43,10 @@ class NonMaxSuppressionOpTest : public OpsTestBase {
 
 TEST_F(NonMaxSuppressionOpTest, TestSelectFromThreeClusters) {
   MakeOp(.5);
-  AddInputFromArray<float>(TensorShape({6, 4}),
-                           {0, 0,  1, 1,  0, 0.1f,  1, 1.1f,  0, -0.1f, 1, 0.9f,
-                            0, 10, 1, 11, 0, 10.1f, 1, 11.1f, 0, 100,  1, 101});
+  AddInputFromArray<float>(
+      TensorShape({6, 4}),
+      {0, 0,  1, 1,  0, 0.1f,  1, 1.1f,  0, -0.1f, 1, 0.9f,
+       0, 10, 1, 11, 0, 10.1f, 1, 11.1f, 0, 100,   1, 101});
   AddInputFromArray<float>(TensorShape({6}), {.9f, .75f, .6f, .95f, .5f, .3f});
   AddInputFromArray<int>(TensorShape({}), {3});
   TF_ASSERT_OK(RunOpKernel());
@@ -58,7 +59,7 @@ TEST_F(NonMaxSuppressionOpTest, TestSelectFromThreeClusters) {
 TEST_F(NonMaxSuppressionOpTest, TestSelectFromThreeClustersFlippedCoordinates) {
   MakeOp(.5);
   AddInputFromArray<float>(TensorShape({6, 4}),
-                           {1, 1,  0, 0,  0, 0.1f,  1, 1.1f,  0, .9f,  1, -0.1f,
+                           {1, 1,  0, 0,  0, 0.1f,  1, 1.1f,  0, .9f, 1, -0.1f,
                             0, 10, 1, 11, 1, 10.1f, 0, 11.1f, 1, 101, 0, 100});
   AddInputFromArray<float>(TensorShape({6}), {.9f, .75f, .6f, .95f, .5f, .3f});
   AddInputFromArray<int>(TensorShape({}), {3});
@@ -71,9 +72,10 @@ TEST_F(NonMaxSuppressionOpTest, TestSelectFromThreeClustersFlippedCoordinates) {
 
 TEST_F(NonMaxSuppressionOpTest, TestSelectAtMostTwoBoxesFromThreeClusters) {
   MakeOp(.5);
-  AddInputFromArray<float>(TensorShape({6, 4}),
-                           {0, 0,  1, 1,  0, 0.1f,  1, 1.1f,  0, -0.1f, 1, 0.9f,
-                            0, 10, 1, 11, 0, 10.1f, 1, 11.1f, 0, 100,  1, 101});
+  AddInputFromArray<float>(
+      TensorShape({6, 4}),
+      {0, 0,  1, 1,  0, 0.1f,  1, 1.1f,  0, -0.1f, 1, 0.9f,
+       0, 10, 1, 11, 0, 10.1f, 1, 11.1f, 0, 100,   1, 101});
   AddInputFromArray<float>(TensorShape({6}), {.9f, .75f, .6f, .95f, .5f, .3f});
   AddInputFromArray<int>(TensorShape({}), {2});
   TF_ASSERT_OK(RunOpKernel());
@@ -85,9 +87,10 @@ TEST_F(NonMaxSuppressionOpTest, TestSelectAtMostTwoBoxesFromThreeClusters) {
 
 TEST_F(NonMaxSuppressionOpTest, TestSelectAtMostThirtyBoxesFromThreeClusters) {
   MakeOp(.5);
-  AddInputFromArray<float>(TensorShape({6, 4}),
-                           {0, 0,  1, 1,  0, 0.1f,  1, 1.1f,  0, -0.1f, 1, 0.9f,
-                            0, 10, 1, 11, 0, 10.1f, 1, 11.1f, 0, 100,  1, 101});
+  AddInputFromArray<float>(
+      TensorShape({6, 4}),
+      {0, 0,  1, 1,  0, 0.1f,  1, 1.1f,  0, -0.1f, 1, 0.9f,
+       0, 10, 1, 11, 0, 10.1f, 1, 11.1f, 0, 100,   1, 101});
   AddInputFromArray<float>(TensorShape({6}), {.9f, .75f, .6f, .95f, .5f, .3f});
   AddInputFromArray<int>(TensorShape({}), {30});
   TF_ASSERT_OK(RunOpKernel());
@@ -134,9 +137,10 @@ TEST_F(NonMaxSuppressionOpTest, TestSelectFromTenIdenticalBoxes) {
 
 TEST_F(NonMaxSuppressionOpTest, TestInconsistentBoxAndScoreShapes) {
   MakeOp(.5);
-  AddInputFromArray<float>(TensorShape({6, 4}),
-                           {0, 0,  1, 1,  0, 0.1f,  1, 1.1f,  0, -0.1f, 1, 0.9f,
-                            0, 10, 1, 11, 0, 10.1f, 1, 11.1f, 0, 100,  1, 101});
+  AddInputFromArray<float>(
+      TensorShape({6, 4}),
+      {0, 0,  1, 1,  0, 0.1f,  1, 1.1f,  0, -0.1f, 1, 0.9f,
+       0, 10, 1, 11, 0, 10.1f, 1, 11.1f, 0, 100,   1, 101});
   AddInputFromArray<float>(TensorShape({5}), {.9f, .75f, .6f, .95f, .5f});
   AddInputFromArray<int>(TensorShape({}), {30});
   Status s = RunOpKernel();
diff --git a/tensorflow/core/kernels/nth_element_op.cc b/tensorflow/core/kernels/nth_element_op.cc
index da825e408c24617862e8613c6b63ed1a51944041..7f12eb953a31ec667a5f3cee379bd3d1970b3a56 100644
--- a/tensorflow/core/kernels/nth_element_op.cc
+++ b/tensorflow/core/kernels/nth_element_op.cc
@@ -16,15 +16,15 @@ limitations under the License.
 // See docs in ../ops/nn_ops.cc.
 #include "tensorflow/core/kernels/nth_element_op.h"
 
+#include <algorithm>
+#include <iostream>
+#include <vector>
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
-#include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/util/work_sharder.h"
-#include <vector>
-#include <algorithm>
-#include <iostream>
 
 namespace tensorflow {
 
@@ -54,8 +54,9 @@ class NthElementOp : public OpKernel {
                 errors::InvalidArgument("Input must be >= 1-D, got shape ",
                                         input_in.shape().DebugString()));
     // The last dimension of input tensor must be greater than N.
-    OP_REQUIRES(context, input_in.dim_size(num_dims-1) > n,
-                errors::InvalidArgument("Input must have at least n+1 columns"));
+    OP_REQUIRES(
+        context, input_in.dim_size(num_dims - 1) > n,
+        errors::InvalidArgument("Input must have at least n+1 columns"));
 
     // std::nth_element only support the nth-smallest selection.
     if (reverse_) {
@@ -64,7 +65,7 @@ class NthElementOp : public OpKernel {
 
     // Assume input_shape is [d1,d2,...dk], and output_shape is [d1,d2...dk-1].
     TensorShape out_shape;
-    for (int i = 0; i < num_dims-1; ++i) {
+    for (int i = 0; i < num_dims - 1; ++i) {
       out_shape.AddDim(input_in.dim_size(i));
     }
     Tensor* output_tensor = nullptr;
@@ -83,32 +84,28 @@ namespace functor {
 
 template <typename T>
 struct NthElementFunctor<CPUDevice, T> {
-  void operator() (OpKernelContext* context,
-                   const Tensor& input_tensor,
-                   Tensor& output_tensor,
-                   int n,
-                   bool reverse) {
+  void operator()(OpKernelContext* context, const Tensor& input_tensor,
+                  Tensor& output_tensor, int n, bool reverse) {
     const T* input = input_tensor.flat<T>().data();
     T* output = output_tensor.flat<T>().data();
 
     // Assume input_shape is [d1,d2,...dk], and output_shape is [d1,d2...dk-1],
     // then num_rows = d1*d2...dk-1, last_dim = dk.
     const int num_rows = output_tensor.NumElements();
-    const int last_dim = input_tensor.dim_size(input_tensor.dims()-1);
+    const int last_dim = input_tensor.dim_size(input_tensor.dims() - 1);
 
     // Allocate each row to different shard.
-    auto SubNthElement = [&, input, output, last_dim, n](int start,
-                                                         int limit) {
+    auto SubNthElement = [&, input, output, last_dim, n](int start, int limit) {
       // std::nth_element would rearrange the array, so we need a new buffer.
       std::vector<T> buf(last_dim);
 
       for (int b = start; b < limit; ++b) {
         // Copy from one row of elements to buffer
         const T* input_start = input + b * last_dim;
-        const T* input_end = input + (b+1) * last_dim;
+        const T* input_end = input + (b + 1) * last_dim;
         std::copy(input_start, input_end, buf.begin());
 
-        std::nth_element(buf.begin(), buf.begin()+n, buf.end());
+        std::nth_element(buf.begin(), buf.begin() + n, buf.end());
         // The element placed in the nth position is exactly the element that
         // would occur in this position if the range was fully sorted.
         output[b] = buf[n];
@@ -116,9 +113,9 @@ struct NthElementFunctor<CPUDevice, T> {
     };
 
     auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads());
-    // The average time complexity of partition-based nth_element (BFPRT) is O(n),
-    // althought the worst time complexity could be O(n^2).
-    // Here, 20 is a empirical factor of cost_per_unit.
+    // The average time complexity of partition-based nth_element (BFPRT) is
+    // O(n), althought the worst time complexity could be O(n^2). Here, 20 is a
+    // empirical factor of cost_per_unit.
     Shard(worker_threads.num_threads, worker_threads.workers, num_rows,
           20 * last_dim, SubNthElement);
   }
@@ -126,7 +123,6 @@ struct NthElementFunctor<CPUDevice, T> {
 
 }  // namespace functor
 
-
 #define REGISTER_NTHOP(T)                                           \
   REGISTER_KERNEL_BUILDER(                                          \
       Name("NthElement").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
@@ -136,4 +132,3 @@ TF_CALL_REAL_NUMBER_TYPES(REGISTER_NTHOP);
 #undef REGISTER_NTHOP
 
 }  // end namespace tensorflow
-
diff --git a/tensorflow/core/kernels/nth_element_op.h b/tensorflow/core/kernels/nth_element_op.h
index 11a6c996b093fa7255a230122f64eb1054789453..e7d25daecc74a6d7b178034d5d78776a390ffe04 100644
--- a/tensorflow/core/kernels/nth_element_op.h
+++ b/tensorflow/core/kernels/nth_element_op.h
@@ -26,10 +26,8 @@ namespace functor {
 
 template <typename Device, typename T>
 struct NthElementFunctor {
-  void operator() (OpKernelContext* context,
-                   const Tensor& input_tensor,
-                   Tensor& output_tensor,
-                   int n);
+  void operator()(OpKernelContext* context, const Tensor& input_tensor,
+                  Tensor& output_tensor, int n);
 };
 
 }  // namespace functor
diff --git a/tensorflow/core/kernels/one_hot_op_gpu.cu.cc b/tensorflow/core/kernels/one_hot_op_gpu.cu.cc
index 49fd4bdebad420d8e848b0491a764d976f4557cd..647515ae38ab5530b69fa135257584eea531d46c 100644
--- a/tensorflow/core/kernels/one_hot_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/one_hot_op_gpu.cu.cc
@@ -19,16 +19,16 @@ limitations under the License.
 
 #define EIGEN_USE_GPU
 
-#include "tensorflow/core/kernels/one_hot_op.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/one_hot_op.h"
 
 namespace tensorflow {
 
 typedef Eigen::GpuDevice GPUDevice;
 
-#define DEFINE_GPU_SPEC_INDEX(T, TI)                   \
-  template class generator::OneGenerator<T, TI>;       \
+#define DEFINE_GPU_SPEC_INDEX(T, TI)             \
+  template class generator::OneGenerator<T, TI>; \
   template struct functor::OneHot<GPUDevice, T, TI>;
 
 #define DEFINE_GPU_SPEC(T)         \
diff --git a/tensorflow/core/kernels/ops_util_test.cc b/tensorflow/core/kernels/ops_util_test.cc
index 9d53882deef89230bd39d8318f11d84269406f20..13427d71ff6841a85c31d3bf42c038f6413c1fe6 100644
--- a/tensorflow/core/kernels/ops_util_test.cc
+++ b/tensorflow/core/kernels/ops_util_test.cc
@@ -218,7 +218,8 @@ TEST_F(OpsUtilTest, GetBroadcastTest3_3_1_2) {
 // in_size = 3, ksize = 3, stride = 2, pad_size = 0
 TEST_F(OpsUtilTest, GetBroadcastTest3_3_2_0) {
   bcast_struct bcast[] = {
-      {{0, 3, 3, 2, 0}, {0, 3}}, {{1, 3, 3, 2, 0}, {2, 1}},
+      {{0, 3, 3, 2, 0}, {0, 3}},
+      {{1, 3, 3, 2, 0}, {2, 1}},
   };
   for (size_t i = 0; i < sizeof(bcast) / sizeof(bcast[0]); ++i) {
     VerifyBcastValues(bcast[i]);
@@ -228,7 +229,8 @@ TEST_F(OpsUtilTest, GetBroadcastTest3_3_2_0) {
 // in_size = 3, ksize = 3, stride = 2, pad_size = 1
 TEST_F(OpsUtilTest, GetBroadcastTest3_3_2_1) {
   bcast_struct bcast[] = {
-      {{0, 3, 3, 2, 1}, {0, 2}}, {{1, 3, 3, 2, 1}, {1, 2}},
+      {{0, 3, 3, 2, 1}, {0, 2}},
+      {{1, 3, 3, 2, 1}, {1, 2}},
   };
   for (size_t i = 0; i < sizeof(bcast) / sizeof(bcast[0]); ++i) {
     VerifyBcastValues(bcast[i]);
@@ -258,7 +260,8 @@ TEST_F(OpsUtilTest, GetBroadcastTest3_3_3_0) {
 // in_size = 3, ksize = 3, stride = 3, pad_size = 1
 TEST_F(OpsUtilTest, GetBroadcastTest3_3_3_1) {
   bcast_struct bcast[] = {
-      {{0, 3, 3, 3, 1}, {0, 2}}, {{1, 3, 3, 3, 1}, {2, 1}},
+      {{0, 3, 3, 3, 1}, {0, 2}},
+      {{1, 3, 3, 3, 1}, {2, 1}},
   };
   for (size_t i = 0; i < sizeof(bcast) / sizeof(bcast[0]); ++i) {
     VerifyBcastValues(bcast[i]);
@@ -348,8 +351,8 @@ TEST_F(OpsUtilTest, Misaligned1DSlice) {
 
 TEST_F(OpsUtilTest, Aligned2DSliceOfDim0) {
 #if EIGEN_MAX_ALIGN_BYTES == 0
-  // When EIGEN_MAX_ALIGN_BYTES is 0 and the size of the first dimension is nonzero,
-  // a multidimensional tensor is always aligned.
+  // When EIGEN_MAX_ALIGN_BYTES is 0 and the size of the first dimension is
+  // nonzero, a multidimensional tensor is always aligned.
   Tensor t(DT_FLOAT, TensorShape({3, 4}));
   int64 start = 1;
   int64 end = 2;
diff --git a/tensorflow/core/kernels/pack_op.cc b/tensorflow/core/kernels/pack_op.cc
index 2033fbf5dc3f238b665c6f4afced06e90c81bb7c..e0ae5de0f45063dd55fe567519942437e4ea889a 100644
--- a/tensorflow/core/kernels/pack_op.cc
+++ b/tensorflow/core/kernels/pack_op.cc
@@ -36,7 +36,7 @@ typedef Eigen::GpuDevice GPUDevice;
 #endif  // GOOGLE_CUDA
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 // --------------------------------------------------------------------------
 template <typename Device, typename T>
@@ -123,7 +123,7 @@ class PackOp : public OpKernel {
         ConcatSYCL<T>(c->eigen_sycl_device(), inputs_flat, &output_flat);
         return;
       }
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
       ConcatCPU<T>(c->device(), inputs_flat, &output_flat);
     }
   }
diff --git a/tensorflow/core/kernels/parameterized_truncated_normal_op.cc b/tensorflow/core/kernels/parameterized_truncated_normal_op.cc
index b232ba16a76877b4d9f0e8c24e7ccd17a9bc0856..0ab9ff9f650e137017b49d5d279f1a28ff45fa29 100644
--- a/tensorflow/core/kernels/parameterized_truncated_normal_op.cc
+++ b/tensorflow/core/kernels/parameterized_truncated_normal_op.cc
@@ -95,9 +95,10 @@ struct TruncatedNormalFunctor<CPUDevice, T> {
         int64 sample = b * samples_per_batch;
 
         // On GPU, this check will just fill samples with NAN if it fails.
-        OP_REQUIRES(ctx, stddev > T(0) && minval < maxval &&
-                             (Eigen::numext::isfinite(minval) ||
-                              Eigen::numext::isfinite(maxval)),
+        OP_REQUIRES(ctx,
+                    stddev > T(0) && minval < maxval &&
+                        (Eigen::numext::isfinite(minval) ||
+                         Eigen::numext::isfinite(maxval)),
                     errors::InvalidArgument("Invalid parameters"));
 
         int numIterations = 0;
@@ -118,8 +119,9 @@ struct TruncatedNormalFunctor<CPUDevice, T> {
         // Determine the method to use.
         const T sqrtFactor = Eigen::numext::sqrt((normMin * normMin) + T(4));
         const T cutoff =
-            T(2) * Eigen::numext::exp(
-                       T(0.5) + (normMin * (normMin - sqrtFactor)) / T(4)) /
+            T(2) *
+            Eigen::numext::exp(T(0.5) +
+                               (normMin * (normMin - sqrtFactor)) / T(4)) /
             (normMin + sqrtFactor);
         const T diff = normMax - normMin;
         if (diff < cutoff) {
@@ -309,30 +311,34 @@ class ParameterizedTruncatedNormalOp : public OpKernel {
     } else {
       // Parameters must be broadcastable to the shape [num_batches].
       OP_REQUIRES(
-          ctx, TensorShapeUtils::IsScalar(means_tensor.shape()) ||
-                   means_tensor.dim_size(0) == 1 ||
-                   means_tensor.dim_size(0) == num_batches,
+          ctx,
+          TensorShapeUtils::IsScalar(means_tensor.shape()) ||
+              means_tensor.dim_size(0) == 1 ||
+              means_tensor.dim_size(0) == num_batches,
           errors::InvalidArgument(
               "Input means should have length 1 or shape[0], got shape: ",
               means_tensor.shape().DebugString()));
       OP_REQUIRES(
-          ctx, TensorShapeUtils::IsScalar(stddevs_tensor.shape()) ||
-                   stddevs_tensor.dim_size(0) == 1 ||
-                   stddevs_tensor.dim_size(0) == num_batches,
+          ctx,
+          TensorShapeUtils::IsScalar(stddevs_tensor.shape()) ||
+              stddevs_tensor.dim_size(0) == 1 ||
+              stddevs_tensor.dim_size(0) == num_batches,
           errors::InvalidArgument(
               "Input stddevs should have length 1 or shape[0], got shape: ",
               stddevs_tensor.shape().DebugString()));
       OP_REQUIRES(
-          ctx, TensorShapeUtils::IsScalar(minvals_tensor.shape()) ||
-                   minvals_tensor.dim_size(0) == 1 ||
-                   minvals_tensor.dim_size(0) == num_batches,
+          ctx,
+          TensorShapeUtils::IsScalar(minvals_tensor.shape()) ||
+              minvals_tensor.dim_size(0) == 1 ||
+              minvals_tensor.dim_size(0) == num_batches,
           errors::InvalidArgument(
               "Input minvals should have length 1 or shape[0], got shape: ",
               minvals_tensor.shape().DebugString()));
       OP_REQUIRES(
-          ctx, TensorShapeUtils::IsScalar(maxvals_tensor.shape()) ||
-                   maxvals_tensor.dim_size(0) == 1 ||
-                   maxvals_tensor.dim_size(0) == num_batches,
+          ctx,
+          TensorShapeUtils::IsScalar(maxvals_tensor.shape()) ||
+              maxvals_tensor.dim_size(0) == 1 ||
+              maxvals_tensor.dim_size(0) == num_batches,
           errors::InvalidArgument(
               "Input maxvals should have length 1 or shape[0], got shape: ",
               maxvals_tensor.shape().DebugString()));
diff --git a/tensorflow/core/kernels/parameterized_truncated_normal_op_gpu.cu.cc b/tensorflow/core/kernels/parameterized_truncated_normal_op_gpu.cu.cc
index 933de65c15a772154ce439cc54489c4a29c42ea5..ddfeb1bb7903e4dd66f557df7702c083a6b62899 100644
--- a/tensorflow/core/kernels/parameterized_truncated_normal_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/parameterized_truncated_normal_op_gpu.cu.cc
@@ -202,12 +202,13 @@ struct TruncatedNormalFunctor<GPUDevice, T> {
                   typename TTypes<T>::Flat output) {
     const auto config = GetCudaLaunchConfig(num_elements, d);
 
-    TruncatedNormalKernel<
-        T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-        gen, output.data(), num_batches, samples_per_batch, num_elements,
-        means.data(), means.dimension(0) == 1, stddevs.data(),
-        stddevs.dimension(0) == 1, minvals.data(), minvals.dimension(0) == 1,
-        maxvals.data(), maxvals.dimension(0) == 1, kMaxIterations);
+    TruncatedNormalKernel<T>
+        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+            gen, output.data(), num_batches, samples_per_batch, num_elements,
+            means.data(), means.dimension(0) == 1, stddevs.data(),
+            stddevs.dimension(0) == 1, minvals.data(),
+            minvals.dimension(0) == 1, maxvals.data(),
+            maxvals.dimension(0) == 1, kMaxIterations);
   };
 };
 
diff --git a/tensorflow/core/kernels/parse_tensor_op.cc b/tensorflow/core/kernels/parse_tensor_op.cc
index 6b599612ad7fde0bac44282521be26581aa752b8..dd41744f023dd06c66e0f5a921880dbe6d5b843d 100644
--- a/tensorflow/core/kernels/parse_tensor_op.cc
+++ b/tensorflow/core/kernels/parse_tensor_op.cc
@@ -22,7 +22,6 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/framework/register_types.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/kernels/pooling_ops_3d.cc b/tensorflow/core/kernels/pooling_ops_3d.cc
index a406317213f51d557d7b5a9942260156c0fe6369..01bcfede1e8d1f1a71059c5171f8a4d7290d7a5b 100644
--- a/tensorflow/core/kernels/pooling_ops_3d.cc
+++ b/tensorflow/core/kernels/pooling_ops_3d.cc
@@ -258,7 +258,7 @@ struct LaunchMaxPooling3dGradOp<CPUDevice, T> {
           Eigen::array<int, 5> bcast = {1, csize, rsize, psize, 1};
 #else
           Eigen::IndexList<Eigen::type2index<1>, int, int, int,
-                           Eigen::type2index<1> >
+                           Eigen::type2index<1>>
               bcast;
           bcast.set(1, csize);
           bcast.set(2, rsize);
@@ -431,7 +431,7 @@ struct LaunchAvgPooling3dGradOp<CPUDevice, T> {
           Eigen::array<int, 5> bcast = {1, csize, rsize, psize, 1};
 #else
           Eigen::IndexList<Eigen::type2index<1>, int, int, int,
-                           Eigen::type2index<1> >
+                           Eigen::type2index<1>>
               bcast;
           bcast.set(1, csize);
           bcast.set(2, rsize);
@@ -833,7 +833,7 @@ TF_CALL_float(REGISTER_GPU_KERNELS) TF_CALL_half(REGISTER_GPU_KERNELS)
 
 #ifdef TENSORFLOW_USE_SYCL
 #define REGISTER_SYCL_KERNELS(T) REGISTER_KERNELS(SYCL, T)
-TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL_KERNELS)
+    TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL_KERNELS)
 #undef REGISTER_SYCL_KERNELS
 #endif  // TENSORFLOW_USE_SYCL
 
diff --git a/tensorflow/core/kernels/pooling_ops_3d_sycl.h b/tensorflow/core/kernels/pooling_ops_3d_sycl.h
index c1bc5af4986ee7102929af3e9b37a7301830de0e..b4bead2456d58c636301678d8a81864b25e3e85b 100644
--- a/tensorflow/core/kernels/pooling_ops_3d_sycl.h
+++ b/tensorflow/core/kernels/pooling_ops_3d_sycl.h
@@ -281,12 +281,11 @@ class MaxPool3DGradSYCL {
 
     const T* input_data_n =
         input_data + n * p_.in_planes_ * p_.in_cols_ * p_.in_rows_ * p_.depth_;
-    const T* output_data_n =
-        output_data +
-        n * p_.out_planes_ * p_.out_cols_ * p_.out_rows_ * p_.depth_;
-    const T* input_backprop_n =
-        input_backprop +
-        n * p_.out_planes_ * p_.out_cols_ * p_.out_rows_ * p_.depth_;
+    const T* output_data_n = output_data + n * p_.out_planes_ * p_.out_cols_ *
+                                               p_.out_rows_ * p_.depth_;
+    const T* input_backprop_n = input_backprop + n * p_.out_planes_ *
+                                                     p_.out_cols_ *
+                                                     p_.out_rows_ * p_.depth_;
     for (int poolp = poolpstart; poolp < poolpend; ++poolp) {
       int pstart = poolp * p_.stride_planes_ - p_.pad_planes_;
       const int pend = std::min(pstart + p_.window_planes_, p_.in_planes_);
@@ -678,9 +677,9 @@ class AvgPool3DGradSYCL {
     n /= p_.in_planes_;
 
     T gradient = T(0);
-    const T* input_backprop_n =
-        input_backprop +
-        n * p_.out_planes_ * p_.out_cols_ * p_.out_rows_ * p_.depth_;
+    const T* input_backprop_n = input_backprop + n * p_.out_planes_ *
+                                                     p_.out_cols_ *
+                                                     p_.out_rows_ * p_.depth_;
     for (int poolp = poolpstart; poolp < poolpend; ++poolp) {
       int pstart = poolp * p_.stride_planes_ - p_.pad_planes_;
       const int pend = std::min(pstart + p_.window_planes_, p_.in_planes_);
diff --git a/tensorflow/core/kernels/pooling_ops_common.h b/tensorflow/core/kernels/pooling_ops_common.h
index e3131b804f2412c890016dcfb3aace1648729172..fc7cb437b8f583a811427deaf52a94d9ef996f37 100644
--- a/tensorflow/core/kernels/pooling_ops_common.h
+++ b/tensorflow/core/kernels/pooling_ops_common.h
@@ -195,7 +195,6 @@ class MaxPoolingOp : public OpKernel {
       //    and updates the corresponding column(s) in output_as_matrix with the
       //    max value.
       auto shard = [&params, &in_mat, &out_mat](int64 start, int64 limit) {
-
         const int32 in_rows = params.tensor_in_rows;
         const int32 in_cols = params.tensor_in_cols;
         const int32 pad_rows = params.pad_rows;
@@ -443,7 +442,6 @@ class MaxPoolingV2Op : public OpKernel {
       //    and updates the corresponding column(s) in output_as_matrix with the
       //    max value.
       auto shard = [&params, &in_mat, &out_mat](int64 start, int64 limit) {
-
         const int32 in_rows = params.tensor_in_rows;
         const int32 in_cols = params.tensor_in_cols;
         const int32 pad_rows = params.pad_rows;
diff --git a/tensorflow/core/kernels/quantization_utils_test.cc b/tensorflow/core/kernels/quantization_utils_test.cc
index d148c9f78d61d9b1840cc7a14f82c9254a4d434c..176720c22cc54ea8d9b79dacfc77f6cd2532f93a 100644
--- a/tensorflow/core/kernels/quantization_utils_test.cc
+++ b/tensorflow/core/kernels/quantization_utils_test.cc
@@ -385,8 +385,12 @@ void TestQuantizedToFloatInPlaceUsingEigen(
   // These are the float values we're going to test the conversions on.
   typedef std::pair<float, float> FPair;
   for (FPair min_and_max : std::vector<FPair>{
-           FPair(-255.0f, 255.0f), FPair(-1.0f, 1.0f), FPair(-1.0f, 255.0f),
-           FPair(0.0f, 1e6), FPair(0.0f, 1.0f), FPair(-31.0f, 13.0f),
+           FPair(-255.0f, 255.0f),
+           FPair(-1.0f, 1.0f),
+           FPair(-1.0f, 255.0f),
+           FPair(0.0f, 1e6),
+           FPair(0.0f, 1.0f),
+           FPair(-31.0f, 13.0f),
            FPair(-5.89505e+08, 5.89505e+08),
        }) {
     const float f_min = min_and_max.first;
diff --git a/tensorflow/core/kernels/quantize_and_dequantize_op.h b/tensorflow/core/kernels/quantize_and_dequantize_op.h
index 1363c7e325b6a251d97039df3de271e92f59f6c0..3b09ea2527d8b401941c6ef0951c620edd0c5217 100644
--- a/tensorflow/core/kernels/quantize_and_dequantize_op.h
+++ b/tensorflow/core/kernels/quantize_and_dequantize_op.h
@@ -71,7 +71,8 @@ struct QuantizeAndDequantizeOneScaleImpl {
         out.device(d) =
             ((input.cwiseMin(max_range).cwiseMax(min_range) - min_range) *
                  scale +
-             T(0.5)).floor() *
+             T(0.5))
+                    .floor() *
                 inverse_scale +
             min_range;
       } else {
diff --git a/tensorflow/core/kernels/quantize_op_test.cc b/tensorflow/core/kernels/quantize_op_test.cc
index d2cc55a94ddd7b3e31a5cfc841de25519abe2746..57982bdf76e3969b31f4ee73cbf47c564b2b53e6 100644
--- a/tensorflow/core/kernels/quantize_op_test.cc
+++ b/tensorflow/core/kernels/quantize_op_test.cc
@@ -250,7 +250,8 @@ TEST_F(QuantizedOpTest, QuantizeV2_32Bit) {
   Tensor expected(allocator(), DT_QINT32, TensorShape({element_count}));
   test::FillValues<qint32>(&expected,
                            {
-                               std::numeric_limits<int32>::min(), 0,
+                               std::numeric_limits<int32>::min(),
+                               0,
                                static_cast<int32>(1.0f * (1 << 23)),
                                static_cast<int32>(1.25f * (1 << 23)),
                                static_cast<int32>(1.75f * (1 << 23)),
diff --git a/tensorflow/core/kernels/quantized_batch_norm_op.cc b/tensorflow/core/kernels/quantized_batch_norm_op.cc
index 18d83b414940504fcb4e031f3304412da3baf51b..b03da7ad17fab45086438691a1013b2acf54ee87 100644
--- a/tensorflow/core/kernels/quantized_batch_norm_op.cc
+++ b/tensorflow/core/kernels/quantized_batch_norm_op.cc
@@ -16,11 +16,11 @@ limitations under the License.
 #define EIGEN_USE_THREADS
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
-#include "tensorflow/core/kernels/quantization_utils.h"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/quantization_utils.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/kernels/quantized_concat_op.cc b/tensorflow/core/kernels/quantized_concat_op.cc
index d67f1ab3ec28934bc08c11997a8b2f448c30ad91..b03ac8e87dac8fabe0d45d8685ec4fa5fd642519 100644
--- a/tensorflow/core/kernels/quantized_concat_op.cc
+++ b/tensorflow/core/kernels/quantized_concat_op.cc
@@ -135,8 +135,8 @@ class QuantizedConcatOp : public OpKernel {
           context, in.dims() == input_dims || (input_is_scalar && in_is_scalar),
           errors::InvalidArgument(
               "ConcatOp : Ranks of all input tensors should match: shape[0] = ",
-              input_shape.DebugString(), " vs. shape[", i, "] = ",
-              in.shape().DebugString()));
+              input_shape.DebugString(), " vs. shape[", i,
+              "] = ", in.shape().DebugString()));
       for (int j = 0; j < input_dims; ++j) {
         if (j == concat_dim) {
           continue;
@@ -145,8 +145,8 @@ class QuantizedConcatOp : public OpKernel {
             context, in.dim_size(j) == input_shape.dim_size(j),
             errors::InvalidArgument(
                 "ConcatOp : Dimensions of inputs should match: shape[0] = ",
-                input_shape.DebugString(), " vs. shape[", i, "] = ",
-                in.shape().DebugString()));
+                input_shape.DebugString(), " vs. shape[", i,
+                "] = ", in.shape().DebugString()));
       }
       if (in.NumElements() > 0) {
         int64 inputs_flat_dim1 = in.NumElements() / inputs_flat_dim0;
diff --git a/tensorflow/core/kernels/quantized_conv_ops.cc b/tensorflow/core/kernels/quantized_conv_ops.cc
index 1921b83d12c0688a96bad0c561080a0189e49bbe..5b3570edff5fee4b77d02684ef3da2af1d5f14b1 100644
--- a/tensorflow/core/kernels/quantized_conv_ops.cc
+++ b/tensorflow/core/kernels/quantized_conv_ops.cc
@@ -278,10 +278,9 @@ class Im2ColConvFunctor {
           *resource = new Im2ColBufferResource<T1, chunk_value_count>();
           return Status::OK();
         };
-    OP_REQUIRES_OK(
-        context,
-        context->resource_manager()->LookupOrCreate(
-            "Conv2d", "im2col_buffer", &im2col_buffer_resource, creator));
+    OP_REQUIRES_OK(context, context->resource_manager()->LookupOrCreate(
+                                "Conv2d", "im2col_buffer",
+                                &im2col_buffer_resource, creator));
     // This means that multiple ops can't be run simultaneously on different
     // threads, because we have a single shared resource. The platforms this is
     // aimed at have intra-op parallelism as their focus though, so it shouldn't
diff --git a/tensorflow/core/kernels/quantized_instance_norm.cc b/tensorflow/core/kernels/quantized_instance_norm.cc
index c29f534f31b524f6e1d9ec09750b6de265ec10f8..d62094cc9fad85536edba8bb3854e71870df217c 100644
--- a/tensorflow/core/kernels/quantized_instance_norm.cc
+++ b/tensorflow/core/kernels/quantized_instance_norm.cc
@@ -278,10 +278,10 @@ class QuantizedInstanceNorm : public OpKernel {
     float input_max = context->input(2).flat<float>()(0);
     float input_scale = (input_max - input_min) / 255.0f;
 
-    OP_REQUIRES(
-        context, input_min < input_max,
-        errors::InvalidArgument("input_min must be less than input_max : ",
-                                input_min, " >= ", input_max));
+    OP_REQUIRES(context, input_min < input_max,
+                errors::InvalidArgument(
+                    "input_min must be less than input_max : ", input_min,
+                    " >= ", input_max));
 
     auto input_tensor = input.tensor<quint8, 4>();
     auto N = input_tensor.dimension(0);
diff --git a/tensorflow/core/kernels/quantized_matmul_op.cc b/tensorflow/core/kernels/quantized_matmul_op.cc
index afb30d5f627feab1a009ec84c5f0bb9f851766e0..da8c46dc5162f30ea129e71fb5a1c81ee594718d 100644
--- a/tensorflow/core/kernels/quantized_matmul_op.cc
+++ b/tensorflow/core/kernels/quantized_matmul_op.cc
@@ -104,9 +104,9 @@ class QuantizedMatMulOp : public OpKernel {
 
     OP_REQUIRES(context,
                 a.dim_size(dim_pair[0].first) == b.dim_size(dim_pair[0].second),
-                errors::InvalidArgument("Matrix size-compatible: In[0]: ",
-                                        a.shape().DebugString(), ", In[1]: ",
-                                        b.shape().DebugString()));
+                errors::InvalidArgument(
+                    "Matrix size-compatible: In[0]: ", a.shape().DebugString(),
+                    ", In[1]: ", b.shape().DebugString()));
 
     OP_REQUIRES(context, ((shift_c >= 0) && (shift_c <= 31)),
                 errors::InvalidArgument("shift_c must be between 0 and 31, "
diff --git a/tensorflow/core/kernels/quantized_matmul_op_test.cc b/tensorflow/core/kernels/quantized_matmul_op_test.cc
index 535b5115c34e61333a0e7e1fdbfbe2b35571bf6c..c9f05dbc10bb8bcd3acae2d2ca0c149ac620bb79 100644
--- a/tensorflow/core/kernels/quantized_matmul_op_test.cc
+++ b/tensorflow/core/kernels/quantized_matmul_op_test.cc
@@ -206,17 +206,32 @@ TEST_F(QuantizedMatMulTest, Small_WithParams) {
   // We have set the transpose_a flag to true, so the matrix is transposed, and
   // for filling the values the in-memory storage order is effectively
   // column major, rather than the default row-major.
-  AddInputFromArray<quint8>(TensorShape({a_rows, a_cols}),
-                            {
-                                11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
-                            });
+  AddInputFromArray<quint8>(TensorShape({a_rows, a_cols}), {
+                                                               11,
+                                                               10,
+                                                               9,
+                                                               8,
+                                                               7,
+                                                               6,
+                                                               5,
+                                                               4,
+                                                               3,
+                                                               2,
+                                                               1,
+                                                               0,
+                                                           });
 
   // The B matrix is:
   // |   1 |   4|
   // |   2 |   5|
   // |   3 |   6|
   AddInputFromArray<quint8>(TensorShape({b_rows, b_cols}), {
-                                                               1, 4, 2, 5, 3, 6,
+                                                               1,
+                                                               4,
+                                                               2,
+                                                               5,
+                                                               3,
+                                                               6,
                                                            });
   AddInputFromArray<float>(TensorShape({1}), {-12.0f});
   AddInputFromArray<float>(TensorShape({1}), {243.0f});
@@ -238,10 +253,16 @@ TEST_F(QuantizedMatMulTest, Small_WithParams) {
   // |  -50 | -113 |
   // |  -56 | -128 |
   Tensor expected(allocator(), DT_QINT32, TensorShape({a_cols, b_cols}));
-  test::FillValues<qint32>(&expected,
-                           {
-                               -38, -83, -44, -98, -50, -113, -56, -128,
-                           });
+  test::FillValues<qint32>(&expected, {
+                                          -38,
+                                          -83,
+                                          -44,
+                                          -98,
+                                          -50,
+                                          -113,
+                                          -56,
+                                          -128,
+                                      });
   test::ExpectTensorEqual<qint32>(expected, *GetOutput(0));
 }
 
diff --git a/tensorflow/core/kernels/quantized_mul_op.cc b/tensorflow/core/kernels/quantized_mul_op.cc
index eaa5e667f7d5681e886a5de9e64a055ec175cf1e..3c7536e037396c338663ce0136832acb87bef401 100644
--- a/tensorflow/core/kernels/quantized_mul_op.cc
+++ b/tensorflow/core/kernels/quantized_mul_op.cc
@@ -298,9 +298,8 @@ class QuantizedMulOp : public OpKernel {
       return;
     }
     Tensor* z;
-    OP_REQUIRES_OK(
-        context,
-        context->allocate_output(0, BCast::ToShape(bcast.output_shape()), &z));
+    OP_REQUIRES_OK(context, context->allocate_output(
+                                0, BCast::ToShape(bcast.output_shape()), &z));
 
     // Make sure that we have valid quantization ranges for the input buffers.
     // If the difference between the min and max is negative or zero, it makes
diff --git a/tensorflow/core/kernels/quantized_mul_op_test.cc b/tensorflow/core/kernels/quantized_mul_op_test.cc
index b0550c8260c0ec7e40eeab4e07a5ecaf4cb8e32b..a4e407c7a94c9c2e11808eeb4533be5c346fb6f4 100644
--- a/tensorflow/core/kernels/quantized_mul_op_test.cc
+++ b/tensorflow/core/kernels/quantized_mul_op_test.cc
@@ -188,11 +188,12 @@ void TestManualScalar() {
       10.0f, {1}, {10.0f}, -100.0f, 100.0f, {10},
       {10.0f, 20.0f, 30.0f, 40.0f, 50.0f, 60.0f, 70.0f, 80.0f, 90.0f, 100.0f},
       3.0f);
-  TestMul({1}, {10.0f}, -100.0f, 100.0f, {10},
-          {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f}, 0.0f,
-          10.0f, {10}, {10.0f, 20.0f, 30.0f, 40.0f, 50.0f, 60.0f, 70.0f, 80.0f,
-                        90.0f, 100.0f},
-          3.0f);
+  TestMul(
+      {1}, {10.0f}, -100.0f, 100.0f, {10},
+      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f}, 0.0f,
+      10.0f, {10},
+      {10.0f, 20.0f, 30.0f, 40.0f, 50.0f, 60.0f, 70.0f, 80.0f, 90.0f, 100.0f},
+      3.0f);
 }
 
 void TestScalar() {
diff --git a/tensorflow/core/kernels/queue_base.cc b/tensorflow/core/kernels/queue_base.cc
index 330d161c32bc1a48b671765cacc21618545fa71a..de495c19cba300fbd034cda01adfd0518548ce68 100644
--- a/tensorflow/core/kernels/queue_base.cc
+++ b/tensorflow/core/kernels/queue_base.cc
@@ -39,8 +39,8 @@ Status HandleSliceToElement(const Tensor& parent, Tensor* element,
     return errors::Internal(
         "HandleSliceToElement Cannot copy slice: number of elements does not "
         "match.  Shapes are: [element]: ",
-        element->shape().DebugString(), ", [parent slice]: ",
-        chip_shape.DebugString());
+        element->shape().DebugString(),
+        ", [parent slice]: ", chip_shape.DebugString());
   }
   auto parent_as_matrix = parent.flat_outer_dims<T>();
   element->flat<T>() = parent_as_matrix.chip(index, 0);
diff --git a/tensorflow/core/kernels/queue_ops.cc b/tensorflow/core/kernels/queue_ops.cc
index 17831b74370bcd21cf7772f0ea6809ee840511c3..46a02854d732d6da657414a4e42b535f72ea7b64 100644
--- a/tensorflow/core/kernels/queue_ops.cc
+++ b/tensorflow/core/kernels/queue_ops.cc
@@ -428,13 +428,14 @@ REGISTER_KERNEL_BUILDER(Name("QueueSizeV2").Device(DEVICE_CPU), QueueSizeOp);
 class QueueIsClosedOp : public QueueOpKernel {
  public:
   explicit QueueIsClosedOp(OpKernelConstruction* context)
-     : QueueOpKernel(context) {}
+      : QueueOpKernel(context) {}
 
  protected:
   void ComputeAsync(OpKernelContext* ctx, QueueInterface* queue,
                     DoneCallback callback) override {
     Tensor* Tqueue_is_closed = nullptr;
-    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &Tqueue_is_closed));
+    OP_REQUIRES_OK(ctx,
+                   ctx->allocate_output(0, TensorShape({}), &Tqueue_is_closed));
     Tqueue_is_closed->flat<bool>().setConstant(queue->is_closed());
     callback();
   }
@@ -443,8 +444,10 @@ class QueueIsClosedOp : public QueueOpKernel {
   TF_DISALLOW_COPY_AND_ASSIGN(QueueIsClosedOp);
 };
 
-REGISTER_KERNEL_BUILDER(Name("QueueIsClosed").Device(DEVICE_CPU), QueueIsClosedOp);
-REGISTER_KERNEL_BUILDER(Name("QueueIsClosedV2").Device(DEVICE_CPU), QueueIsClosedOp);
+REGISTER_KERNEL_BUILDER(Name("QueueIsClosed").Device(DEVICE_CPU),
+                        QueueIsClosedOp);
+REGISTER_KERNEL_BUILDER(Name("QueueIsClosedV2").Device(DEVICE_CPU),
+                        QueueIsClosedOp);
 
 class FakeQueueOp : public OpKernel {
  public:
diff --git a/tensorflow/core/kernels/random_crop_op.cc b/tensorflow/core/kernels/random_crop_op.cc
index ba94d6be5caff7245e08ca22b5f057e81f30db74..554909760aa8a6bebe7e2988cd995f9373e1cc33 100644
--- a/tensorflow/core/kernels/random_crop_op.cc
+++ b/tensorflow/core/kernels/random_crop_op.cc
@@ -68,10 +68,10 @@ class RandomCropOp : public OpKernel {
     // Edge case. The target dimensions are larger then the image, so
     // zero-pad the image. This guarantees that the image will *always*
     // be [target_height, target_width] in size.
-    OP_REQUIRES(
-        context, width >= target_width,
-        errors::FailedPrecondition("width must be >= target_width: width = ",
-                                   width, ", target_width = ", target_width));
+    OP_REQUIRES(context, width >= target_width,
+                errors::FailedPrecondition(
+                    "width must be >= target_width: width = ", width,
+                    ", target_width = ", target_width));
     OP_REQUIRES(context, height >= target_height,
                 errors::FailedPrecondition(
                     "height must be >= target_height: height = ", height,
diff --git a/tensorflow/core/kernels/random_op.cc b/tensorflow/core/kernels/random_op.cc
index 55a8b9c9b67455483689a135306017bed8974ade..78ff7948fbf1b6406b2faca1d94acd7ea3325437 100644
--- a/tensorflow/core/kernels/random_op.cc
+++ b/tensorflow/core/kernels/random_op.cc
@@ -50,7 +50,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 namespace functor {
 using random::PhiloxRandom;
@@ -271,9 +271,10 @@ class RandomGammaOp : public OpKernel {
     const Tensor& shape_t = ctx->input(0);
     const Tensor& alpha_t = ctx->input(1);
 
-    OP_REQUIRES(ctx, TensorShapeUtils::IsVector(shape_t.shape()) &&
-                         (shape_t.dtype() == DataType::DT_INT32 ||
-                          shape_t.dtype() == DataType::DT_INT64),
+    OP_REQUIRES(ctx,
+                TensorShapeUtils::IsVector(shape_t.shape()) &&
+                    (shape_t.dtype() == DataType::DT_INT32 ||
+                     shape_t.dtype() == DataType::DT_INT64),
                 errors::InvalidArgument(
                     "shape must be a vector of {int32,int64}, got shape: ",
                     shape_t.DebugString()));
@@ -325,7 +326,7 @@ class RandomGammaOp : public OpKernel {
     // avoid a couple flops which can be done on a per-alpha basis.
 
     auto DoWork = [num_samples, num_alphas, &rng, samples_flat, alpha_flat](
-        int start_output, int limit_output) {
+                      int start_output, int limit_output) {
       using Eigen::numext::exp;
       using Eigen::numext::log;
       using Eigen::numext::pow;
@@ -448,40 +449,40 @@ class RandomGammaOp : public OpKernel {
 
 }  // namespace
 
-#define REGISTER(TYPE)                                                      \
-  template struct functor::FillPhiloxRandom<                                \
-      CPUDevice, random::UniformDistribution<random::PhiloxRandom, TYPE> >; \
-  template struct functor::FillPhiloxRandom<                                \
-      CPUDevice, random::NormalDistribution<random::PhiloxRandom, TYPE> >;  \
-  template struct functor::FillPhiloxRandom<                                \
-      CPUDevice,                                                            \
-      random::TruncatedNormalDistribution<                                  \
-          random::SingleSampleAdapter<random::PhiloxRandom>, TYPE> >;       \
-  REGISTER_KERNEL_BUILDER(                                                  \
-      Name("RandomUniform")                                                 \
-          .Device(DEVICE_CPU)                                               \
-          .HostMemory("shape")                                              \
-          .TypeConstraint<TYPE>("dtype"),                                   \
-      PhiloxRandomOp<CPUDevice, random::UniformDistribution<                \
-                                    random::PhiloxRandom, TYPE> >);         \
-  REGISTER_KERNEL_BUILDER(                                                  \
-      Name("RandomStandardNormal")                                          \
-          .Device(DEVICE_CPU)                                               \
-          .HostMemory("shape")                                              \
-          .TypeConstraint<TYPE>("dtype"),                                   \
-      PhiloxRandomOp<CPUDevice, random::NormalDistribution<                 \
-                                    random::PhiloxRandom, TYPE> >);         \
-  REGISTER_KERNEL_BUILDER(                                                  \
-      Name("TruncatedNormal")                                               \
-          .Device(DEVICE_CPU)                                               \
-          .HostMemory("shape")                                              \
-          .TypeConstraint<TYPE>("dtype"),                                   \
-      PhiloxRandomOp<                                                       \
-          CPUDevice,                                                        \
-          random::TruncatedNormalDistribution<                              \
-              random::SingleSampleAdapter<random::PhiloxRandom>, TYPE> >);  \
-  REGISTER_KERNEL_BUILDER(                                                  \
-      Name("RandomGamma").Device(DEVICE_CPU).TypeConstraint<TYPE>("T"),     \
+#define REGISTER(TYPE)                                                         \
+  template struct functor::FillPhiloxRandom<                                   \
+      CPUDevice, random::UniformDistribution<random::PhiloxRandom, TYPE>>;     \
+  template struct functor::FillPhiloxRandom<                                   \
+      CPUDevice, random::NormalDistribution<random::PhiloxRandom, TYPE>>;      \
+  template struct functor::FillPhiloxRandom<                                   \
+      CPUDevice,                                                               \
+      random::TruncatedNormalDistribution<                                     \
+          random::SingleSampleAdapter<random::PhiloxRandom>, TYPE>>;           \
+  REGISTER_KERNEL_BUILDER(                                                     \
+      Name("RandomUniform")                                                    \
+          .Device(DEVICE_CPU)                                                  \
+          .HostMemory("shape")                                                 \
+          .TypeConstraint<TYPE>("dtype"),                                      \
+      PhiloxRandomOp<CPUDevice, random::UniformDistribution<                   \
+                                    random::PhiloxRandom, TYPE>>);             \
+  REGISTER_KERNEL_BUILDER(                                                     \
+      Name("RandomStandardNormal")                                             \
+          .Device(DEVICE_CPU)                                                  \
+          .HostMemory("shape")                                                 \
+          .TypeConstraint<TYPE>("dtype"),                                      \
+      PhiloxRandomOp<CPUDevice,                                                \
+                     random::NormalDistribution<random::PhiloxRandom, TYPE>>); \
+  REGISTER_KERNEL_BUILDER(                                                     \
+      Name("TruncatedNormal")                                                  \
+          .Device(DEVICE_CPU)                                                  \
+          .HostMemory("shape")                                                 \
+          .TypeConstraint<TYPE>("dtype"),                                      \
+      PhiloxRandomOp<                                                          \
+          CPUDevice,                                                           \
+          random::TruncatedNormalDistribution<                                 \
+              random::SingleSampleAdapter<random::PhiloxRandom>, TYPE>>);      \
+  REGISTER_KERNEL_BUILDER(                                                     \
+      Name("RandomGamma").Device(DEVICE_CPU).TypeConstraint<TYPE>("T"),        \
       RandomGammaOp<TYPE>)
 
 #define REGISTER_INT(IntType)                                   \
@@ -504,33 +505,33 @@ TF_CALL_int64(REGISTER_INT);
 
 #if GOOGLE_CUDA
 
-#define REGISTER(TYPE)                                              \
-  REGISTER_KERNEL_BUILDER(                                          \
-      Name("RandomUniform")                                         \
-          .Device(DEVICE_GPU)                                       \
-          .HostMemory("shape")                                      \
-          .TypeConstraint<int32>("T")                               \
-          .TypeConstraint<TYPE>("dtype"),                           \
-      PhiloxRandomOp<GPUDevice, random::UniformDistribution<        \
-                                    random::PhiloxRandom, TYPE> >); \
-  REGISTER_KERNEL_BUILDER(                                          \
-      Name("RandomStandardNormal")                                  \
-          .Device(DEVICE_GPU)                                       \
-          .HostMemory("shape")                                      \
-          .TypeConstraint<int32>("T")                               \
-          .TypeConstraint<TYPE>("dtype"),                           \
-      PhiloxRandomOp<GPUDevice, random::NormalDistribution<         \
-                                    random::PhiloxRandom, TYPE> >); \
-  REGISTER_KERNEL_BUILDER(                                          \
-      Name("TruncatedNormal")                                       \
-          .Device(DEVICE_GPU)                                       \
-          .HostMemory("shape")                                      \
-          .TypeConstraint<int32>("T")                               \
-          .TypeConstraint<TYPE>("dtype"),                           \
-      PhiloxRandomOp<                                               \
-          GPUDevice,                                                \
-          random::TruncatedNormalDistribution<                      \
-              random::SingleSampleAdapter<random::PhiloxRandom>, TYPE> >);
+#define REGISTER(TYPE)                                                         \
+  REGISTER_KERNEL_BUILDER(                                                     \
+      Name("RandomUniform")                                                    \
+          .Device(DEVICE_GPU)                                                  \
+          .HostMemory("shape")                                                 \
+          .TypeConstraint<int32>("T")                                          \
+          .TypeConstraint<TYPE>("dtype"),                                      \
+      PhiloxRandomOp<GPUDevice, random::UniformDistribution<                   \
+                                    random::PhiloxRandom, TYPE>>);             \
+  REGISTER_KERNEL_BUILDER(                                                     \
+      Name("RandomStandardNormal")                                             \
+          .Device(DEVICE_GPU)                                                  \
+          .HostMemory("shape")                                                 \
+          .TypeConstraint<int32>("T")                                          \
+          .TypeConstraint<TYPE>("dtype"),                                      \
+      PhiloxRandomOp<GPUDevice,                                                \
+                     random::NormalDistribution<random::PhiloxRandom, TYPE>>); \
+  REGISTER_KERNEL_BUILDER(                                                     \
+      Name("TruncatedNormal")                                                  \
+          .Device(DEVICE_GPU)                                                  \
+          .HostMemory("shape")                                                 \
+          .TypeConstraint<int32>("T")                                          \
+          .TypeConstraint<TYPE>("dtype"),                                      \
+      PhiloxRandomOp<                                                          \
+          GPUDevice,                                                           \
+          random::TruncatedNormalDistribution<                                 \
+              random::SingleSampleAdapter<random::PhiloxRandom>, TYPE>>);
 
 #define REGISTER_INT(IntType)                                   \
   REGISTER_KERNEL_BUILDER(Name("RandomUniformInt")              \
@@ -565,13 +566,12 @@ struct FillPhiloxRandomKernel;
 template <class Distribution>
 struct FillPhiloxRandomKernel<Distribution, false> {
   typedef typename Distribution::ResultElementType T;
-  using write_accessor = sycl::accessor<uint8_t, 1, sycl::access::mode::write, sycl::access::target::global_buffer>;
+  using write_accessor = sycl::accessor<uint8_t, 1, sycl::access::mode::write,
+                                        sycl::access::target::global_buffer>;
 
-  FillPhiloxRandomKernel(write_accessor& data, random::PhiloxRandom& gen, Distribution& dist)
-      : data_(data),
-        gen_(gen),
-        dist_(dist) {
-  }
+  FillPhiloxRandomKernel(write_accessor& data, random::PhiloxRandom& gen,
+                         Distribution& dist)
+      : data_(data), gen_(gen), dist_(dist) {}
 
   void operator()(sycl::nd_item<1> item) {
     const size_t kGroupSize = Distribution::kResultElementCount;
@@ -597,7 +597,7 @@ struct FillPhiloxRandomKernel<Distribution, false> {
     const typename Distribution::ResultType samples = dist_(&gen_);
     for (size_t i = 0; i < kGroupSize; ++i) {
       if (offset >= size) {
-          return;
+        return;
       }
       data[offset] = samples[i];
       ++offset;
@@ -610,17 +610,15 @@ struct FillPhiloxRandomKernel<Distribution, false> {
   Distribution dist_;
 };
 
-
 template <class Distribution>
 struct FillPhiloxRandomKernel<Distribution, true> {
   typedef typename Distribution::ResultElementType T;
-  using write_accessor = sycl::accessor<uint8_t, 1, sycl::access::mode::write, sycl::access::target::global_buffer>;
+  using write_accessor = sycl::accessor<uint8_t, 1, sycl::access::mode::write,
+                                        sycl::access::target::global_buffer>;
 
-  FillPhiloxRandomKernel(write_accessor& data, random::PhiloxRandom& gen, Distribution& dist)
-      : data_(data),
-        gen_(gen),
-        dist_(dist) {
-  }
+  FillPhiloxRandomKernel(write_accessor& data, random::PhiloxRandom& gen,
+                         Distribution& dist)
+      : data_(data), gen_(gen), dist_(dist) {}
 
   void operator()(sycl::nd_item<1> item) {
     using random::PhiloxRandom;
@@ -628,9 +626,9 @@ struct FillPhiloxRandomKernel<Distribution, true> {
 
     const size_t kReservedSamplesPerOutput = 256;
     const size_t kGroupSize = Distribution::kResultElementCount;
-    const size_t kGeneratorSkipPerOutputGroup = kGroupSize *
-                                                kReservedSamplesPerOutput /
-                                                PhiloxRandom::kResultElementCount;
+    const size_t kGeneratorSkipPerOutputGroup =
+        kGroupSize * kReservedSamplesPerOutput /
+        PhiloxRandom::kResultElementCount;
 
     const size_t item_id = item.get_global(0);
     const size_t total_item_count = item.get_global_range();
@@ -674,10 +672,9 @@ class FillRandomKernel;
 // It splits the work into several tasks and run them in parallel
 template <class Distribution>
 void FillPhiloxRandom<SYCLDevice, Distribution>::operator()(
-    OpKernelContext* context, const SYCLDevice& device, random::PhiloxRandom gen,
-    typename Distribution::ResultElementType* data, int64 size,
-    Distribution dist) {
-
+    OpKernelContext* context, const SYCLDevice& device,
+    random::PhiloxRandom gen, typename Distribution::ResultElementType* data,
+    int64 size, Distribution dist) {
   const size_t group_size = device.maxSyclThreadsPerBlock();
   const size_t group_count = (size + group_size - 1) / group_size;
 
@@ -686,50 +683,52 @@ void FillPhiloxRandom<SYCLDevice, Distribution>::operator()(
   device.sycl_queue().submit([&](sycl::handler& cgh) {
     auto access = buffer.template get_access<sycl::access::mode::write>(cgh);
 
-    FillPhiloxRandomKernel<Distribution, Distribution::kVariableSamplesPerOutput> task(access, gen, dist);
+    FillPhiloxRandomKernel<Distribution,
+                           Distribution::kVariableSamplesPerOutput>
+        task(access, gen, dist);
     cgh.parallel_for<class FillRandomKernel<Distribution>>(
-      sycl::nd_range<1>(sycl::range<1>(group_count * group_size), sycl::range<1>(group_size)),
-      task
-    );
+        sycl::nd_range<1>(sycl::range<1>(group_count * group_size),
+                          sycl::range<1>(group_size)),
+        task);
   });
 }
 
-}
+}  // namespace functor
+
+#define REGISTER(TYPE)                                                         \
+  template struct functor::FillPhiloxRandom<                                   \
+      SYCLDevice, random::UniformDistribution<random::PhiloxRandom, TYPE>>;    \
+  REGISTER_KERNEL_BUILDER(                                                     \
+      Name("RandomUniform")                                                    \
+          .Device(DEVICE_SYCL)                                                 \
+          .HostMemory("shape")                                                 \
+          .TypeConstraint<TYPE>("dtype"),                                      \
+      PhiloxRandomOp<SYCLDevice, random::UniformDistribution<                  \
+                                     random::PhiloxRandom, TYPE>>);            \
+  REGISTER_KERNEL_BUILDER(                                                     \
+      Name("RandomStandardNormal")                                             \
+          .Device(DEVICE_SYCL)                                                 \
+          .HostMemory("shape")                                                 \
+          .TypeConstraint<TYPE>("dtype"),                                      \
+      PhiloxRandomOp<SYCLDevice,                                               \
+                     random::NormalDistribution<random::PhiloxRandom, TYPE>>); \
+  REGISTER_KERNEL_BUILDER(                                                     \
+      Name("TruncatedNormal")                                                  \
+          .Device(DEVICE_SYCL)                                                 \
+          .HostMemory("shape")                                                 \
+          .TypeConstraint<TYPE>("dtype"),                                      \
+      PhiloxRandomOp<                                                          \
+          SYCLDevice,                                                          \
+          random::TruncatedNormalDistribution<                                 \
+              random::SingleSampleAdapter<random::PhiloxRandom>, TYPE>>);
 
-#define REGISTER(TYPE)                                                       \
-  template struct functor::FillPhiloxRandom<                                 \
-      SYCLDevice, random::UniformDistribution<random::PhiloxRandom, TYPE> >; \
-  REGISTER_KERNEL_BUILDER(                                                   \
-      Name("RandomUniform")                                                  \
-          .Device(DEVICE_SYCL)                                               \
-          .HostMemory("shape")                                               \
-          .TypeConstraint<TYPE>("dtype"),                                    \
-      PhiloxRandomOp<SYCLDevice, random::UniformDistribution<                \
-                                    random::PhiloxRandom, TYPE> >);          \
-  REGISTER_KERNEL_BUILDER(                                                   \
-      Name("RandomStandardNormal")                                           \
-          .Device(DEVICE_SYCL)                                               \
-          .HostMemory("shape")                                               \
-          .TypeConstraint<TYPE>("dtype"),                                    \
-      PhiloxRandomOp<SYCLDevice, random::NormalDistribution<                 \
-                                    random::PhiloxRandom, TYPE> >);          \
-  REGISTER_KERNEL_BUILDER(                                                   \
-      Name("TruncatedNormal")                                                \
-          .Device(DEVICE_SYCL)                                               \
-          .HostMemory("shape")                                               \
-          .TypeConstraint<TYPE>("dtype"),                                    \
-      PhiloxRandomOp<                                                        \
-          SYCLDevice,                                                        \
-          random::TruncatedNormalDistribution<                               \
-              random::SingleSampleAdapter<random::PhiloxRandom>, TYPE> >);
-
-#define REGISTER_INT(IntType)                                    \
-  REGISTER_KERNEL_BUILDER(Name("RandomUniformInt")               \
-                              .Device(DEVICE_SYCL)               \
-                              .HostMemory("shape")               \
-                              .HostMemory("minval")              \
-                              .HostMemory("maxval")              \
-                              .TypeConstraint<IntType>("Tout"),  \
+#define REGISTER_INT(IntType)                                   \
+  REGISTER_KERNEL_BUILDER(Name("RandomUniformInt")              \
+                              .Device(DEVICE_SYCL)              \
+                              .HostMemory("shape")              \
+                              .HostMemory("minval")             \
+                              .HostMemory("maxval")             \
+                              .TypeConstraint<IntType>("Tout"), \
                           RandomUniformIntOp<SYCLDevice, IntType>);
 
 TF_CALL_float(REGISTER);
@@ -740,6 +739,6 @@ TF_CALL_int64(REGISTER_INT);
 #undef REGISTER
 #undef REGISTER_INT
 
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/random_op_gpu.cu.cc b/tensorflow/core/kernels/random_op_gpu.cu.cc
index 7afa6974c6a9389782fbbcd39ddede2a97ecd566..3393b39faf4a25791b48af99a5e474f3e9bfbfce 100644
--- a/tensorflow/core/kernels/random_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/random_op_gpu.cu.cc
@@ -222,9 +222,8 @@ void FillPhiloxRandom<GPUDevice, Distribution>::operator()(
       (d.getNumCudaMultiProcessors() * d.maxCudaThreadsPerMultiProcessor()) /
       block_size;
 
-  FillPhiloxRandomKernelLaunch<
-      Distribution><<<num_blocks, block_size, 0, d.stream()>>>(gen, data, size,
-                                                               dist);
+  FillPhiloxRandomKernelLaunch<Distribution>
+      <<<num_blocks, block_size, 0, d.stream()>>>(gen, data, size, dist);
 };
 
 // Explicit instantiation of the GPU distributions functors
diff --git a/tensorflow/core/kernels/random_poisson_op.cc b/tensorflow/core/kernels/random_poisson_op.cc
index bf1d83ec7517d1bcfa9b88b482b983e6a2d3f7c4..64fb4a5c22848009743af6a577c719f206f022bb 100644
--- a/tensorflow/core/kernels/random_poisson_op.cc
+++ b/tensorflow/core/kernels/random_poisson_op.cc
@@ -103,7 +103,7 @@ struct PoissonFunctor<CPUDevice, T, U> {
     typedef random::UniformDistribution<random::PhiloxRandom, CT> Uniform;
 
     auto DoWork = [num_samples, num_rate, &rng, samples_flat, rate_flat](
-        int start_output, int limit_output) {
+                      int start_output, int limit_output) {
       // Capturing "rng" by value would only make a copy for the _shared_
       // lambda.  Since we want to let each worker have its own copy, we pass
       // "rng" by reference and explicitly do a copy assignment.
diff --git a/tensorflow/core/kernels/random_shuffle_queue_op.cc b/tensorflow/core/kernels/random_shuffle_queue_op.cc
index e9695cfde30945c9c99db85f33e44030e5d45054..87fc94333162c4b721fa3608f282bf9d28fc792e 100644
--- a/tensorflow/core/kernels/random_shuffle_queue_op.cc
+++ b/tensorflow/core/kernels/random_shuffle_queue_op.cc
@@ -334,96 +334,95 @@ void RandomShuffleQueue::TryDequeueMany(int num_elements, OpKernelContext* ctx,
       // TODO(josh11b): This makes two copies of callback, avoid this if possible.
       dequeue_attempts_.emplace_back(
           num_elements, [callback]() { callback(Tuple()); }, ctx, cm, token,
-          [callback, allow_small_batch, this](Attempt* attempt)
-              EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-                int32 queue_size = queues_[0].size();
-                if (closed_ && queue_size < attempt->elements_requested) {
-                  // If we don't have enough for a full dequeue, we have
-                  // to reset the attempt tuple.
-                  if (!attempt->tuple.empty()) {
-                    // Restore already-dequeued elements to the queue.
-                    for (int64 i = attempt->tuple[0].dim_size(0) -
-                                   attempt->elements_requested - 1;
-                         i >= 0; --i) {
-                      for (int j = 0; j < num_components(); ++j) {
-                        PersistentTensor element;
-                        Status s = GetElementComponentFromBatch(
-                            attempt->tuple, i, j, attempt->context, &element);
-                        if (!s.ok()) {
-                          attempt->context->SetStatus(
-                              errors::DataLoss("Failed to restore element from "
-                                               "partially-dequeued batch "
-                                               "to RandomShuffleQueue: ",
-                                               s.error_message()));
-                        }
-                        queues_[j].push_back(element);
-                      }
-                    }
-                  }
-                  if (allow_small_batch && !queues_[0].empty()) {
-                    // Request all remaining elements in the queue.
-                    queue_size = queues_[0].size();
-                    attempt->tuple.clear();
-                    attempt->elements_requested = queue_size;
-                  } else {
-                    if (allow_small_batch) {
-                      // There may be some other attempts containing
-                      // values.  If so, we'll yield and wait for them
-                      // to add elements to the queue.
-                      if (!enqueue_attempts_.empty()) return kProgress;
-                    }
-                    if (attempt->context->status().ok()) {
-                      attempt->context->SetStatus(errors::OutOfRange(
-                          "RandomShuffleQueue '", name_, "' is closed and has ",
-                          "insufficient elements (requested ",
-                          attempt->elements_requested, ", current size ",
-                          queue_size, ")"));
+          [callback, allow_small_batch,
+           this](Attempt* attempt) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+            int32 queue_size = queues_[0].size();
+            if (closed_ && queue_size < attempt->elements_requested) {
+              // If we don't have enough for a full dequeue, we have
+              // to reset the attempt tuple.
+              if (!attempt->tuple.empty()) {
+                // Restore already-dequeued elements to the queue.
+                for (int64 i = attempt->tuple[0].dim_size(0) -
+                               attempt->elements_requested - 1;
+                     i >= 0; --i) {
+                  for (int j = 0; j < num_components(); ++j) {
+                    PersistentTensor element;
+                    Status s = GetElementComponentFromBatch(
+                        attempt->tuple, i, j, attempt->context, &element);
+                    if (!s.ok()) {
+                      attempt->context->SetStatus(
+                          errors::DataLoss("Failed to restore element from "
+                                           "partially-dequeued batch "
+                                           "to RandomShuffleQueue: ",
+                                           s.error_message()));
                     }
-                    return kComplete;
+                    queues_[j].push_back(element);
                   }
                 }
+              }
+              if (allow_small_batch && !queues_[0].empty()) {
+                // Request all remaining elements in the queue.
+                queue_size = queues_[0].size();
+                attempt->tuple.clear();
+                attempt->elements_requested = queue_size;
+              } else {
+                if (allow_small_batch) {
+                  // There may be some other attempts containing
+                  // values.  If so, we'll yield and wait for them
+                  // to add elements to the queue.
+                  if (!enqueue_attempts_.empty()) return kProgress;
+                }
+                if (attempt->context->status().ok()) {
+                  attempt->context->SetStatus(errors::OutOfRange(
+                      "RandomShuffleQueue '", name_, "' is closed and has ",
+                      "insufficient elements (requested ",
+                      attempt->elements_requested, ", current size ",
+                      queue_size, ")"));
+                }
+                return kComplete;
+              }
+            }
 
-                RunResult result = kNoProgress;
-                if (!closed_) queue_size -= min_after_dequeue_;
-                for (; queue_size > 0; --queue_size) {
-                  if (attempt->tuple.empty()) {
-                    // Only allocate tuple when we have something to dequeue
-                    // so we don't use excessive memory when there are many
-                    // blocked dequeue attempts waiting.
-                    attempt->tuple.reserve(num_components());
-                    for (int i = 0; i < num_components(); ++i) {
-                      const TensorShape shape =
-                          ManyOutShape(i, attempt->elements_requested);
-                      Tensor element;
-                      attempt->context->SetStatus(
-                          attempt->context->allocate_temp(component_dtypes_[i],
-                                                          shape, &element));
-                      if (!attempt->context->status().ok()) return kComplete;
-                      attempt->tuple.emplace_back(element);
-                    }
-                  }
-                  result = kProgress;
-                  Tuple tuple;
-                  DequeueLocked(attempt->context, &tuple);
-                  const int index = attempt->tuple[0].dim_size(0) -
-                                    attempt->elements_requested;
-                  for (int i = 0; i < num_components(); ++i) {
-                    attempt->context->SetStatus(batch_util::CopyElementToSlice(
-                        std::move(tuple[i]), &attempt->tuple[i], index));
-                    if (!attempt->context->status().ok()) return kComplete;
-                  }
-                  tuple.clear();
-                  --attempt->elements_requested;
-                  if (attempt->elements_requested == 0) {
-                    tuple = attempt->tuple;
-                    attempt->done_callback = [callback, tuple]() {
-                      callback(tuple);
-                    };
-                    return kComplete;
-                  }
+            RunResult result = kNoProgress;
+            if (!closed_) queue_size -= min_after_dequeue_;
+            for (; queue_size > 0; --queue_size) {
+              if (attempt->tuple.empty()) {
+                // Only allocate tuple when we have something to dequeue
+                // so we don't use excessive memory when there are many
+                // blocked dequeue attempts waiting.
+                attempt->tuple.reserve(num_components());
+                for (int i = 0; i < num_components(); ++i) {
+                  const TensorShape shape =
+                      ManyOutShape(i, attempt->elements_requested);
+                  Tensor element;
+                  attempt->context->SetStatus(attempt->context->allocate_temp(
+                      component_dtypes_[i], shape, &element));
+                  if (!attempt->context->status().ok()) return kComplete;
+                  attempt->tuple.emplace_back(element);
                 }
-                return result;
-              });
+              }
+              result = kProgress;
+              Tuple tuple;
+              DequeueLocked(attempt->context, &tuple);
+              const int index =
+                  attempt->tuple[0].dim_size(0) - attempt->elements_requested;
+              for (int i = 0; i < num_components(); ++i) {
+                attempt->context->SetStatus(batch_util::CopyElementToSlice(
+                    std::move(tuple[i]), &attempt->tuple[i], index));
+                if (!attempt->context->status().ok()) return kComplete;
+              }
+              tuple.clear();
+              --attempt->elements_requested;
+              if (attempt->elements_requested == 0) {
+                tuple = attempt->tuple;
+                attempt->done_callback = [callback, tuple]() {
+                  callback(tuple);
+                };
+                return kComplete;
+              }
+            }
+            return result;
+          });
     }
   }
   if (!already_cancelled) {
diff --git a/tensorflow/core/kernels/reduction_gpu_kernels.cu.h b/tensorflow/core/kernels/reduction_gpu_kernels.cu.h
index 36ca7f834f7b4fe7db1e2591189b1359231c7307..15ae4c1fc53b2b9bfe1d6085d2ecbc3659705b47 100644
--- a/tensorflow/core/kernels/reduction_gpu_kernels.cu.h
+++ b/tensorflow/core/kernels/reduction_gpu_kernels.cu.h
@@ -312,8 +312,7 @@ __global__ void ColumnReduceKernel(
   int col = blockIdx.x * 32 + threadIdx.x;
 
   value_type sum = initVal;
-  if (row < num_rows && col < num_cols)
-    sum = in[row * num_cols + col];
+  if (row < num_rows && col < num_cols) sum = in[row * num_cols + col];
 
   // 1D array necessary due to bug in CUDA 9 compiler.
   // TODO(nluehr) revert to 2D array when compiler is ready.
@@ -366,8 +365,7 @@ __global__ void CleanupSegments(
   const int tid = threadIdx.x + blockIdx.x * blockDim.x;
 
   value_type val = initVal;
-  if (tid < segment_size * num_cols)
-    val = partial_sums[tid];
+  if (tid < segment_size * num_cols) val = partial_sums[tid];
 
   typedef cub::WarpReduce<value_type> WarpReduce;
 
diff --git a/tensorflow/core/kernels/relu_op.cc b/tensorflow/core/kernels/relu_op.cc
index afad288cc00e0c3934318834d8dae8c181541212..d52358737fd121398ff2a4c95e417fd9b80987ab 100644
--- a/tensorflow/core/kernels/relu_op.cc
+++ b/tensorflow/core/kernels/relu_op.cc
@@ -31,7 +31,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 #define REGISTER_RELU_KERNELS(type)                                   \
   REGISTER_KERNEL_BUILDER(                                            \
@@ -113,8 +113,7 @@ namespace functor {
                                                                                \
   template <>                                                                  \
   void Selu<GPUDevice, T>::operator()(                                         \
-      const GPUDevice& d,                                                      \
-      typename TTypes<T>::ConstTensor features,                                \
+      const GPUDevice& d, typename TTypes<T>::ConstTensor features,            \
       typename TTypes<T>::Tensor activations);                                 \
   extern template struct Selu<GPUDevice, T>;                                   \
                                                                                \
@@ -125,8 +124,6 @@ namespace functor {
       typename TTypes<T>::Tensor backprops);                                   \
   extern template struct SeluGrad<GPUDevice, T>;
 
-
-
 TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC);
 }  // namespace functor
 
@@ -157,8 +154,6 @@ TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC);
       Name("SeluGrad").Device(DEVICE_GPU).TypeConstraint<type>("T"),  \
       SeluGradOp<GPUDevice, type>)
 
-
-
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS);
 #undef REGISTER_GPU_KERNELS
 
@@ -192,10 +187,8 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS);
       Name("SeluGrad").Device(DEVICE_SYCL).TypeConstraint<type>("T"),  \
       SeluGradOp<SYCLDevice, type>)
 
-
-
 TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL_KERNELS);
 #undef REGISTER_SYCL_KERNELS
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/relu_op_functor.h b/tensorflow/core/kernels/relu_op_functor.h
index 24b789c5437c78a76c708a6637b60376d5087682..3bc5ba8a50de22156aa631ee6404ddfe04b3a105 100644
--- a/tensorflow/core/kernels/relu_op_functor.h
+++ b/tensorflow/core/kernels/relu_op_functor.h
@@ -85,10 +85,9 @@ struct Relu6Grad {
     // make sure not to propagate the associated gradient
     // value. This allows "features" to be either the input or the output of
     // the relu6.
-    backprops.device(d) =
-        gradients *
-        ((features > static_cast<T>(0)) * (features < static_cast<T>(6)))
-            .template cast<T>();
+    backprops.device(d) = gradients * ((features > static_cast<T>(0)) *
+                                       (features < static_cast<T>(6)))
+                                          .template cast<T>();
   }
 };
 
@@ -161,8 +160,8 @@ struct SeluGrad {
     const auto scale = static_cast<T>(1.0507009873554804934193349852946);
     const auto scale_alpha = static_cast<T>(1.7580993408473768599402175208123);
     backprops.device(d) =
-        (activations < static_cast<T>(0)).select(
-            gradients * (activations + scale_alpha), gradients * scale);
+        (activations < static_cast<T>(0))
+            .select(gradients * (activations + scale_alpha), gradients * scale);
   }
 };
 
diff --git a/tensorflow/core/kernels/resize_bicubic_op.cc b/tensorflow/core/kernels/resize_bicubic_op.cc
index 1a9cf4c6406d85bf26b43e0b9b855760a4888a4c..86e61bbcefc1ad2b103552101c17a05c3c3ede6e 100644
--- a/tensorflow/core/kernels/resize_bicubic_op.cc
+++ b/tensorflow/core/kernels/resize_bicubic_op.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include <algorithm>
 #include <array>
 
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -28,7 +29,6 @@ limitations under the License.
 #include "tensorflow/core/kernels/image_resizer_state.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/logging.h"
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
 namespace tensorflow {
 namespace {
diff --git a/tensorflow/core/kernels/resize_bicubic_op_test.cc b/tensorflow/core/kernels/resize_bicubic_op_test.cc
index 9e10fec42321023d95f3ae8d32a5a1c8f2c7a94e..25a37d5e1af5835d56dedb50922967704500ad46 100644
--- a/tensorflow/core/kernels/resize_bicubic_op_test.cc
+++ b/tensorflow/core/kernels/resize_bicubic_op_test.cc
@@ -286,13 +286,14 @@ BM_ResizeBicubicDev(32, 128, 3);
 BM_ResizeBicubicDev(32, 512, 3);
 BM_ResizeBicubicDev(32, 1024, 3);
 
-#define BM_ResizeBicubicExpand(BATCH, SIZE, CHANNELS)                          \
-  static void BM_ResizeBicubicExpand##_##BATCH##_##SIZE##_##CHANNELS(int iters) { \
-    testing::ItemsProcessed(static_cast<int64>(iters) * BATCH * SIZE * SIZE *  \
-                            CHANNELS * 8 * 8);                                 \
-    test::Benchmark("cpu", ResizeBicubic(BATCH, SIZE, CHANNELS, 8, 8))         \
-        .Run(iters);                                                           \
-  }                                                                            \
+#define BM_ResizeBicubicExpand(BATCH, SIZE, CHANNELS)                         \
+  static void BM_ResizeBicubicExpand##_##BATCH##_##SIZE##_##CHANNELS(         \
+      int iters) {                                                            \
+    testing::ItemsProcessed(static_cast<int64>(iters) * BATCH * SIZE * SIZE * \
+                            CHANNELS * 8 * 8);                                \
+    test::Benchmark("cpu", ResizeBicubic(BATCH, SIZE, CHANNELS, 8, 8))        \
+        .Run(iters);                                                          \
+  }                                                                           \
   BENCHMARK(BM_ResizeBicubicExpand##_##BATCH##_##SIZE##_##CHANNELS);
 
 BM_ResizeBicubicExpand(12, 48, 1);
diff --git a/tensorflow/core/kernels/resize_bilinear_op_gpu.cu.cc b/tensorflow/core/kernels/resize_bilinear_op_gpu.cu.cc
index a7da7a0777d0cb35ade6a04dfff4edf604c1a169..f82c3fcd9ff45e26d2f44408890fa760c64477e4 100644
--- a/tensorflow/core/kernels/resize_bilinear_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/resize_bilinear_op_gpu.cu.cc
@@ -164,11 +164,11 @@ struct ResizeBilinear<GPUDevice, T> {
     if (total_count == 0) return;
 
     CudaLaunchConfig config = GetCudaLaunchConfig(total_count, d);
-    ResizeBilinearKernel<
-        T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-        config.virtual_thread_count, images.data(), height_scale, width_scale,
-        batch, in_height, in_width, channels, out_height, out_width,
-        output.data());
+    ResizeBilinearKernel<T>
+        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+            config.virtual_thread_count, images.data(), height_scale,
+            width_scale, batch, in_height, in_width, channels, out_height,
+            out_width, output.data());
   }
 };
 
@@ -200,11 +200,11 @@ struct ResizeBilinearGrad<GPUDevice, T> {
     // Accumulate.
     total_count = batch * resized_height * resized_width * channels;
     config = GetCudaLaunchConfig(total_count, d);
-    ResizeBilinearGradKernel<
-        T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-        config.virtual_thread_count, input_grad.data(), height_scale,
-        width_scale, batch, original_height, original_width, channels,
-        resized_height, resized_width, output_grad.data());
+    ResizeBilinearGradKernel<T>
+        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+            config.virtual_thread_count, input_grad.data(), height_scale,
+            width_scale, batch, original_height, original_width, channels,
+            resized_height, resized_width, output_grad.data());
   }
 };
 
diff --git a/tensorflow/core/kernels/reverse_op.cc b/tensorflow/core/kernels/reverse_op.cc
index 8f82784d936c05d64317e8f27dd8703502083b9b..bb96c42f10c498d0ec3d6a726728cb1e7bc8f111 100644
--- a/tensorflow/core/kernels/reverse_op.cc
+++ b/tensorflow/core/kernels/reverse_op.cc
@@ -269,10 +269,10 @@ class ReverseV2Op : public OpKernel {
       OP_REQUIRES_OK(context,
                      context->allocate_output(0, input.shape(), &output));
 
-// TODO(cwhipkey): we can do dimension folding to reduce, e.g., a reverse of
-// a single dimension to the dims=3 or dims=2 case, regardless of the number
-// of dimensions in the tensor. This would let some ops use faster
-// lower-dimension code (and use optimized versions).
+      // TODO(cwhipkey): we can do dimension folding to reduce, e.g., a reverse
+      // of a single dimension to the dims=3 or dims=2 case, regardless of the
+      // number of dimensions in the tensor. This would let some ops use faster
+      // lower-dimension code (and use optimized versions).
 
 #define HANDLE_REVERSE(NDIMS)                                           \
   case NDIMS:                                                           \
diff --git a/tensorflow/core/kernels/reverse_op_gpu.cu.cc b/tensorflow/core/kernels/reverse_op_gpu.cu.cc
index b05a7c5550438c6937745df5e58e81630361d64a..3ee49db669faaa85f2eff7a7f119725fc7170dea 100644
--- a/tensorflow/core/kernels/reverse_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/reverse_op_gpu.cu.cc
@@ -28,14 +28,14 @@ typedef Eigen::GpuDevice GPUDevice;
 #define DEFINE_REVERSE(T, DIM) \
   template struct functor::Reverse<GPUDevice, T, DIM>;
 #define DEFINE_REVERSE_ALL_DIMS(T) \
-  DEFINE_REVERSE(T, 0) \
-  DEFINE_REVERSE(T, 1) \
-  DEFINE_REVERSE(T, 2) \
-  DEFINE_REVERSE(T, 3) \
-  DEFINE_REVERSE(T, 4) \
-  DEFINE_REVERSE(T, 5) \
-  DEFINE_REVERSE(T, 6) \
-  DEFINE_REVERSE(T, 7) \
+  DEFINE_REVERSE(T, 0)             \
+  DEFINE_REVERSE(T, 1)             \
+  DEFINE_REVERSE(T, 2)             \
+  DEFINE_REVERSE(T, 3)             \
+  DEFINE_REVERSE(T, 4)             \
+  DEFINE_REVERSE(T, 5)             \
+  DEFINE_REVERSE(T, 6)             \
+  DEFINE_REVERSE(T, 7)             \
   DEFINE_REVERSE(T, 8)
 
 TF_CALL_uint8(DEFINE_REVERSE_ALL_DIMS);
diff --git a/tensorflow/core/kernels/reverse_sequence_op.cc b/tensorflow/core/kernels/reverse_sequence_op.cc
index d1980d4b652ecb507d8745bf64be2395d14920bb..15a707a9c6609e2ac5b790ea519f6c8e523067b1 100644
--- a/tensorflow/core/kernels/reverse_sequence_op.cc
+++ b/tensorflow/core/kernels/reverse_sequence_op.cc
@@ -51,8 +51,7 @@ void CheckErrors(OpKernelContext* context, int batch_dim, int seq_dim) {
 
   // Copy seq_len info down for validity checks
   context->eigen_device<Device>().memcpyDeviceToHost(
-      seq_lens_vec.data(), seq_lens_t.data(),
-      sizeof(Tlen) * seq_lens_t.size());
+      seq_lens_vec.data(), seq_lens_t.data(), sizeof(Tlen) * seq_lens_t.size());
 
   OP_REQUIRES(context, batch_dim != seq_dim,
               errors::InvalidArgument("batch_dim == seq_dim == ", seq_dim));
@@ -76,8 +75,7 @@ void CheckErrors(OpKernelContext* context, int batch_dim, int seq_dim) {
   }
 }
 
-void CheckErrorsGPU(OpKernelContext* context, int batch_dim,
-                            int seq_dim) {
+void CheckErrorsGPU(OpKernelContext* context, int batch_dim, int seq_dim) {
   const Tensor& input = context->input(0);
   const Tensor& seq_lens = context->input(1);
 
@@ -98,13 +96,13 @@ void CheckErrorsGPU(OpKernelContext* context, int batch_dim,
 
 template <>
 void CheckErrors<GPUDevice, int32>(OpKernelContext* context, int batch_dim,
-                            int seq_dim) {
+                                   int seq_dim) {
   CheckErrorsGPU(context, batch_dim, seq_dim);
 }
 
 template <>
 void CheckErrors<GPUDevice, int64>(OpKernelContext* context, int batch_dim,
-                            int seq_dim) {
+                                   int seq_dim) {
   CheckErrorsGPU(context, batch_dim, seq_dim);
 }
 
@@ -164,14 +162,15 @@ class ReverseSequenceOp : public OpKernel {
   TF_DISALLOW_COPY_AND_ASSIGN(ReverseSequenceOp);
 };
 
-#define REGISTER_REVERSE_SEQUENCE(type, len_type)                           \
-  REGISTER_KERNEL_BUILDER(                                                  \
-      Name("ReverseSequence").Device(DEVICE_CPU).TypeConstraint<type>("T"). \
-      TypeConstraint<len_type>("Tlen"),                                     \
-      ReverseSequenceOp<CPUDevice, type, len_type>);
+#define REGISTER_REVERSE_SEQUENCE(type, len_type)                \
+  REGISTER_KERNEL_BUILDER(Name("ReverseSequence")                \
+                              .Device(DEVICE_CPU)                \
+                              .TypeConstraint<type>("T")         \
+                              .TypeConstraint<len_type>("Tlen"), \
+                          ReverseSequenceOp<CPUDevice, type, len_type>);
 
-#define REGISTER_REVERSE_SEQUENCE_LEN(type)    \
-  REGISTER_REVERSE_SEQUENCE(type, int32);      \
+#define REGISTER_REVERSE_SEQUENCE_LEN(type) \
+  REGISTER_REVERSE_SEQUENCE(type, int32);   \
   REGISTER_REVERSE_SEQUENCE(type, int64);
 
 TF_CALL_NUMBER_TYPES(REGISTER_REVERSE_SEQUENCE_LEN);
@@ -181,23 +180,23 @@ TF_CALL_bool(REGISTER_REVERSE_SEQUENCE_LEN);
 
 // Forward declarations of the functor specializations for GPU.
 namespace functor {
-#define DECLARE_GPU_SPEC(T, Tlen, Dims)                                     \
-  template <>                                                               \
-  void ReverseSequence<GPUDevice, T, Tlen, Dims>::Compute(                  \
-      const GPUDevice& d, typename TTypes<T, Dims>::ConstTensor input,      \
-      int32 batch_dim, int32 seq_dim,                                       \
-      typename TTypes<Tlen>::ConstVec seq_lens,                             \
-      typename TTypes<T, Dims>::Tensor output);                             \
+#define DECLARE_GPU_SPEC(T, Tlen, Dims)                                \
+  template <>                                                          \
+  void ReverseSequence<GPUDevice, T, Tlen, Dims>::Compute(             \
+      const GPUDevice& d, typename TTypes<T, Dims>::ConstTensor input, \
+      int32 batch_dim, int32 seq_dim,                                  \
+      typename TTypes<Tlen>::ConstVec seq_lens,                        \
+      typename TTypes<T, Dims>::Tensor output);                        \
   extern template struct ReverseSequence<GPUDevice, T, Tlen, Dims>;
 
-#define DECLARE_GPU_SPEC_LEN(T, Dims)    \
-  DECLARE_GPU_SPEC(T, int32, Dims);      \
+#define DECLARE_GPU_SPEC_LEN(T, Dims) \
+  DECLARE_GPU_SPEC(T, int32, Dims);   \
   DECLARE_GPU_SPEC(T, int64, Dims);
 
-#define DECLARE_GPU_SPECS(T)     \
-  DECLARE_GPU_SPEC_LEN(T, 2);    \
-  DECLARE_GPU_SPEC_LEN(T, 3);    \
-  DECLARE_GPU_SPEC_LEN(T, 4);    \
+#define DECLARE_GPU_SPECS(T)  \
+  DECLARE_GPU_SPEC_LEN(T, 2); \
+  DECLARE_GPU_SPEC_LEN(T, 3); \
+  DECLARE_GPU_SPEC_LEN(T, 4); \
   DECLARE_GPU_SPEC_LEN(T, 5);
 
 TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPECS);
@@ -206,14 +205,15 @@ TF_CALL_bool(DECLARE_GPU_SPECS);
 }  // namespace functor
 
 // Registration of the GPU implementations.
-#define REGISTER_REVERSE_SEQUENCE_GPU(type, len_type)                       \
-  REGISTER_KERNEL_BUILDER(                                                  \
-      Name("ReverseSequence").Device(DEVICE_GPU).TypeConstraint<type>("T"). \
-      TypeConstraint<len_type>("Tlen"),                                     \
-      ReverseSequenceOp<GPUDevice, type, len_type>);
-
-#define REGISTER_REVERSE_SEQUENCE_GPU_LEN(type)   \
-  REGISTER_REVERSE_SEQUENCE_GPU(type, int32);     \
+#define REGISTER_REVERSE_SEQUENCE_GPU(type, len_type)            \
+  REGISTER_KERNEL_BUILDER(Name("ReverseSequence")                \
+                              .Device(DEVICE_GPU)                \
+                              .TypeConstraint<type>("T")         \
+                              .TypeConstraint<len_type>("Tlen"), \
+                          ReverseSequenceOp<GPUDevice, type, len_type>);
+
+#define REGISTER_REVERSE_SEQUENCE_GPU_LEN(type) \
+  REGISTER_REVERSE_SEQUENCE_GPU(type, int32);   \
   REGISTER_REVERSE_SEQUENCE_GPU(type, int64);
 
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_REVERSE_SEQUENCE_GPU_LEN);
diff --git a/tensorflow/core/kernels/reverse_sequence_op_gpu.cu.cc b/tensorflow/core/kernels/reverse_sequence_op_gpu.cu.cc
index cb49f14525a3c54ea46df47fb2edeaa9277dc2d3..4a2136a2cd37f4d549c62396d5e30616a306f84f 100644
--- a/tensorflow/core/kernels/reverse_sequence_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/reverse_sequence_op_gpu.cu.cc
@@ -28,14 +28,14 @@ typedef Eigen::GpuDevice GPUDevice;
   template class generator::ReverseGenerator<T, Tlen, dims>; \
   template struct functor::ReverseSequence<GPUDevice, T, Tlen, dims>;
 
-#define DEFINE_GPU_SPEC_LEN(T, dims)  \
-  DEFINE_GPU_SPEC(T, int32, dims);    \
+#define DEFINE_GPU_SPEC_LEN(T, dims) \
+  DEFINE_GPU_SPEC(T, int32, dims);   \
   DEFINE_GPU_SPEC(T, int64, dims);
 
-#define DEFINE_GPU_SPECS(T) \
-  DEFINE_GPU_SPEC_LEN(T, 2);    \
-  DEFINE_GPU_SPEC_LEN(T, 3);    \
-  DEFINE_GPU_SPEC_LEN(T, 4);    \
+#define DEFINE_GPU_SPECS(T)  \
+  DEFINE_GPU_SPEC_LEN(T, 2); \
+  DEFINE_GPU_SPEC_LEN(T, 3); \
+  DEFINE_GPU_SPEC_LEN(T, 4); \
   DEFINE_GPU_SPEC_LEN(T, 5);
 
 TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPECS);
diff --git a/tensorflow/core/kernels/roll_op.cc b/tensorflow/core/kernels/roll_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..bcbdbee058b4fdb587f2099c54545b8a6aec8ca9
--- /dev/null
+++ b/tensorflow/core/kernels/roll_op.cc
@@ -0,0 +1,334 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/register_types_traits.h"
+#include "tensorflow/core/framework/shape_inference.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/work_sharder.h"
+
+namespace tensorflow {
+
+#define EIGEN_USE_THREADS
+using CPUDevice = Eigen::ThreadPoolDevice;
+
+// dim_size - the size of each dimension
+// dim_range - the number of indices over in the flattened tensor
+//    you need to skip in order to make it over from one side of a dimension
+//    to the other. Used to make the shifts wrap around after a threshold.
+// threshold - the index for each dimension that the roll starts to wrap
+//    back to the front
+template <typename T>
+void DoRoll(OpKernelContext* context, const int64 num_elements,
+            const int num_dims, const gtl::ArraySlice<int>& dim_size,
+            const T* input, T* output, const gtl::ArraySlice<int>& threshold,
+            const gtl::ArraySlice<int64>& dim_range) {
+  auto work = [input, output, num_dims, &dim_size, &threshold, &dim_range](
+                  int64 start, int64 end) {
+    // array of indices for each dimension
+    gtl::InlinedVector<int, 4> indices(num_dims);
+    int offset = 0;  // the shift along the flattened tensor for current element
+    // initialize indices and offset
+    for (int i = 0; i < num_dims; i++) {
+      // stride is the number of indices over in the flattened tensor
+      // you need to skip in order to make it over to an adjacent element
+      // along a dimension. dim_size[i] != 0 because we set it to max(dim, 1)
+      const int64 stride = dim_range[i] / dim_size[i];
+      const int shift = dim_size[i] - threshold[i];
+      const int indx = (start / stride) % dim_size[i];
+      indices[i] = indx;
+      // calculate dimension index after the shift
+      const int shifted_indx = (indx + shift) % dim_size[i];
+      offset += (shifted_indx - indx) * stride;
+    }
+
+    for (int64 i = start; i < end; i++) {
+      output[i + offset] = input[i];
+      // create next combination of indices
+      // while at it adjust offset if needed
+      for (int j = num_dims - 1; j >= 0; j--) {
+        const int indx = (indices[j] + 1) % dim_size[j];
+        indices[j] = indx;
+        if (indx != 0) {
+          if (indx == threshold[j]) {  // we've reached the threshold
+            // dim_range[j] = threshold[j] + shift[j]
+            // offset = shift[j] + ... other offsets
+            // offset - dim_range[j] = -threshold[j] + ... other offsets
+            // thus we undo our previous offset as well as add a new offset of
+            // -threshold[j] in one operation
+            offset -= dim_range[j];  // now wraps around
+          }
+          break;                         // indx != 0 don't need to carry
+        } else if (threshold[j] != 0) {  // if threshold is 0 shift is 0
+          offset += dim_range[j];        // indx became 0 so reverse wrap around
+        }
+      }
+    }
+  };
+  // Shard
+  auto worker_threads = context->device()->tensorflow_cpu_worker_threads();
+  // 15 - expiramentally determined with float and bool types
+  const int cost_per_element = 15 * sizeof(T);  // rough esitmate
+  Shard(worker_threads->num_threads, worker_threads->workers, num_elements,
+        cost_per_element, std::move(work));
+}
+
+// dim_size - the size of each dimension
+// dim_range - the number of indices over in the flattened tensor
+//    you need to skip in order to make it over from one side of a dimension
+//    to the other. Used to make the shifts wrap around after a threshold.
+// threshold - the index for each dimension that the roll starts to wrap
+//    back to the front
+// isd - inner shift dimension
+template <typename T>
+// Use memcpy to copy memory in groups when the data type supports memcpy
+void DoRollWithMemcpy(OpKernelContext* context, const int64 num_elements,
+                      const int num_dims, const gtl::ArraySlice<int>& dim_size,
+                      const T* input, T* output,
+                      const gtl::ArraySlice<int>& threshold,
+                      const gtl::ArraySlice<int64>& dim_range,
+                      const int64 isd) {
+  auto work = [input, output, num_dims, &dim_size, &threshold, &dim_range, isd](
+                  int64 start, int64 end) {
+    // the number of indices over in the flattened tensor you need to skip in
+    // order to make it over from one side of the isd to the other
+    const int64 isd_range = std::max<int>(dim_range[isd], 1);
+    // the distance along the flattend tensor to the next element in the isd
+    const int64 isd_stride = isd_range / std::max<int>(dim_size[isd], 1);
+
+    // start and end represent the i-th group currently so we will convert
+    // them into numbers representing the i-th elements.
+    // there are 2 groups per isd one for all elements before threshold[isd]
+    // and another for all elements after threshold[isd].
+    const int64 start_remainder = (start % 2) * threshold[isd] * isd_stride;
+    const int64 end_remainder = (end % 2) * threshold[isd] * isd_stride;
+    start = (start / 2) * isd_range + start_remainder;
+    end = (end / 2) * isd_range + end_remainder;
+
+    const T* in_ptr = &input[0];
+    T* out_ptr = &output[0];
+    in_ptr += start;
+    out_ptr += start;
+
+    // array of indices for each dimension
+    // indicies = [i, j, k, l, m, n]
+    gtl::InlinedVector<int, 4> indicies(num_dims);
+    // the offset needed to make all inner non-shifting dimensions become 0
+    int64 remainder_offset = 0;
+    // initialize indicies
+    for (int i = 0; i < num_dims; i++) {
+      // stride is the number of indices over in the flattened tensor
+      // you need to skip in order to make it over to an adjacent element
+      // along a dimension. dim_size[i] != 0 because we set it to max(dim, 1)
+      const int64 stride = dim_range[i] / dim_size[i];
+      const int shift = dim_size[i] - threshold[i];
+      const int indx = (start / stride) % dim_size[i];
+      indicies[i] = indx;
+      // calculate dimension index after the shift
+      int out_indx = (indx + shift) % dim_size[i];
+      if (i > isd) {
+        // trailing zeroes for indices after the inner shifted dimension
+        out_indx = 0;
+        remainder_offset += (out_indx - indx) * stride;
+      }
+      out_ptr += (out_indx - indx) * stride;
+    }
+    // set trailing zeroes for indices after the inner shifted dimension
+    for (int i = num_dims - 1; i > isd; i--) indicies[i] = 0;
+
+    // the number of indices in the isd dimension the next group will skip
+    // to make it to the next threshold or end point
+    int isd_indx_skip = 0;
+    // the size of the next group
+    int64 group_size = 0;
+    // initialize isd_indx_skip and group_size
+    if (indicies[isd] < threshold[isd]) {
+      isd_indx_skip = threshold[isd] - indicies[isd];
+      group_size = isd_indx_skip * isd_stride + remainder_offset;
+    } else {
+      isd_indx_skip = dim_size[isd] - indicies[isd];
+      group_size = isd_indx_skip * isd_stride + remainder_offset;
+    }
+
+    int64 i = start;
+    while (i < end) {
+      // copy group of elements
+      memcpy(out_ptr, in_ptr, group_size * sizeof(T));
+
+      // shift i and the pointers over to the next group position
+      i += group_size;
+      out_ptr += group_size;
+      in_ptr += group_size;
+
+      // produce next combination of indices and adjust the out_ptr position
+      // to fix the offset if necessary
+      // the isd (inner shift dim) should skip to next threshold or endpoint
+      // all dimensions to the left increment by 1 when a digit is carried
+      // all dimensions to the right remain set to 0
+      //            +1 +1 +1 +isd_indx_skip
+      // indicies = [i, j, k, l, 0, 0]
+      //                      ^isd
+      for (int j = isd; j >= 0; j--) {
+        int inc = 1;
+        if (j == isd) inc = isd_indx_skip;
+        const int indx = (indicies[j] + inc) % dim_size[j];
+        indicies[j] = indx;
+        if (indx != 0) {
+          if (indx == threshold[j]) {
+            out_ptr -= dim_range[j];  // now wraps around
+          }
+          break;                         // indx != 0 don't need to carry
+        } else if (threshold[j] != 0) {  // if threshold is 0 shift is 0
+          out_ptr += dim_range[j];       // indx became 0 so reverse wrap around
+        }
+      }
+
+      // set isd_indx_skip and group_size for next iteration
+      if (indicies[isd] < threshold[isd]) {
+        isd_indx_skip = threshold[isd] - indicies[isd];
+        group_size = isd_indx_skip * isd_stride;
+      } else {
+        isd_indx_skip = dim_size[isd] - indicies[isd];
+        group_size = isd_indx_skip * isd_stride;
+      }
+    }
+  };
+  // Shard
+  auto worker_threads = context->device()->tensorflow_cpu_worker_threads();
+  const int64 ave_group_size = dim_range[isd] / 2;
+  const int total_work = 2 * num_elements / std::max<int>(dim_range[isd], 1);
+  // 25000 - expiramentally determined with float and bool types
+  const int cost_per_group = 25000 * sizeof(T) * ave_group_size;
+  Shard(worker_threads->num_threads, worker_threads->workers, total_work,
+        cost_per_group, std::move(work));
+}
+
+template <typename Device, typename T, typename Tshift, typename Taxis>
+class RollOp : public OpKernel {
+ public:
+  explicit RollOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    // Grab the input tensor
+    const Tensor& input = context->input(0);
+    const Tensor& shift = context->input(1);
+    const Tensor& axis = context->input(2);
+
+    auto shift_flat = shift.flat<Tshift>();
+    auto axis_flat = axis.flat<Taxis>();
+
+    OP_REQUIRES(context, TensorShapeUtils::IsVectorOrHigher(input.shape()),
+                errors::InvalidArgument("input must be 1-D or higher"));
+    OP_REQUIRES(context, shift.shape().dims() <= 1,
+                errors::InvalidArgument(
+                    "shift must be a scalar or a 1-D vector. Found: ",
+                    shift.shape().DebugString()));
+    OP_REQUIRES(context, axis.shape().dims() <= 1,
+                errors::InvalidArgument(
+                    "axis must be a scalar or a 1-D vector. Found: ",
+                    axis.shape().DebugString()));
+    OP_REQUIRES(
+        context, shift.shape() == axis.shape(),
+        errors::InvalidArgument("shift and axis must have the same size"));
+    const int64 num_elements = input.NumElements();
+    const int num_shifts = static_cast<int>(shift_flat.size());
+    const int num_dims = input.dims();
+
+    // if there are any duplicate axes, shift_mod_sum will have the
+    // total modulo sum of shifts for each dimension
+    gtl::InlinedVector<int, 4> shift_mod_sum(num_dims, 0);
+    for (int i = 0; i < num_shifts; i++) {
+      const int axis = axis_flat(i);
+      OP_REQUIRES(context, axis < num_dims,
+                  errors::InvalidArgument("axis ", axis, " is out of range"));
+      const int ds = std::max<int>(static_cast<int>(input.dim_size(axis)), 1);
+      const int sum = shift_mod_sum[axis] + static_cast<int>(shift_flat(i));
+      // modulo that works with negatives: ((x % y) + y) % y
+      shift_mod_sum[axis] = (sum % ds + ds) % ds;
+    }
+    // the size of each dimension
+    gtl::InlinedVector<int, 4> dim_size(num_dims);
+    // threshold[i] is the index that the roll starts to wrap back to the front
+    gtl::InlinedVector<int, 4> threshold(num_dims);
+    // dim_range is the number of indices over in the flattened tensor
+    // you need to skip in order to make it over from one side of a dimension
+    // to the other. Used to make the shifts wrap around after a threshold.
+    gtl::InlinedVector<int64, 4> dim_range(num_dims);
+    int64 dim_size_prod = 1;  // dimension size product
+    // inner shift dimension (inner most shifted dimension)
+    int64 isd = 0;
+    for (int i = num_dims - 1; i >= 0; i--) {
+      if (isd == 0 && shift_mod_sum[i] != 0) isd = i;
+      const int ds = std::max<int>(static_cast<int>(input.dim_size(i)), 1);
+      dim_size[i] = ds;
+      threshold[i] = (ds - shift_mod_sum[i]) % ds;
+      dim_size_prod *= static_cast<int64>(input.dim_size(i));
+      dim_range[i] = dim_size_prod;
+    }
+
+    Tensor* output = NULL;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(0, input.shape(), &output));
+    auto input_flat = input.flat<T>().data();
+    auto output_flat = output->flat<T>().data();
+
+    if (std::is_same<Device, CPUDevice>::value) {
+      if (DataTypeCanUseMemcpy(DataTypeToEnum<T>::v())) {
+        // V2 copies memory in groups instead of element by element
+        DoRollWithMemcpy<T>(context, num_elements, num_dims, dim_size,
+                            input_flat, output_flat, threshold, dim_range, isd);
+      } else {
+        // incase memcpy does not work for current data type
+        DoRoll<T>(context, num_elements, num_dims, dim_size, input_flat,
+                  output_flat, threshold, dim_range);
+      }
+    }
+  }
+};
+
+// Register the CPU kernels.
+#define REGISTER_CPU(type)                                       \
+  REGISTER_KERNEL_BUILDER(Name("Roll")                           \
+                              .Device(DEVICE_CPU)                \
+                              .TypeConstraint<type>("T")         \
+                              .TypeConstraint<int32>("Tshift")   \
+                              .TypeConstraint<int32>("Taxis"),   \
+                          RollOp<CPUDevice, type, int32, int32>) \
+  REGISTER_KERNEL_BUILDER(Name("Roll")                           \
+                              .Device(DEVICE_CPU)                \
+                              .TypeConstraint<type>("T")         \
+                              .TypeConstraint<int64>("Tshift")   \
+                              .TypeConstraint<int32>("Taxis"),   \
+                          RollOp<CPUDevice, type, int64, int32>) \
+  REGISTER_KERNEL_BUILDER(Name("Roll")                           \
+                              .Device(DEVICE_CPU)                \
+                              .TypeConstraint<type>("T")         \
+                              .TypeConstraint<int32>("Tshift")   \
+                              .TypeConstraint<int64>("Taxis"),   \
+                          RollOp<CPUDevice, type, int32, int64>) \
+  REGISTER_KERNEL_BUILDER(Name("Roll")                           \
+                              .Device(DEVICE_CPU)                \
+                              .TypeConstraint<type>("T")         \
+                              .TypeConstraint<int64>("Tshift")   \
+                              .TypeConstraint<int64>("Taxis"),   \
+                          RollOp<CPUDevice, type, int64, int64>)
+
+TF_CALL_ALL_TYPES(REGISTER_CPU);
+#undef REGISTER_CPU
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/roll_op_test.cc b/tensorflow/core/kernels/roll_op_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..90b6f8d0f3094224ca694b59c851c14bb424d120
--- /dev/null
+++ b/tensorflow/core/kernels/roll_op_test.cc
@@ -0,0 +1,484 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <functional>
+#include <memory>
+
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+
+namespace tensorflow {
+namespace {
+
+class RollOpTest : public OpsTestBase {
+ protected:
+  void MakeOp(DataType data_type, DataType index_type) {
+    TF_ASSERT_OK(NodeDefBuilder("myop", "Roll")
+                     .Input(FakeInput(data_type))
+                     .Input(FakeInput(index_type))
+                     .Input(FakeInput(index_type))
+                     .Finalize(node_def()));
+    TF_ASSERT_OK(InitOp());
+  }
+};
+
+TEST_F(RollOpTest, ScalarIndices) {
+  MakeOp(DT_FLOAT, DT_INT32);
+
+  // Feed and run
+  AddInputFromArray<float>(TensorShape({5}), {0, 1, 2, 3, 4});
+  AddInputFromArray<int32>(TensorShape({}), {3});
+  AddInputFromArray<int32>(TensorShape({}), {0});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output.
+  Tensor expected(allocator(), DT_FLOAT, TensorShape({5}));
+  test::FillValues<float>(&expected, {2, 3, 4, 0, 1});
+  test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, ScalarIndices_NoMemcpy) {
+  MakeOp(DT_STRING, DT_INT32);
+
+  // Feed and run
+  AddInputFromArray<string>(TensorShape({5}), {"a", "b", "c", "d", "e"});
+  AddInputFromArray<int32>(TensorShape({}), {3});
+  AddInputFromArray<int32>(TensorShape({}), {0});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output.
+  Tensor expected(allocator(), DT_STRING, TensorShape({5}));
+  test::FillValues<string>(&expected, {"c", "d", "e", "a", "b"});
+  test::ExpectTensorEqual<string>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, ScalarIndices_Complex) {
+  MakeOp(DT_COMPLEX64, DT_INT32);
+
+  // Feed and run
+  AddInputFromArray<std::complex<float>>(
+      TensorShape({5}), {std::complex<float>(0, 10), std::complex<float>(1, 11),
+                         std::complex<float>(2, 12), std::complex<float>(3, 13),
+                         std::complex<float>(4, 14)});
+  AddInputFromArray<int32>(TensorShape({}), {3});
+  AddInputFromArray<int32>(TensorShape({}), {0});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output.
+  Tensor expected(allocator(), DT_COMPLEX64, TensorShape({5}));
+  test::FillValues<std::complex<float>>(
+      &expected, {std::complex<float>(2, 12), std::complex<float>(3, 13),
+                  std::complex<float>(4, 14), std::complex<float>(0, 10),
+                  std::complex<float>(1, 11)});
+  test::ExpectTensorEqual<std::complex<float>>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, Simple_TwoD32) {
+  MakeOp(DT_FLOAT, DT_INT32);
+
+  // Feed and run
+  AddInputFromArray<float>(TensorShape({3, 5}),
+                           {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14});
+  AddInputFromArray<int32>(TensorShape({2}), {2, -1});
+  AddInputFromArray<int32>(TensorShape({2}), {0, 1});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output.
+  Tensor expected(allocator(), DT_FLOAT, TensorShape({3, 5}));
+  test::FillValues<float>(&expected,
+                          {6, 7, 8, 9, 5, 11, 12, 13, 14, 10, 1, 2, 3, 4, 0});
+  test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, Simple_TwoD32_NoMemcpy) {
+  MakeOp(DT_STRING, DT_INT32);
+
+  // Feed and run
+  AddInputFromArray<string>(TensorShape({3, 5}),
+                            {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j",
+                             "k", "l", "m", "n", "o"});
+  AddInputFromArray<int32>(TensorShape({2}), {2, -1});
+  AddInputFromArray<int32>(TensorShape({2}), {0, 1});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output.
+  Tensor expected(allocator(), DT_STRING, TensorShape({3, 5}));
+  test::FillValues<string>(&expected, {"g", "h", "i", "j", "f", "l", "m", "n",
+                                       "o", "k", "b", "c", "d", "e", "a"});
+  test::ExpectTensorEqual<string>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, Simple_ThreeD32) {
+  MakeOp(DT_FLOAT, DT_INT32);
+
+  // Feed and run
+  AddInputFromArray<float>(TensorShape({2, 2, 3}),
+                           {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11});
+  AddInputFromArray<int32>(TensorShape({3}), {1, -1, -1});
+  AddInputFromArray<int32>(TensorShape({3}), {0, 1, 2});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output.
+  Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 2, 3}));
+  test::FillValues<float>(&expected, {10, 11, 9, 7, 8, 6, 4, 5, 3, 1, 2, 0});
+  test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, Simple_ThreeD32_NoMemcpy) {
+  MakeOp(DT_STRING, DT_INT32);
+
+  // Feed and run
+  AddInputFromArray<string>(
+      TensorShape({2, 2, 3}),
+      {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l"});
+  AddInputFromArray<int32>(TensorShape({3}), {1, -1, -1});
+  AddInputFromArray<int32>(TensorShape({3}), {0, 1, 2});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output.
+  Tensor expected(allocator(), DT_STRING, TensorShape({2, 2, 3}));
+  test::FillValues<string>(
+      &expected, {"k", "l", "j", "h", "i", "g", "e", "f", "d", "b", "c", "a"});
+  test::ExpectTensorEqual<string>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, Simple_TwoD64) {
+  MakeOp(DT_FLOAT, DT_INT64);
+
+  // Feed and run
+  AddInputFromArray<float>(TensorShape({5, 3}),
+                           {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14});
+  AddInputFromArray<int64>(TensorShape({2}), {-1, 4});
+  AddInputFromArray<int64>(TensorShape({2}), {0, 1});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output.
+  Tensor expected(allocator(), DT_FLOAT, TensorShape({5, 3}));
+  test::FillValues<float>(&expected,
+                          {5, 3, 4, 8, 6, 7, 11, 9, 10, 14, 12, 13, 2, 0, 1});
+  test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, Simple_TwoD64_NoMemcpy) {
+  MakeOp(DT_STRING, DT_INT64);
+
+  // Feed and run
+  AddInputFromArray<string>(TensorShape({5, 3}),
+                            {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j",
+                             "k", "l", "m", "n", "o"});
+  AddInputFromArray<int64>(TensorShape({2}), {-1, 4});
+  AddInputFromArray<int64>(TensorShape({2}), {0, 1});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output.
+  Tensor expected(allocator(), DT_STRING, TensorShape({5, 3}));
+  test::FillValues<string>(&expected, {"f", "d", "e", "i", "g", "h", "l", "j",
+                                       "k", "o", "m", "n", "c", "a", "b"});
+  test::ExpectTensorEqual<string>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, Simple_ThreeD64) {
+  MakeOp(DT_FLOAT, DT_INT64);
+
+  // Feed and run
+  AddInputFromArray<float>(TensorShape({4, 1, 3}),
+                           {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11});
+  AddInputFromArray<int64>(TensorShape({3}), {4, 3, 2});
+  AddInputFromArray<int64>(TensorShape({3}), {0, 1, 2});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output.
+  Tensor expected(allocator(), DT_FLOAT, TensorShape({4, 1, 3}));
+  test::FillValues<float>(&expected, {1, 2, 0, 4, 5, 3, 7, 8, 6, 10, 11, 9});
+  test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, Simple_ThreeD64_NoMemcpy) {
+  MakeOp(DT_STRING, DT_INT64);
+
+  // Feed and run
+  AddInputFromArray<string>(
+      TensorShape({4, 1, 3}),
+      {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l"});
+  AddInputFromArray<int64>(TensorShape({3}), {4, 3, 2});
+  AddInputFromArray<int64>(TensorShape({3}), {0, 1, 2});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output.
+  Tensor expected(allocator(), DT_STRING, TensorShape({4, 1, 3}));
+  test::FillValues<string>(
+      &expected, {"b", "c", "a", "e", "f", "d", "h", "i", "g", "k", "l", "j"});
+  test::ExpectTensorEqual<string>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, ZeroShift_ThreeD32) {
+  MakeOp(DT_FLOAT, DT_INT32);
+
+  // Feed and run
+  AddInputFromArray<float>(TensorShape({2, 2, 3}),
+                           {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11});
+  AddInputFromArray<int32>(TensorShape({3}), {0, 0, 0});
+  AddInputFromArray<int32>(TensorShape({3}), {0, 1, 2});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output.
+  Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 2, 3}));
+  test::FillValues<float>(&expected, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11});
+  test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, ZeroShift_ThreeD32_NoMemcpy) {
+  MakeOp(DT_STRING, DT_INT32);
+
+  // Feed and run
+  AddInputFromArray<string>(
+      TensorShape({2, 2, 3}),
+      {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l"});
+  AddInputFromArray<int32>(TensorShape({3}), {0, 0, 0});
+  AddInputFromArray<int32>(TensorShape({3}), {0, 1, 2});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output.
+  Tensor expected(allocator(), DT_STRING, TensorShape({2, 2, 3}));
+  test::FillValues<string>(
+      &expected, {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l"});
+  test::ExpectTensorEqual<string>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, ZeroSize_ThreeD32) {
+  MakeOp(DT_FLOAT, DT_INT32);
+
+  // Feed and run
+  AddInputFromArray<float>(TensorShape({5, 0, 0}), {});
+  AddInputFromArray<int32>(TensorShape({}), {1});
+  AddInputFromArray<int32>(TensorShape({}), {0});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output.
+  Tensor expected(allocator(), DT_FLOAT, TensorShape({5, 0, 0}));
+  test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, ZeroSize_ThreeD32_NoMemcpy) {
+  MakeOp(DT_STRING, DT_INT32);
+
+  // Feed and run
+  AddInputFromArray<string>(TensorShape({5, 0, 0}), {});
+  AddInputFromArray<int32>(TensorShape({}), {1});
+  AddInputFromArray<int32>(TensorShape({}), {0});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output.
+  Tensor expected(allocator(), DT_STRING, TensorShape({5, 0, 0}));
+  test::ExpectTensorEqual<string>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, OneSize_ThreeD32) {
+  MakeOp(DT_FLOAT, DT_INT32);
+
+  // Feed and run
+  AddInputFromArray<float>(TensorShape({1, 1, 1}), {5});
+  AddInputFromArray<int32>(TensorShape({}), {1});
+  AddInputFromArray<int32>(TensorShape({}), {0});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output.
+  Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 1, 1}));
+  test::FillValues<float>(&expected, {5});
+  test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, OneSize_ThreeD32_NoMemcpy) {
+  MakeOp(DT_STRING, DT_INT32);
+
+  // Feed and run
+  AddInputFromArray<string>(TensorShape({1, 1, 1}), {"a"});
+  AddInputFromArray<int32>(TensorShape({}), {1});
+  AddInputFromArray<int32>(TensorShape({}), {0});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output.
+  Tensor expected(allocator(), DT_STRING, TensorShape({1, 1, 1}));
+  test::FillValues<string>(&expected, {"a"});
+  test::ExpectTensorEqual<string>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, MultiShifts_TwoD32) {
+  MakeOp(DT_FLOAT, DT_INT32);
+
+  // Feed and run
+  AddInputFromArray<float>(TensorShape({3, 5}),
+                           {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14});
+  AddInputFromArray<int32>(TensorShape({4}), {-2, 2, -1, 1});
+  AddInputFromArray<int32>(TensorShape({4}), {1, 0, 0, 1});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output.
+  Tensor expected(allocator(), DT_FLOAT, TensorShape({3, 5}));
+  test::FillValues<float>(&expected,
+                          {11, 12, 13, 14, 10, 1, 2, 3, 4, 0, 6, 7, 8, 9, 5});
+  test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, MultiShifts_TwoD32_NoMemcpy) {
+  MakeOp(DT_STRING, DT_INT32);
+
+  // Feed and run
+  AddInputFromArray<string>(TensorShape({3, 5}),
+                            {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j",
+                             "k", "l", "m", "n", "o"});
+  AddInputFromArray<int32>(TensorShape({4}), {-2, 2, -1, 1});
+  AddInputFromArray<int32>(TensorShape({4}), {1, 0, 0, 1});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output.
+  Tensor expected(allocator(), DT_STRING, TensorShape({3, 5}));
+  test::FillValues<string>(&expected, {"l", "m", "n", "o", "k", "b", "c", "d",
+                                       "e", "a", "g", "h", "i", "j", "f"});
+  test::ExpectTensorEqual<string>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, Error_InputMustBeVectorOrHigher) {
+  MakeOp(DT_FLOAT, DT_INT32);
+
+  // Feed and run
+  AddInputFromArray<float>(TensorShape({}), {7});
+  AddInputFromArray<int32>(TensorShape({}), {1});
+  AddInputFromArray<int32>(TensorShape({}), {0});
+  Status s = RunOpKernel();
+  EXPECT_TRUE(StringPiece(s.ToString()).contains("input must be 1-D or higher"))
+      << s;
+}
+
+TEST_F(RollOpTest, Error_AxisMustBeScalarOrVector) {
+  MakeOp(DT_FLOAT, DT_INT32);
+
+  // Feed and run
+  AddInputFromArray<float>(TensorShape({2, 2}), {1, 2, 3, 4});
+  AddInputFromArray<int32>(TensorShape({}), {1});
+  AddInputFromArray<int32>(TensorShape({1, 2}), {0, 1});
+  Status s = RunOpKernel();
+  EXPECT_TRUE(StringPiece(s.ToString())
+                  .contains("axis must be a scalar or a 1-D vector"))
+      << s;
+}
+
+TEST_F(RollOpTest, Error_ShiftMustBeScalarOrVector) {
+  MakeOp(DT_FLOAT, DT_INT32);
+
+  // Feed and run
+  AddInputFromArray<float>(TensorShape({2, 2}), {1, 2, 3, 4});
+  AddInputFromArray<int32>(TensorShape({1, 2}), {0, 1});
+  AddInputFromArray<int32>(TensorShape({}), {1});
+  Status s = RunOpKernel();
+  EXPECT_TRUE(StringPiece(s.ToString())
+                  .contains("shift must be a scalar or a 1-D vector"))
+      << s;
+}
+
+TEST_F(RollOpTest, Error_ShiftAndAxisMustBeSameSize) {
+  MakeOp(DT_FLOAT, DT_INT32);
+
+  // Feed and run
+  AddInputFromArray<float>(TensorShape({2, 2}), {1, 2, 3, 4});
+  AddInputFromArray<int32>(TensorShape({1}), {1});
+  AddInputFromArray<int32>(TensorShape({2}), {0, 1});
+  Status s = RunOpKernel();
+  EXPECT_TRUE(StringPiece(s.ToString())
+                  .contains("shift and axis must have the same size"))
+      << s;
+}
+
+TEST_F(RollOpTest, Error_AxisOutOfRange) {
+  MakeOp(DT_FLOAT, DT_INT32);
+
+  // Feed and run
+  AddInputFromArray<float>(TensorShape({4}), {1, 2, 3, 4});
+  AddInputFromArray<int32>(TensorShape({}), {1});
+  AddInputFromArray<int32>(TensorShape({}), {1});
+  Status s = RunOpKernel();
+  EXPECT_TRUE(StringPiece(s.ToString()).contains("is out of range")) << s;
+}
+
+// isd - (inner shift dimension) The inner most dimension to be shifted.
+//    All outer dimensions will also be shifted for testing.
+static Graph* RollGraph(const TensorShape& shape, int isd) {
+  Graph* g = new Graph(OpRegistry::Global());
+  Tensor input(DT_FLOAT, shape);
+  input.flat<float>().setRandom();
+  const int dims = static_cast<int>(input.dims());
+  Tensor shift(DT_INT32, TensorShape({dims}));
+  for (int i = 0; i < dims; i++) {
+    // shift the inner shift dimension and all outer dimensions
+    shift.flat<int32>()(i) = (i <= isd) ? 2 : 0;
+  }
+  Tensor axis(DT_INT32, TensorShape({dims}));
+  for (int i = 0; i < dims; i++) {
+    axis.flat<int32>()(i) = i;
+  }
+  test::graph::Roll(g, test::graph::Constant(g, input),
+                    test::graph::Constant(g, shift),
+                    test::graph::Constant(g, axis));
+  return g;
+}
+
+#define BM_ROLL_OUTER(DEVICE)                                                 \
+  static void BM_##DEVICE##_roll_outer(int iters, int rows, int columns) {    \
+    TensorShape shape{rows, columns};                                         \
+    const int64 num_items = static_cast<int64>(iters) * shape.num_elements(); \
+    testing::ItemsProcessed(num_items);                                       \
+    testing::BytesProcessed(num_items * sizeof(float));                       \
+    testing::UseRealTime();                                                   \
+    test::Benchmark(#DEVICE, RollGraph(shape, 0)).Run(iters);                 \
+  }                                                                           \
+  BENCHMARK(BM_##DEVICE##_roll_outer)                                         \
+      ->ArgPair(256, 256)                                                     \
+      ->ArgPair(512, 512)                                                     \
+      ->ArgPair(1024, 1024)                                                   \
+      ->ArgPair(2048, 2048)
+
+#define BM_ROLL_ALL(DEVICE)                                                   \
+  static void BM_##DEVICE##_roll_all(int iters, int rows, int columns) {      \
+    TensorShape shape{rows, columns};                                         \
+    const int64 num_items = static_cast<int64>(iters) * shape.num_elements(); \
+    testing::ItemsProcessed(num_items);                                       \
+    testing::BytesProcessed(num_items * sizeof(float));                       \
+    testing::UseRealTime();                                                   \
+    test::Benchmark(#DEVICE, RollGraph(shape, 1)).Run(iters);                 \
+  }                                                                           \
+  BENCHMARK(BM_##DEVICE##_roll_all)                                           \
+      ->ArgPair(256, 256)                                                     \
+      ->ArgPair(512, 512)                                                     \
+      ->ArgPair(1024, 1024)                                                   \
+      ->ArgPair(2048, 2048)
+
+BM_ROLL_OUTER(cpu);
+BM_ROLL_ALL(cpu);
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/save_restore_tensor.cc b/tensorflow/core/kernels/save_restore_tensor.cc
index df60eda75978ff9f6a9d7059b9594f86831aa6f5..990bd2bff94ac9cf18dd6f6316503890bb31884d 100644
--- a/tensorflow/core/kernels/save_restore_tensor.cc
+++ b/tensorflow/core/kernels/save_restore_tensor.cc
@@ -106,11 +106,11 @@ void SaveTensors(
       OP_REQUIRES_OK(context, checkpoint::ParseShapeAndSlice(
                                   shape_spec, &shape, &slice, &slice_shape));
       OP_REQUIRES(context, slice_shape.IsSameSize(input.shape()),
-                  errors::InvalidArgument("Slice in shape_and_slice "
-                                          "specification does not match the "
-                                          "shape of the tensor to  save: ",
-                                          shape_spec, ", tensor: ",
-                                          input.shape().DebugString()));
+                  errors::InvalidArgument(
+                      "Slice in shape_and_slice "
+                      "specification does not match the "
+                      "shape of the tensor to  save: ",
+                      shape_spec, ", tensor: ", input.shape().DebugString()));
     }
 
 #define WRITER_ADD(T)                                           \
diff --git a/tensorflow/core/kernels/scatter_functor.h b/tensorflow/core/kernels/scatter_functor.h
index c6e35fe329e1c1b7acb62daedeeb2f1a92444b78..079f15e101308867389745ee42146086af91c47c 100644
--- a/tensorflow/core/kernels/scatter_functor.h
+++ b/tensorflow/core/kernels/scatter_functor.h
@@ -29,7 +29,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 namespace scatter_op {
 
@@ -117,7 +117,7 @@ struct AssignSYCL<scatter_op::UpdateOp::DIV> {
     p.device(d) = p / u;
   }
 };
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace internal
 }  // namespace scatter_op
@@ -156,7 +156,7 @@ struct ScatterFunctorBase {
 
 #ifdef TENSORFLOW_USE_SYCL
 template <typename T, typename Index, scatter_op::UpdateOp op>
-struct ScatterFunctorBase <SYCLDevice, T, Index, op> {
+struct ScatterFunctorBase<SYCLDevice, T, Index, op> {
   Index operator()(OpKernelContext* c, const SYCLDevice& d,
                    typename TTypes<T>::Matrix params,
                    typename TTypes<T>::ConstMatrix updates,
@@ -171,13 +171,13 @@ struct ScatterFunctorBase <SYCLDevice, T, Index, op> {
       const Index index = ::tensorflow::internal::SubtleMustCopy(indices(i));
       if (!FastBoundsCheck(index, limit)) return i;
       // Copy last Ndim-1 dimensions of updates[i] to params[index]
-      scatter_op::internal::AssignSYCL<op>::Run(d, params.template chip<0>(index),
-                                            updates.template chip<0>(i));
+      scatter_op::internal::AssignSYCL<op>::Run(
+          d, params.template chip<0>(index), updates.template chip<0>(i));
     }
     return -1;
   }
 };
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 template <typename T, typename Index>
 struct ScatterFunctorBase<CPUDevice, T, Index, scatter_op::UpdateOp::ASSIGN> {
@@ -217,7 +217,7 @@ struct ScatterFunctorBase<CPUDevice, T, Index, scatter_op::UpdateOp::ASSIGN> {
 
 template <typename T, typename Index, scatter_op::UpdateOp op>
 struct ScatterFunctor<CPUDevice, T, Index, op>
-        : ScatterFunctorBase<CPUDevice, T, Index, op>{};
+    : ScatterFunctorBase<CPUDevice, T, Index, op> {};
 
 #ifdef TENSORFLOW_USE_SYCL
 template <typename T, typename Index, scatter_op::UpdateOp op>
@@ -239,7 +239,7 @@ struct ScatterFunctorSYCL {
     return -1;
   }
 };
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace functor
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/scatter_functor_gpu.cu.h b/tensorflow/core/kernels/scatter_functor_gpu.cu.h
index e116077d3cfc37871009ee3fede633590d269681..be18658543ea330e3196d0f372154df32e4e1dfc 100644
--- a/tensorflow/core/kernels/scatter_functor_gpu.cu.h
+++ b/tensorflow/core/kernels/scatter_functor_gpu.cu.h
@@ -30,9 +30,10 @@ namespace tensorflow {
 typedef Eigen::GpuDevice GPUDevice;
 
 template <typename T, typename Index, scatter_op::UpdateOp op>
-__global__ void ScatterOpCustomKernel(
-    T* params, const T* updates, const Index* indices,
-    Index first_dim_size, Index updates_size, Index indices_size) {
+__global__ void ScatterOpCustomKernel(T* params, const T* updates,
+                                      const Index* indices,
+                                      Index first_dim_size, Index updates_size,
+                                      Index indices_size) {
   Index update_block = updates_size / indices_size;
   CUDA_1D_KERNEL_LOOP(i, updates_size) {
     int indices_i = i / update_block;
@@ -85,8 +86,8 @@ struct ScatterFunctor<GPUDevice, T, Index, op> {
     CudaLaunchConfig config = GetCudaLaunchConfig(updates_size, d);
     ScatterOpCustomKernel<T, Index, op>
         <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-            params.data(), updates.data(), indices.data(),
-            first_dim_size, updates_size, indices_size);
+            params.data(), updates.data(), indices.data(), first_dim_size,
+            updates_size, indices_size);
     return -1;
   }
 };
diff --git a/tensorflow/core/kernels/scatter_nd_op_cpu_impl.h b/tensorflow/core/kernels/scatter_nd_op_cpu_impl.h
index c6c9d4e6588f1f4d847810de1e736220d5572f25..e82660dcc1dcf9dbb7d531c0223e211ce46a8635 100644
--- a/tensorflow/core/kernels/scatter_nd_op_cpu_impl.h
+++ b/tensorflow/core/kernels/scatter_nd_op_cpu_impl.h
@@ -40,7 +40,7 @@ namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 class OpKernelContext;
 
@@ -251,7 +251,7 @@ REGISTER_SCATTER_ND_MATH_SYCL(int32);
 #undef REGISTER_SCATTER_ND_INDEX_SYCL
 #undef REGISTER_SCATTER_ND_FULL_SYCL
 
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace functor
 
diff --git a/tensorflow/core/kernels/scatter_nd_op_gpu.cu.cc b/tensorflow/core/kernels/scatter_nd_op_gpu.cu.cc
index 31f74671cabdabce2884fcae61a6e56dbfdefe8b..a3c21edc15f684e51c7f1806aeeeeead679ea22e 100644
--- a/tensorflow/core/kernels/scatter_nd_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/scatter_nd_op_gpu.cu.cc
@@ -55,6 +55,27 @@ struct LeftUpdate<T, scatter_nd_op::UpdateOp::SUB> {
   }
 };
 
+// Specializations for std::complex, updating real and imaginary part
+// individually. Even though this is not an atomic op anymore, it is safe
+// because there is only one type of op per kernel.
+template <typename T>
+struct LeftUpdate<std::complex<T>, scatter_nd_op::UpdateOp::ADD> {
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void operator()(
+      std::complex<T>* out, const std::complex<T>& val) {
+    T* ptr = reinterpret_cast<T*>(out);
+    CudaAtomicAdd(ptr, val.real());
+    CudaAtomicAdd(ptr, val.imag());
+  }
+};
+
+template <typename T>
+struct LeftUpdate<std::complex<T>, scatter_nd_op::UpdateOp::SUB> {
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void operator()(
+      std::complex<T>* out, const std::complex<T>& val) {
+    LeftUpdate<std::complex<T>, scatter_nd_op::UpdateOp::ADD>()(out, -val);
+  }
+};
+
 }  // namespace
 
 template <typename T, typename Index, scatter_nd_op::UpdateOp op, int IXDIM>
diff --git a/tensorflow/core/kernels/scatter_op.cc b/tensorflow/core/kernels/scatter_op.cc
index 8607c7f95af79c8f581768cfc698bad9fe085188..282165349f316144d261859d5a3a992f047e0df3 100644
--- a/tensorflow/core/kernels/scatter_op.cc
+++ b/tensorflow/core/kernels/scatter_op.cc
@@ -25,7 +25,7 @@ limitations under the License.
 
 #ifdef TENSORFLOW_USE_SYCL
 #include "tensorflow/core/common_runtime/sycl/sycl_util.h"
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 namespace tensorflow {
 
@@ -33,7 +33,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 // Check whether updates.shape = indices.shape + params.shape[1:]
 static bool ValidShapes(const Tensor& params, const Tensor& updates,
@@ -102,11 +102,12 @@ class ScatterUpdateOp : public OpKernel {
 
     // Check that we have enough index space
     const int64 N_big = indices.NumElements();
-    OP_REQUIRES(c, N_big <= std::numeric_limits<Index>::max(),
-                errors::InvalidArgument(
-                    "indices has too many elements for ",
-                    DataTypeString(DataTypeToEnum<Index>::v()), " indexing: ",
-                    N_big, " > ", std::numeric_limits<Index>::max()));
+    OP_REQUIRES(
+        c, N_big <= std::numeric_limits<Index>::max(),
+        errors::InvalidArgument("indices has too many elements for ",
+                                DataTypeString(DataTypeToEnum<Index>::v()),
+                                " indexing: ", N_big, " > ",
+                                std::numeric_limits<Index>::max()));
     const Index N = static_cast<Index>(indices.NumElements());
     OP_REQUIRES(
         c, params.dim_size(0) <= std::numeric_limits<Index>::max(),
@@ -137,7 +138,7 @@ class ScatterUpdateOp : public OpKernel {
 
 #ifdef TENSORFLOW_USE_SYCL
 template <typename T, typename Index, scatter_op::UpdateOp op>
-class ScatterUpdateOp <SYCLDevice, T, Index, op> : public OpKernel {
+class ScatterUpdateOp<SYCLDevice, T, Index, op> : public OpKernel {
  public:
   explicit ScatterUpdateOp(OpKernelConstruction* c) : OpKernel(c) {
     OP_REQUIRES_OK(c, c->GetAttr("use_locking", &use_exclusive_lock_));
@@ -165,11 +166,12 @@ class ScatterUpdateOp <SYCLDevice, T, Index, op> : public OpKernel {
 
     // Check that we have enough index space
     const int64 N_big = indices.NumElements();
-    OP_REQUIRES(c, N_big <= std::numeric_limits<Index>::max(),
-                errors::InvalidArgument(
-                    "indices has too many elements for ",
-                    DataTypeString(DataTypeToEnum<Index>::v()), " indexing: ",
-                    N_big, " > ", std::numeric_limits<Index>::max()));
+    OP_REQUIRES(
+        c, N_big <= std::numeric_limits<Index>::max(),
+        errors::InvalidArgument("indices has too many elements for ",
+                                DataTypeString(DataTypeToEnum<Index>::v()),
+                                " indexing: ", N_big, " > ",
+                                std::numeric_limits<Index>::max()));
     const Index N = static_cast<Index>(indices.NumElements());
     OP_REQUIRES(
         c, params.dim_size(0) <= std::numeric_limits<Index>::max(),
@@ -206,7 +208,7 @@ class ScatterUpdateOp <SYCLDevice, T, Index, op> : public OpKernel {
     }
   }
 };
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 #define REGISTER_SCATTER_KERNEL_INDEX(type, index_type, dev, name, op) \
   REGISTER_KERNEL_BUILDER(Name(name)                                   \
diff --git a/tensorflow/core/kernels/sdca_internal.cc b/tensorflow/core/kernels/sdca_internal.cc
index 863c123b43f781239dab62e6b57719376fc49dad..066a4b80a2bc6976a6c95ced2c5efecbef13eeba 100644
--- a/tensorflow/core/kernels/sdca_internal.cc
+++ b/tensorflow/core/kernels/sdca_internal.cc
@@ -37,9 +37,8 @@ void FeatureWeightsDenseStorage::UpdateDenseDeltaWeights(
   const size_t num_weight_vectors = normalized_bounded_dual_delta.size();
   if (num_weight_vectors == 1) {
     deltas_.device(device) =
-        deltas_ +
-        dense_vector.RowAsMatrix() *
-            deltas_.constant(normalized_bounded_dual_delta[0]);
+        deltas_ + dense_vector.RowAsMatrix() *
+                      deltas_.constant(normalized_bounded_dual_delta[0]);
   } else {
     // Transform the dual vector into a column matrix.
     const Eigen::TensorMap<Eigen::Tensor<const double, 2, Eigen::RowMajor>>
@@ -61,9 +60,8 @@ void FeatureWeightsSparseStorage::UpdateSparseDeltaWeights(
     const Example::SparseFeatures& sparse_features,
     const std::vector<double>& normalized_bounded_dual_delta) {
   for (int64 k = 0; k < sparse_features.indices->size(); ++k) {
-    const double feature_value = sparse_features.values == nullptr
-                                     ? 1.0
-                                     : (*sparse_features.values)(k);
+    const double feature_value =
+        sparse_features.values == nullptr ? 1.0 : (*sparse_features.values)(k);
     auto it = indices_to_id_.find((*sparse_features.indices)(k));
     for (size_t l = 0; l < normalized_bounded_dual_delta.size(); ++l) {
       deltas_(l, it->second) +=
@@ -122,23 +120,24 @@ Status ModelWeights::Initialize(OpKernelContext* const context) {
   }
 
   // Reads in the weights, and allocates and initializes the delta weights.
-  const auto initialize_weights = [&](
-      const OpInputList& weight_inputs, OpOutputList* const weight_outputs,
-      std::vector<FeatureWeightsDenseStorage>* const feature_weights) {
-    for (int i = 0; i < weight_inputs.size(); ++i) {
-      Tensor* delta_t;
-      TF_RETURN_IF_ERROR(
-          weight_outputs->allocate(i, weight_inputs[i].shape(), &delta_t));
-      // Convert the input vector to a row matrix in internal representation.
-      auto deltas = delta_t->shaped<float, 2>({1, delta_t->NumElements()});
-      deltas.setZero();
-      feature_weights->emplace_back(
-          FeatureWeightsDenseStorage{weight_inputs[i].shaped<float, 2>(
-                                         {1, weight_inputs[i].NumElements()}),
-                                     deltas});
-    }
-    return Status::OK();
-  };
+  const auto initialize_weights =
+      [&](const OpInputList& weight_inputs, OpOutputList* const weight_outputs,
+          std::vector<FeatureWeightsDenseStorage>* const feature_weights) {
+        for (int i = 0; i < weight_inputs.size(); ++i) {
+          Tensor* delta_t;
+          TF_RETURN_IF_ERROR(
+              weight_outputs->allocate(i, weight_inputs[i].shape(), &delta_t));
+          // Convert the input vector to a row matrix in internal
+          // representation.
+          auto deltas = delta_t->shaped<float, 2>({1, delta_t->NumElements()});
+          deltas.setZero();
+          feature_weights->emplace_back(FeatureWeightsDenseStorage{
+              weight_inputs[i].shaped<float, 2>(
+                  {1, weight_inputs[i].NumElements()}),
+              deltas});
+        }
+        return Status::OK();
+      };
 
   return initialize_weights(dense_weights_inputs, &dense_weights_outputs,
                             &dense_weights_);
diff --git a/tensorflow/core/kernels/sdca_internal.h b/tensorflow/core/kernels/sdca_internal.h
index 9f072700754320700024be57ebe3c4ca780a1ae9..45915693ac6f0b4ad2d5f2aacebcd4aa34c03439 100644
--- a/tensorflow/core/kernels/sdca_internal.h
+++ b/tensorflow/core/kernels/sdca_internal.h
@@ -149,7 +149,8 @@ class Example {
   // 1.0f.
   struct SparseFeatures {
     std::unique_ptr<TTypes<const int64>::UnalignedConstVec> indices;
-    std::unique_ptr<TTypes<const float>::UnalignedConstVec> values;  // nullptr encodes optional.
+    std::unique_ptr<TTypes<const float>::UnalignedConstVec>
+        values;  // nullptr encodes optional.
   };
 
   // A dense vector which is a row-slice of the underlying matrix.
diff --git a/tensorflow/core/kernels/sdca_ops.cc b/tensorflow/core/kernels/sdca_ops.cc
index 0f5c2424b38aeed5912287bba7a218575a107073..dbe0177dda337a271433cd3bb4257026dc702364 100644
--- a/tensorflow/core/kernels/sdca_ops.cc
+++ b/tensorflow/core/kernels/sdca_ops.cc
@@ -57,11 +57,11 @@ namespace tensorflow {
 
 namespace {
 
-using sdca::Regularizations;
 using sdca::Example;
 using sdca::Examples;
 using sdca::ExampleStatistics;
 using sdca::ModelWeights;
+using sdca::Regularizations;
 
 struct ComputeOptions {
   explicit ComputeOptions(OpKernelConstruction* const context) {
@@ -76,8 +76,9 @@ struct ComputeOptions {
     } else if (loss_type == "smooth_hinge_loss") {
       loss_updater.reset(new SmoothHingeLossUpdater);
     } else {
-      OP_REQUIRES(context, false, errors::InvalidArgument(
-                                      "Unsupported loss type: ", loss_type));
+      OP_REQUIRES(
+          context, false,
+          errors::InvalidArgument("Unsupported loss type: ", loss_type));
     }
     OP_REQUIRES_OK(context, context->GetAttr("adaptative", &adaptative));
     OP_REQUIRES_OK(
@@ -90,9 +91,10 @@ struct ComputeOptions {
         context, num_sparse_features + num_dense_features > 0,
         errors::InvalidArgument("Requires at least one feature to train."));
 
-    OP_REQUIRES(context, static_cast<int64>(num_sparse_features) +
-                                 static_cast<int64>(num_dense_features) <=
-                             std::numeric_limits<int>::max(),
+    OP_REQUIRES(context,
+                static_cast<int64>(num_sparse_features) +
+                        static_cast<int64>(num_dense_features) <=
+                    std::numeric_limits<int>::max(),
                 errors::InvalidArgument(
                     strings::Printf("Too many feature groups: %lld > %d",
                                     static_cast<int64>(num_sparse_features) +
diff --git a/tensorflow/core/kernels/segment_reduction_ops.cc b/tensorflow/core/kernels/segment_reduction_ops.cc
index 3ef1cd1e062b5f5abecca2f4f788e3fed20e33e9..27b8081eb88a13c68d434e82c2e59d1aea068b78 100644
--- a/tensorflow/core/kernels/segment_reduction_ops.cc
+++ b/tensorflow/core/kernels/segment_reduction_ops.cc
@@ -115,7 +115,7 @@ class SegmentReductionOp : public OpKernel {
     Eigen::DSizes<Eigen::DenseIndex, 1> dims_to_reduce;
     dims_to_reduce[0] = 0;
 #else
-    Eigen::IndexList<Eigen::type2index<0>> dims_to_reduce;
+    Eigen::IndexList<Eigen::type2index<0> > dims_to_reduce;
 #endif
     Index start = 0, end = 1;
 
@@ -359,7 +359,8 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_SORTED_KERNELS_ALL);
 namespace functor {
 
 // UnsortedSegmentSumFunctor implementation for CPUDevice.
-// todo: Remove duplicate code in UnsortedSegmentSumFunctor and UnsortedSegmentMaxFunctor.
+// todo: Remove duplicate code in UnsortedSegmentSumFunctor and
+// UnsortedSegmentMaxFunctor.
 template <typename T, typename Index>
 struct UnsortedSegmentSumFunctor<CPUDevice, T, Index>
     : UnsortedSegmentBaseFunctor<CPUDevice, T, Index> {
@@ -461,9 +462,10 @@ class UnsortedSegmentBaseOp : public OpKernel {
 
     auto data_ptr = data.template flat<T>().data();
     reduction_functor_(context, context->template eigen_device<Device>(),
-                     output_rows, segment_ids.shape(), segment_flat,
-                     data.NumElements(), data_ptr, output_flat);
+                       output_rows, segment_ids.shape(), segment_flat,
+                       data.NumElements(), data_ptr, output_flat);
   }
+
  private:
   functor::UnsortedSegmentBaseFunctor<Device, T, Index>& reduction_functor_;
 };
@@ -472,22 +474,20 @@ template <typename Device, class T, class Index>
 class UnsortedSegmentSumOp : public UnsortedSegmentBaseOp<Device, T, Index> {
  public:
   explicit UnsortedSegmentSumOp(OpKernelConstruction* context)
-      : UnsortedSegmentBaseOp<Device, T, Index>(
-            context,
-            sum_functor_) {}
+      : UnsortedSegmentBaseOp<Device, T, Index>(context, sum_functor_) {}
+
  private:
-    functor::UnsortedSegmentSumFunctor<Device, T, Index> sum_functor_;
+  functor::UnsortedSegmentSumFunctor<Device, T, Index> sum_functor_;
 };
 
 template <typename Device, class T, class Index>
 class UnsortedSegmentMaxOp : public UnsortedSegmentBaseOp<Device, T, Index> {
  public:
   explicit UnsortedSegmentMaxOp(OpKernelConstruction* context)
-      : UnsortedSegmentBaseOp<Device, T, Index>(
-            context,
-            max_functor_) {}
+      : UnsortedSegmentBaseOp<Device, T, Index>(context, max_functor_) {}
+
  private:
-    functor::UnsortedSegmentMaxFunctor<Device, T, Index> max_functor_;
+  functor::UnsortedSegmentMaxFunctor<Device, T, Index> max_functor_;
 };
 
 #define REGISTER_REAL_CPU_UNSORTED_KERNELS(type, index_type)                  \
@@ -663,9 +663,9 @@ class SparseSegmentReductionOpBase : public OpKernel {
           Reduce(input_flat, indices_vec, start, end - start, out);
       OP_REQUIRES(context, bad_offset < 0,
                   errors::InvalidArgument(
-                      "Bad: indices[", start + bad_offset, "] == ",
-                      indices_vec(start + bad_offset), " out of range [0, ",
-                      input_flat.dimension(0), ")"));
+                      "Bad: indices[", start + bad_offset,
+                      "] == ", indices_vec(start + bad_offset),
+                      " out of range [0, ", input_flat.dimension(0), ")"));
 
       start = end;
       ++end;
diff --git a/tensorflow/core/kernels/segment_reduction_ops.h b/tensorflow/core/kernels/segment_reduction_ops.h
index bcdd42c80c18af381988808db74319e5072f38a7..5c9cfe090656ff043b952192b7a4d6e8a80b692f 100644
--- a/tensorflow/core/kernels/segment_reduction_ops.h
+++ b/tensorflow/core/kernels/segment_reduction_ops.h
@@ -51,13 +51,14 @@ struct SegmentSumFunctor {
 // BaseFunctor for definition of UnsorteSegmentReductionOp
 // for usage without templates.
 template <typename Device, typename T, typename Index>
-struct UnsortedSegmentBaseFunctor{
-  virtual ~UnsortedSegmentBaseFunctor(){}
+struct UnsortedSegmentBaseFunctor {
+  virtual ~UnsortedSegmentBaseFunctor() {}
   virtual void operator()(OpKernelContext* ctx, const Device& d,
-                  const Index output_rows, const TensorShape& segment_ids_shape,
-                  typename TTypes<Index>::ConstFlat segment_ids,
-                  const Index data_size, const T* data,
-                  typename TTypes<T, 2>::Tensor output){};
+                          const Index output_rows,
+                          const TensorShape& segment_ids_shape,
+                          typename TTypes<Index>::ConstFlat segment_ids,
+                          const Index data_size, const T* data,
+                          typename TTypes<T, 2>::Tensor output){};
 };
 
 // Functor for UnsortedSegmentSumOp.
@@ -70,7 +71,8 @@ struct UnsortedSegmentBaseFunctor{
 // data: input data tensor.
 // output: output reshaped to {output_rows, output.size/output_rows}
 template <typename Device, typename T, typename Index>
-struct UnsortedSegmentSumFunctor: public UnsortedSegmentBaseFunctor<Device, T, Index> {
+struct UnsortedSegmentSumFunctor
+    : public UnsortedSegmentBaseFunctor<Device, T, Index> {
   void operator()(OpKernelContext* ctx, const Device& d,
                   const Index output_rows, const TensorShape& segment_ids_shape,
                   typename TTypes<Index>::ConstFlat segment_ids,
@@ -88,7 +90,8 @@ struct UnsortedSegmentSumFunctor: public UnsortedSegmentBaseFunctor<Device, T, I
 // data: input data tensor.
 // output: output reshaped to {output_rows, output.size/output_rows}
 template <typename Device, typename T, typename Index>
-struct UnsortedSegmentMaxFunctor: public UnsortedSegmentBaseFunctor<Device, T, Index> {
+struct UnsortedSegmentMaxFunctor
+    : public UnsortedSegmentBaseFunctor<Device, T, Index> {
   void operator()(OpKernelContext* ctx, const Device& d,
                   const Index output_rows, const TensorShape& segment_ids_shape,
                   typename TTypes<Index>::ConstFlat segment_ids,
diff --git a/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.cc b/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.cc
index 159fada621bd88de259e9b044491f3ecebf10b19..39d520698e1910a432de29b747a223f9e8033d24 100644
--- a/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.cc
+++ b/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.cc
@@ -194,7 +194,8 @@ void SegmentSumFunctor<T, Index>::operator()(
 
 // UnsortedSegmentSumFunctor implementation for GPUDevice.
 template <typename T, typename Index>
-struct UnsortedSegmentSumFunctor<GPUDevice, T, Index>: UnsortedSegmentBaseFunctor<GPUDevice, T, Index> {
+struct UnsortedSegmentSumFunctor<GPUDevice, T, Index>
+    : UnsortedSegmentBaseFunctor<GPUDevice, T, Index> {
   void operator()(OpKernelContext* ctx, const GPUDevice& d,
                   const Index output_rows, const TensorShape& segment_ids_shape,
                   typename TTypes<Index>::ConstFlat segment_ids,
@@ -221,11 +222,10 @@ struct UnsortedSegmentSumFunctor<GPUDevice, T, Index>: UnsortedSegmentBaseFuncto
     const Index input_inner_dim_size = input_total_size / input_outer_dim_size;
 
     config = GetCudaLaunchConfig(input_total_size, d);
-    UnsortedSegmentSumCustomKernel<
-        T,
-        Index><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-        input_outer_dim_size, input_inner_dim_size, output_rows,
-        segment_ids.data(), data, output.data());
+    UnsortedSegmentSumCustomKernel<T, Index>
+        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+            input_outer_dim_size, input_inner_dim_size, output_rows,
+            segment_ids.data(), data, output.data());
   }
 };
 
diff --git a/tensorflow/core/kernels/self_adjoint_eig_op.cc b/tensorflow/core/kernels/self_adjoint_eig_op.cc
index 97657807268d30d66a01573bc3df09e318ce1d51..bcd88773902824c6e88db4226af43993d5649007 100644
--- a/tensorflow/core/kernels/self_adjoint_eig_op.cc
+++ b/tensorflow/core/kernels/self_adjoint_eig_op.cc
@@ -25,7 +25,6 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
 
-
 namespace tensorflow {
 
 template <class Scalar>
diff --git a/tensorflow/core/kernels/sendrecv_ops.cc b/tensorflow/core/kernels/sendrecv_ops.cc
index 206fd40fa68c3158fa60b7651d40121ab1344bbd..688e61fcadc3ad01b579f8dfc712af2d8032ee35 100644
--- a/tensorflow/core/kernels/sendrecv_ops.cc
+++ b/tensorflow/core/kernels/sendrecv_ops.cc
@@ -114,7 +114,7 @@ REGISTER_KERNEL_BUILDER(Name("_Send").Device(DEVICE_GPU), SendOp);
 REGISTER_KERNEL_BUILDER(Name("_Send").Device(DEVICE_SYCL), SendOp);
 REGISTER_KERNEL_BUILDER(
     Name("_HostSend").Device(DEVICE_SYCL).HostMemory("tensor"), SendOp);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 REGISTER_KERNEL_BUILDER(Name("_HostSend").Device(DEVICE_CPU), SendOp);
 REGISTER_KERNEL_BUILDER(
@@ -198,7 +198,7 @@ REGISTER_KERNEL_BUILDER(Name("_Recv").Device(DEVICE_GPU), RecvOp);
 
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER_KERNEL_BUILDER(Name("_Recv").Device(DEVICE_SYCL), RecvOp);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 REGISTER_KERNEL_BUILDER(Name("_HostRecv").Device(DEVICE_CPU), RecvOp);
 REGISTER_KERNEL_BUILDER(
@@ -207,6 +207,6 @@ REGISTER_KERNEL_BUILDER(
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER_KERNEL_BUILDER(
     Name("_HostRecv").Device(DEVICE_SYCL).HostMemory("tensor"), RecvOp);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/sequence_ops.cc b/tensorflow/core/kernels/sequence_ops.cc
index e2e3758d87e49702ebc48f78c022affe49a3b7e4..9db0bd4d98bdb9964cb561d96d91782ba3615a7f 100644
--- a/tensorflow/core/kernels/sequence_ops.cc
+++ b/tensorflow/core/kernels/sequence_ops.cc
@@ -53,13 +53,13 @@ class RangeOp : public OpKernel {
     if (delta > 0) {
       OP_REQUIRES(
           context, start <= limit,
-          errors::InvalidArgument("Requires start <= limit when delta > 0: ",
-                                  start, "/", limit));
+          errors::InvalidArgument(
+              "Requires start <= limit when delta > 0: ", start, "/", limit));
     } else {
       OP_REQUIRES(
           context, start >= limit,
-          errors::InvalidArgument("Requires start >= limit when delta < 0: ",
-                                  start, "/", limit));
+          errors::InvalidArgument(
+              "Requires start >= limit when delta < 0: ", start, "/", limit));
     }
     int64 size = (std::is_integral<T>::value
                       ? ((std::abs(limit - start) + std::abs(delta) - 1) /
diff --git a/tensorflow/core/kernels/session_ops.cc b/tensorflow/core/kernels/session_ops.cc
index 185c5b248fca8f5a4e8edf6d46e9447f8a0b4750..f2dd2812b53e2c974efac3d3e1aef1052d907da6 100644
--- a/tensorflow/core/kernels/session_ops.cc
+++ b/tensorflow/core/kernels/session_ops.cc
@@ -144,7 +144,7 @@ REGISTER_GPU_KERNEL(bool);
 TF_CALL_NUMBER_TYPES(REGISTER_SYCL_KERNEL);
 REGISTER_SYCL_KERNEL(bool);
 #undef REGISTER_SYCL_KERNEL
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 class DeleteSessionTensorOp : public OpKernel {
  public:
diff --git a/tensorflow/core/kernels/shape_ops.h b/tensorflow/core/kernels/shape_ops.h
index 8d9d0ea84612b51bdcd597698b89e3b8ffb8a915..55be308901b2b1233090c097944f441a17938125 100644
--- a/tensorflow/core/kernels/shape_ops.h
+++ b/tensorflow/core/kernels/shape_ops.h
@@ -235,10 +235,10 @@ class SqueezeOp : public OpKernel {
       if (!wrapped_squeeze_dims.empty()) {
         if (wrapped_squeeze_dims.count(i) > 0) {
           OP_REQUIRES(ctx, existing_dim == 1,
-                      errors::InvalidArgument("Tried to explicitly squeeze "
-                                              "dimension ",
-                                              i, " but dimension was not 1: ",
-                                              existing_dim));
+                      errors::InvalidArgument(
+                          "Tried to explicitly squeeze "
+                          "dimension ",
+                          i, " but dimension was not 1: ", existing_dim));
         } else {
           // This dimension is not being squeezed.
           new_shape.push_back(existing_dim);
diff --git a/tensorflow/core/kernels/slice_op.cc b/tensorflow/core/kernels/slice_op.cc
index 82595de77947fab01a2107e009982f6db96601e5..79369fd4a9cc1668bc12cfdb466ad2ec2bbe8d11 100644
--- a/tensorflow/core/kernels/slice_op.cc
+++ b/tensorflow/core/kernels/slice_op.cc
@@ -58,7 +58,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 // Shared code that is not dependent on the type of T.  We do this to reduce
 // code size by not duplicating all this for all T (float, double, int32, etc.)
@@ -72,10 +72,11 @@ static void SharedValidation(OpKernelContext* context,
   const Tensor& size_tensor = context->input(2);
 
   OP_REQUIRES(
-      context, context->op_kernel().IsLegacyVector(begin_tensor.shape()) &&
-                   context->op_kernel().IsLegacyVector(size_tensor.shape()) &&
-                   begin_tensor.NumElements() == input.dims() &&
-                   size_tensor.NumElements() == input.dims(),
+      context,
+      context->op_kernel().IsLegacyVector(begin_tensor.shape()) &&
+          context->op_kernel().IsLegacyVector(size_tensor.shape()) &&
+          begin_tensor.NumElements() == input.dims() &&
+          size_tensor.NumElements() == input.dims(),
       errors::InvalidArgument(
           "Expected begin and size arguments to be 1-D tensors of size ",
           input.dims(), ", but got shapes ", begin_tensor.shape().DebugString(),
@@ -125,8 +126,7 @@ static void SharedSliceCommonCases(OpKernelContext* context,
                                    TensorShape* output_shape,
                                    gtl::InlinedVector<int64, 4>* begin,
                                    gtl::InlinedVector<int64, 4>* size,
-                                   Tensor** result,
-                                   bool* done) {
+                                   Tensor** result, bool* done) {
   bool is_identity = true;
   bool slice_dim0 = true;
   *done = false;
@@ -142,8 +142,8 @@ static void SharedSliceCommonCases(OpKernelContext* context,
     return;
   }
 
-  if (slice_dim0 && IsDim0SliceAligned<T>(input.shape(), (*begin)[0],
-                                          (*size)[0])) {
+  if (slice_dim0 &&
+      IsDim0SliceAligned<T>(input.shape(), (*begin)[0], (*size)[0])) {
     VLOG(1) << "Slice dim 0: " << input.shape().DebugString();
     CHECK_GE(input.dims(), 1);  // Otherwise, is_identity should be true.
     context->set_output(0, input.Slice((*begin)[0], (*begin)[0] + (*size)[0]));
@@ -154,7 +154,6 @@ static void SharedSliceCommonCases(OpKernelContext* context,
   OP_REQUIRES_OK(context, context->allocate_output(0, *output_shape, result));
 }
 
-
 template <typename Device, typename T>
 class SliceOp : public OpKernel {
  public:
@@ -206,8 +205,9 @@ class SliceOp : public OpKernel {
 
 #undef HANDLE_DIM
 
-      OP_REQUIRES(context, false, errors::Unimplemented(
-                                      "SliceOp : Unhandled input dimensions"));
+      OP_REQUIRES(
+          context, false,
+          errors::Unimplemented("SliceOp : Unhandled input dimensions"));
     }
   }
 
@@ -280,8 +280,9 @@ class MklSliceOp : public OpKernel {
 
 #undef HANDLE_DIM
 
-      OP_REQUIRES(context, false, errors::Unimplemented(
-                                      "SliceOp : Unhandled input dimensions"));
+      OP_REQUIRES(
+          context, false,
+          errors::Unimplemented("SliceOp : Unhandled input dimensions"));
     }
   }
 
@@ -292,9 +293,9 @@ class MklSliceOp : public OpKernel {
   // as the sizes of all the dimensions of the input except slice_dim, then
   // returns True. Otherwise, returns False.
   bool DoesSliceShapeDifferInOnly1DHelper(const TensorShape& input_shape,
-                          const gtl::ArraySlice<int64>& begin,
-                          const gtl::ArraySlice<int64>& size,
-                          int slice_dim) {
+                                          const gtl::ArraySlice<int64>& begin,
+                                          const gtl::ArraySlice<int64>& size,
+                                          int slice_dim) {
     for (int dim = 0; dim < 4; dim++) {
       if (dim != slice_dim &&
           (begin[dim] != 0 || size[dim] != input_shape.dim_size(dim))) {
@@ -316,9 +317,9 @@ class MklSliceOp : public OpKernel {
   // Returns True if Slicing over a single dimension, and sets slice_dim
   // to the number of the dimension that satisfies criteria.
   bool DoesSliceShapeDifferInOnly1D(const TensorShape& input_shape,
-                          const gtl::ArraySlice<int64>& begin,
-                          const gtl::ArraySlice<int64>& size,
-                          int* slice_dim) {
+                                    const gtl::ArraySlice<int64>& begin,
+                                    const gtl::ArraySlice<int64>& size,
+                                    int* slice_dim) {
     for (int dim = 0; dim < 4; dim++) {
       if (DoesSliceShapeDifferInOnly1DHelper(input_shape, begin, size, dim)) {
         *slice_dim = dim;
@@ -329,8 +330,7 @@ class MklSliceOp : public OpKernel {
   }
 
   template <int NDIM>
-  void HandleCase(OpKernelContext* context,
-                  const gtl::ArraySlice<int64>& begin,
+  void HandleCase(OpKernelContext* context, const gtl::ArraySlice<int64>& begin,
                   const gtl::ArraySlice<int64>& size, Tensor* result) {
     int slice_dim = -1;
     TensorShape in_shape = context->input(0).shape();
@@ -340,67 +340,63 @@ class MklSliceOp : public OpKernel {
     // format over channel dimension.
     if (NDIM == 4 &&
         DoesSliceShapeDifferInOnly1D(in_shape, begin, size, &slice_dim)) {
-        size_t in_strides[4] = { (size_t) in_shape.dim_size(1) *
-                                          in_shape.dim_size(2) *
-                                          in_shape.dim_size(3),
-                                 (size_t) in_shape.dim_size(2) *
-                                          in_shape.dim_size(3),
-                                 (size_t) in_shape.dim_size(3),
-                                 (size_t) 1
-                               };
-
-        size_t out_strides[4] = { (size_t) size[1] * size[2] * size[3],
-                                  (size_t) size[2] * size[3],
-                                  (size_t) size[3],
-                                  (size_t) 1 };
-
-        T *in_buf = const_cast<T*>(const_cast<const T*>(
-                    context->input(0).flat<T>().data()));
-        T *op_buf = result->flat<T>().data();
-
-        if (slice_dim == 1) {
-          /* data format = NCHW */
-
-          #pragma omp parallel for
-          for (size_t d0 = begin[0]; d0 < begin[0] + size[0]; d0++) {
-              T *ip  = in_buf + (d0 * in_strides[0]);
-              T *op  = op_buf + ((d0 - begin[0]) * out_strides[0]);
-            #pragma omp parallel for
-            for (size_t d1 = begin[1]; d1 < begin[1] + size[1]; d1++) {
-              T *ip1 = ip + (d1 * in_strides[1]);
-              T *op1 = op + ((d1 - begin[1]) * out_strides[1]);
-              // For NCHW, H and W will be contiguous. So we can copy
-              // both with one memcpy.
-              memcpy(static_cast<void*>(op1), static_cast<void*>(ip1),
-                     sizeof(T) * in_strides[1]);
-            }
+      size_t in_strides[4] = {
+          (size_t)in_shape.dim_size(1) * in_shape.dim_size(2) *
+              in_shape.dim_size(3),
+          (size_t)in_shape.dim_size(2) * in_shape.dim_size(3),
+          (size_t)in_shape.dim_size(3), (size_t)1};
+
+      size_t out_strides[4] = {(size_t)size[1] * size[2] * size[3],
+                               (size_t)size[2] * size[3], (size_t)size[3],
+                               (size_t)1};
+
+      T* in_buf = const_cast<T*>(
+          const_cast<const T*>(context->input(0).flat<T>().data()));
+      T* op_buf = result->flat<T>().data();
+
+      if (slice_dim == 1) {
+        /* data format = NCHW */
+
+#pragma omp parallel for
+        for (size_t d0 = begin[0]; d0 < begin[0] + size[0]; d0++) {
+          T* ip = in_buf + (d0 * in_strides[0]);
+          T* op = op_buf + ((d0 - begin[0]) * out_strides[0]);
+#pragma omp parallel for
+          for (size_t d1 = begin[1]; d1 < begin[1] + size[1]; d1++) {
+            T* ip1 = ip + (d1 * in_strides[1]);
+            T* op1 = op + ((d1 - begin[1]) * out_strides[1]);
+            // For NCHW, H and W will be contiguous. So we can copy
+            // both with one memcpy.
+            memcpy(static_cast<void*>(op1), static_cast<void*>(ip1),
+                   sizeof(T) * in_strides[1]);
           }
-          return;
-        } else if (slice_dim == 3) {
-          /* data_format = NHWC */
-
-          #pragma omp parallel for
-          for (size_t d0 = begin[0]; d0 < begin[0] + size[0]; d0++) {
-              T *ip = in_buf + (d0 * in_strides[0]);
-              T *op = op_buf + ((d0 - begin[0]) * out_strides[0]);
-            #pragma omp parallel for
-            for (size_t d1 = begin[1]; d1 < begin[1] + size[1]; d1++) {
-              T *ip1 = ip + (d1 * in_strides[1]);
-              T *op1 = op + ((d1 - begin[1]) * out_strides[1]);
-              #pragma omp parallel for
-              for (size_t d2 = begin[2]; d2 < begin[2] + size[2]; d2++) {
-                T *ip2 = ip1 + (d2 * in_strides[2]);
-                T *ip3 = ip2 + begin[3];
-                T *op2 = op1 + ((d2 - begin[2]) * out_strides[2]);
-                T *op3 = op2;
-                memcpy(static_cast<void*>(op3), static_cast<void*>(ip3),
-                       sizeof(T) * size[3]);
-              }
+        }
+        return;
+      } else if (slice_dim == 3) {
+        /* data_format = NHWC */
+
+#pragma omp parallel for
+        for (size_t d0 = begin[0]; d0 < begin[0] + size[0]; d0++) {
+          T* ip = in_buf + (d0 * in_strides[0]);
+          T* op = op_buf + ((d0 - begin[0]) * out_strides[0]);
+#pragma omp parallel for
+          for (size_t d1 = begin[1]; d1 < begin[1] + size[1]; d1++) {
+            T* ip1 = ip + (d1 * in_strides[1]);
+            T* op1 = op + ((d1 - begin[1]) * out_strides[1]);
+#pragma omp parallel for
+            for (size_t d2 = begin[2]; d2 < begin[2] + size[2]; d2++) {
+              T* ip2 = ip1 + (d2 * in_strides[2]);
+              T* ip3 = ip2 + begin[3];
+              T* op2 = op1 + ((d2 - begin[2]) * out_strides[2]);
+              T* op3 = op2;
+              memcpy(static_cast<void*>(op3), static_cast<void*>(ip3),
+                     sizeof(T) * size[3]);
             }
           }
-          return;
         }
-        // slice_dim is not 1 or 3, then we fallback to Eigen implementation.
+        return;
+      }
+      // slice_dim is not 1 or 3, then we fallback to Eigen implementation.
     }
 
     Eigen::DSizes<Eigen::DenseIndex, NDIM> indices;
@@ -535,13 +531,13 @@ REGISTER_KERNEL_BUILDER(Name("Slice")
 #ifdef TENSORFLOW_USE_SYCL
 // Forward declarations of the functor specializations for SYCL.
 namespace functor {
-#define DECLARE_SYCL_SPEC(T, NDIM)                                 \
-  template <>                                                      \
-  void Slice<SYCLDevice, T, NDIM>::operator()(                     \
-      const SYCLDevice& d, typename TTypes<T, NDIM>::Tensor output,\
-      typename TTypes<T, NDIM>::ConstTensor input,                 \
-      const Eigen::DSizes<Eigen::DenseIndex, NDIM>& indices,       \
-      const Eigen::DSizes<Eigen::DenseIndex, NDIM>& sizes);        \
+#define DECLARE_SYCL_SPEC(T, NDIM)                                  \
+  template <>                                                       \
+  void Slice<SYCLDevice, T, NDIM>::operator()(                      \
+      const SYCLDevice& d, typename TTypes<T, NDIM>::Tensor output, \
+      typename TTypes<T, NDIM>::ConstTensor input,                  \
+      const Eigen::DSizes<Eigen::DenseIndex, NDIM>& indices,        \
+      const Eigen::DSizes<Eigen::DenseIndex, NDIM>& sizes);         \
   extern template struct Slice<SYCLDevice, T, NDIM>;
 
 #define DECLARE_FOR_N(T)   \
diff --git a/tensorflow/core/kernels/slice_op.h b/tensorflow/core/kernels/slice_op.h
index 0362a021336f633b88a666c68f42fa5082f4f66d..db7eded745eb0d3c880dc46d164aad31b2531829 100644
--- a/tensorflow/core/kernels/slice_op.h
+++ b/tensorflow/core/kernels/slice_op.h
@@ -24,7 +24,6 @@ limitations under the License.
 namespace tensorflow {
 namespace functor {
 
-
 template <typename Device, typename T, int NDIMS>
 struct Slice {
   void operator()(const Device& d, typename TTypes<T, NDIMS>::Tensor output,
diff --git a/tensorflow/core/kernels/slice_op_cpu_impl.h b/tensorflow/core/kernels/slice_op_cpu_impl.h
index 47f1d5342a9e56301dabad2eb9700ce97d45695d..64b6948190a23b554582975d38dae8be638840fa 100644
--- a/tensorflow/core/kernels/slice_op_cpu_impl.h
+++ b/tensorflow/core/kernels/slice_op_cpu_impl.h
@@ -43,7 +43,7 @@ TF_CALL_GPU_NUMBER_TYPES(DEFINE_SYCL_KERNELS);
 DEFINE_SYCL_KERNELS(int32);
 
 #undef DEFINE_SYCL_KERNELS
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/kernels/softmax_op.cc b/tensorflow/core/kernels/softmax_op.cc
index 590f01c4691f479cbf90971b368656ff3c78c91a..e1712ac239d6be2d51b0c0598a799959a8b53a94 100644
--- a/tensorflow/core/kernels/softmax_op.cc
+++ b/tensorflow/core/kernels/softmax_op.cc
@@ -30,7 +30,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 // Partial specialization for a CPUDevice, that uses the Eigen implementation
 // from SoftmaxEigenImpl.
@@ -48,7 +48,7 @@ struct SoftmaxFunctor<CPUDevice, T> : SoftmaxFunctorBase<CPUDevice, T> {};
 #ifdef TENSORFLOW_USE_SYCL
 template <typename T>
 struct SoftmaxFunctor<SYCLDevice, T> : SoftmaxFunctorBase<SYCLDevice, T> {};
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace functor
 
 template <typename Device, typename T>
@@ -100,5 +100,5 @@ REGISTER_KERNEL_BUILDER(
 REGISTER_KERNEL_BUILDER(
     Name("Softmax").Device(DEVICE_SYCL).TypeConstraint<double>("T"),
     SoftmaxOp<SYCLDevice, double>);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/spacetobatch_benchmark_test.cc b/tensorflow/core/kernels/spacetobatch_benchmark_test.cc
index c25ce2d8bb5ee5fe50034e74f0362fd6b0e79589..92ddf8edbfbe5e3c8fbc2c3b5ddeddd847838814 100644
--- a/tensorflow/core/kernels/spacetobatch_benchmark_test.cc
+++ b/tensorflow/core/kernels/spacetobatch_benchmark_test.cc
@@ -70,7 +70,7 @@ static Graph* ConstructSpaceToBatchGraph(
   }                                                                                                     \
   BENCHMARK(                                                                                            \
       BM_##OP##_##DEVICE##_##DTYPE##_##B##_##H##_##W##_##D##_bs##BS##_pad##P00##_##P01##_##P10##_##P11);
-#define BM_SpaceToBatch(OP, ...)                      \
+#define BM_SpaceToBatch(OP, ...)                                 \
   BM_Expand(BM_SpaceToBatchDev(OP, cpu, DT_FLOAT, __VA_ARGS__)); \
   BM_Expand(BM_SpaceToBatchDev(OP, gpu, DT_FLOAT, __VA_ARGS__)); \
   BM_Expand(BM_SpaceToBatchDev(OP, cpu, DT_HALF, __VA_ARGS__));  \
diff --git a/tensorflow/core/kernels/spacetobatch_functor.cc b/tensorflow/core/kernels/spacetobatch_functor.cc
index 23d8a5f9ed4483c0e7d5c15108db6cbbdbe0890a..4c374b8d99444023c14fcb4ed770a5c263535be0 100644
--- a/tensorflow/core/kernels/spacetobatch_functor.cc
+++ b/tensorflow/core/kernels/spacetobatch_functor.cc
@@ -154,7 +154,7 @@ struct SpaceToBatchFunctor<CPUDevice, T, NUM_BLOCK_DIMS, B2S> {
 #define INSTANTIATE(NUM_BLOCK_DIMS, T)                                      \
   template struct SpaceToBatchFunctor<CPUDevice, T, NUM_BLOCK_DIMS, false>; \
   template struct SpaceToBatchFunctor<CPUDevice, T, NUM_BLOCK_DIMS, true>;  \
-/**/
+  /**/
 
 #define INSTANTIATE_FOR_T(T) \
   TF_SPACETOBATCH_FOR_EACH_NUM_BLOCK_DIMS(INSTANTIATE, T)
diff --git a/tensorflow/core/kernels/spacetobatch_functor.h b/tensorflow/core/kernels/spacetobatch_functor.h
index 06813650c08ec26a38edfe2ba01440a2fb8066fc..f46a84da1e951113382e4d44b44463c2a621ca10 100644
--- a/tensorflow/core/kernels/spacetobatch_functor.h
+++ b/tensorflow/core/kernels/spacetobatch_functor.h
@@ -44,7 +44,7 @@ constexpr int kMaxSpaceToBatchBlockDims = 4;
   MACRO(2 /**/, ##__VA_ARGS__)                              \
   MACRO(3 /**/, ##__VA_ARGS__)                              \
   MACRO(4 /**/, ##__VA_ARGS__)                              \
-/**/
+  /**/
 
 namespace internal {
 namespace spacetobatch {
diff --git a/tensorflow/core/kernels/spacetobatch_functor_gpu.cu.cc b/tensorflow/core/kernels/spacetobatch_functor_gpu.cu.cc
index db8d419c38ff5f8a06a1aafde14076b55b7c75e6..5687141c9eaeec11498c1d2cc954155bd9e05856 100644
--- a/tensorflow/core/kernels/spacetobatch_functor_gpu.cu.cc
+++ b/tensorflow/core/kernels/spacetobatch_functor_gpu.cu.cc
@@ -141,10 +141,10 @@ struct SpaceToBatchFunctor<GPUDevice, T, NUM_BLOCK_DIMS, B2S> {
     }
     CudaLaunchConfig config =
         GetCudaLaunchConfig(static_cast<int32>(total_count), d);
-    S2B<T, NUM_BLOCK_DIMS,
-        B2S><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-        config.virtual_thread_count, const_cast<T*>(space_tensor.data()), args,
-        const_cast<T*>(batch_tensor.data()));
+    S2B<T, NUM_BLOCK_DIMS, B2S>
+        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+            config.virtual_thread_count, const_cast<T*>(space_tensor.data()),
+            args, const_cast<T*>(batch_tensor.data()));
     return Status::OK();
   }
 };
@@ -153,7 +153,7 @@ struct SpaceToBatchFunctor<GPUDevice, T, NUM_BLOCK_DIMS, B2S> {
 #define INSTANTIATE(NUM_BLOCK_DIMS, T)                                      \
   template struct SpaceToBatchFunctor<GPUDevice, T, NUM_BLOCK_DIMS, false>; \
   template struct SpaceToBatchFunctor<GPUDevice, T, NUM_BLOCK_DIMS, true>;  \
-/**/
+  /**/
 
 #define INSTANTIATE_FOR_T(T) \
   TF_SPACETOBATCH_FOR_EACH_NUM_BLOCK_DIMS(INSTANTIATE, T)
diff --git a/tensorflow/core/kernels/spacetobatch_op.cc b/tensorflow/core/kernels/spacetobatch_op.cc
index 95c1f5e7e8ca978fda334396538de0cf4ed5b774..fdc08ec8e3bfd128a3e341efab8e5ba319c90e4f 100644
--- a/tensorflow/core/kernels/spacetobatch_op.cc
+++ b/tensorflow/core/kernels/spacetobatch_op.cc
@@ -58,9 +58,10 @@ void SpaceToBatchOpCompute(OpKernelContext* context,
       errors::InvalidArgument("input rank should be >= ", 1 + block_dims,
                               " instead of ", orig_input_tensor.dims()));
 
-  OP_REQUIRES(context, TensorShapeUtils::IsMatrix(orig_paddings.shape()) &&
-                           block_dims == orig_paddings.dim_size(0) &&
-                           2 == orig_paddings.dim_size(1),
+  OP_REQUIRES(context,
+              TensorShapeUtils::IsMatrix(orig_paddings.shape()) &&
+                  block_dims == orig_paddings.dim_size(0) &&
+                  2 == orig_paddings.dim_size(1),
               errors::InvalidArgument("paddings should have shape [",
                                       block_dims, ", 2] instead of ",
                                       orig_paddings.shape().DebugString()));
diff --git a/tensorflow/core/kernels/sparse_add_grad_op.cc b/tensorflow/core/kernels/sparse_add_grad_op.cc
index d8ed0c6f0c20d13d5e7870159ed1569514333c5e..8597f3a8f7307584d27a265bc8df8949f20898b6 100644
--- a/tensorflow/core/kernels/sparse_add_grad_op.cc
+++ b/tensorflow/core/kernels/sparse_add_grad_op.cc
@@ -35,9 +35,10 @@ class SparseAddGradOp : public OpKernel {
     OP_REQUIRES_OK(ctx, ctx->input("b_indices", &b_indices));
     OP_REQUIRES_OK(ctx, ctx->input("sum_indices", &sum_indices));
 
-    OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(a_indices->shape()) &&
-                         TensorShapeUtils::IsMatrix(b_indices->shape()) &&
-                         TensorShapeUtils::IsMatrix(sum_indices->shape()),
+    OP_REQUIRES(ctx,
+                TensorShapeUtils::IsMatrix(a_indices->shape()) &&
+                    TensorShapeUtils::IsMatrix(b_indices->shape()) &&
+                    TensorShapeUtils::IsMatrix(sum_indices->shape()),
                 errors::InvalidArgument(
                     "Input indices should be matrices but received shapes: ",
                     a_indices->shape().DebugString(), " and ",
@@ -49,8 +50,9 @@ class SparseAddGradOp : public OpKernel {
             "Input backprop_val_grad should be a vector but received shape: ",
             backprop_val_grad->shape().DebugString()));
     OP_REQUIRES(
-        ctx, a_indices->dim_size(1) == b_indices->dim_size(1) &&
-                 b_indices->dim_size(1) == sum_indices->dim_size(1),
+        ctx,
+        a_indices->dim_size(1) == b_indices->dim_size(1) &&
+            b_indices->dim_size(1) == sum_indices->dim_size(1),
         errors::InvalidArgument("The densified operands should have the same "
                                 "ndims; for A, B, sum got: ",
                                 a_indices->dim_size(1), b_indices->dim_size(1),
diff --git a/tensorflow/core/kernels/sparse_add_op.cc b/tensorflow/core/kernels/sparse_add_op.cc
index bd91dfdce64cbfc697345e0f0c7278de938ecc5b..d16317af671dd6592d3e30ac52941508c4ffd088 100644
--- a/tensorflow/core/kernels/sparse_add_op.cc
+++ b/tensorflow/core/kernels/sparse_add_op.cc
@@ -34,8 +34,9 @@ class SparseAddOp : public OpKernel {
 
     OP_REQUIRES_OK(ctx, ctx->input("a_indices", &a_indices));
     OP_REQUIRES_OK(ctx, ctx->input("b_indices", &b_indices));
-    OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(a_indices->shape()) &&
-                         TensorShapeUtils::IsMatrix(b_indices->shape()),
+    OP_REQUIRES(ctx,
+                TensorShapeUtils::IsMatrix(a_indices->shape()) &&
+                    TensorShapeUtils::IsMatrix(b_indices->shape()),
                 errors::InvalidArgument(
                     "Input indices should be matrices but received shapes: ",
                     a_indices->shape().DebugString(), " and ",
@@ -46,8 +47,9 @@ class SparseAddOp : public OpKernel {
     OP_REQUIRES_OK(ctx, ctx->input("a_values", &a_values_t));
     OP_REQUIRES_OK(ctx, ctx->input("b_values", &b_values_t));
 
-    OP_REQUIRES(ctx, TensorShapeUtils::IsVector(a_values_t->shape()) &&
-                         TensorShapeUtils::IsVector(b_values_t->shape()),
+    OP_REQUIRES(ctx,
+                TensorShapeUtils::IsVector(a_values_t->shape()) &&
+                    TensorShapeUtils::IsVector(b_values_t->shape()),
                 errors::InvalidArgument(
                     "Input values should be vectors but received shapes: ",
                     a_values_t->shape().DebugString(), " and ",
@@ -62,8 +64,9 @@ class SparseAddOp : public OpKernel {
 
     OP_REQUIRES_OK(ctx, ctx->input("a_shape", &a_shape));
     OP_REQUIRES_OK(ctx, ctx->input("b_shape", &b_shape));
-    OP_REQUIRES(ctx, TensorShapeUtils::IsVector(a_shape->shape()) &&
-                         TensorShapeUtils::IsVector(b_shape->shape()),
+    OP_REQUIRES(ctx,
+                TensorShapeUtils::IsVector(a_shape->shape()) &&
+                    TensorShapeUtils::IsVector(b_shape->shape()),
                 errors::InvalidArgument(
                     "Input shapes should be a vector but received shapes ",
                     a_shape->shape().DebugString(), " and ",
diff --git a/tensorflow/core/kernels/sparse_add_op_test.cc b/tensorflow/core/kernels/sparse_add_op_test.cc
index 4cad02bbee8dd20328bac3ec24074c22493009b8..1f08e6c5ce2e8a40cf464760434f9161015b643c 100644
--- a/tensorflow/core/kernels/sparse_add_op_test.cc
+++ b/tensorflow/core/kernels/sparse_add_op_test.cc
@@ -61,9 +61,9 @@ TEST_F(SparseAddOpTest, TwoD_AddSparseTensorWithSelf) {
   // [3   4]
 
   const auto indices_shape = TensorShape({4, 2});
-  std::initializer_list<int64> in{ 0, 1, 1, 0, 2, 0, 2, 1 };
+  std::initializer_list<int64> in{0, 1, 1, 0, 2, 0, 2, 1};
   const gtl::ArraySlice<int64> indices(in);
-  std::initializer_list<int64> sh{ 3, 2 };
+  std::initializer_list<int64> sh{3, 2};
   const gtl::ArraySlice<int64> shape(sh);
 
 #define ADD_TENSOR_INPUT()                                  \
diff --git a/tensorflow/core/kernels/sparse_conditional_accumulator_op.cc b/tensorflow/core/kernels/sparse_conditional_accumulator_op.cc
index c122616cf15b8567494a604337951c8d278f5ead..80bc1f19344dffadaae864f64c98d1f15addd1fb 100644
--- a/tensorflow/core/kernels/sparse_conditional_accumulator_op.cc
+++ b/tensorflow/core/kernels/sparse_conditional_accumulator_op.cc
@@ -103,8 +103,9 @@ class SparseAccumulatorTakeGradientOp
                       DoneCallback callback) override {
     // Check signature
     OP_REQUIRES_OK_ASYNC(
-        ctx, ctx->MatchSignature({DT_STRING_REF, DT_INT32},
-                                 {DT_INT64, accumulator->dtype(), DT_INT64}),
+        ctx,
+        ctx->MatchSignature({DT_STRING_REF, DT_INT32},
+                            {DT_INT64, accumulator->dtype(), DT_INT64}),
         callback);
   }
 
diff --git a/tensorflow/core/kernels/sparse_cross_op.cc b/tensorflow/core/kernels/sparse_cross_op.cc
index 07d935d55fe06150309736ba0fec88091ed007c6..7cd4532ad63812d905ceb6b96291aa50293070ef 100644
--- a/tensorflow/core/kernels/sparse_cross_op.cc
+++ b/tensorflow/core/kernels/sparse_cross_op.cc
@@ -288,8 +288,7 @@ struct CrossTraits<true, int64> {
 template <bool HASHED_OUTPUT, typename InternalType>
 class SparseCrossOp : public OpKernel {
  public:
-  explicit SparseCrossOp(OpKernelConstruction* context)
-      : OpKernel(context) {
+  explicit SparseCrossOp(OpKernelConstruction* context) : OpKernel(context) {
     OP_REQUIRES_OK(context, context->GetAttr("num_buckets", &num_buckets_));
     // Read signed_hash_key_ as int64 since uint64 attributes are not
     // supported by REGISTER_OP.
@@ -316,8 +315,8 @@ class SparseCrossOp : public OpKernel {
         GenerateColumnsFromInput(indices_list_in, values_list_in,
                                  shapes_list_in, dense_list_in);
 
-    typename CrossTraits<HASHED_OUTPUT, InternalType>::Crosser
-        crosser(columns, num_buckets_, hash_key_);
+    typename CrossTraits<HASHED_OUTPUT, InternalType>::Crosser crosser(
+        columns, num_buckets_, hash_key_);
     Tensor* indices_out;
     Tensor* values_out;
     Tensor* shape_out;
@@ -326,8 +325,8 @@ class SparseCrossOp : public OpKernel {
     CreateOutputTensors(columns, batch_size, context, &indices_out, &values_out,
                         &shape_out, &output_start_indices);
 
-    typename CrossTraits<HASHED_OUTPUT, InternalType>::Updater
-        updater(output_start_indices, indices_out, values_out);
+    typename CrossTraits<HASHED_OUTPUT, InternalType>::Updater updater(
+        output_start_indices, indices_out, values_out);
     auto do_work = [this, &columns, crosser, updater](int64 begin, int64 end) {
       for (int b = begin; b < end; b++) {
         ProductIterator<InternalType> product_iterator(columns, b);
@@ -381,8 +380,9 @@ class SparseCrossOp : public OpKernel {
               "Input values should be a std::vector but received shape ",
               values_list_in[i].shape().DebugString(), " at position ", i));
       OP_REQUIRES(
-          context, indices_list_in[i].shape().dim_size(0) ==
-                       values_list_in[i].shape().dim_size(0),
+          context,
+          indices_list_in[i].shape().dim_size(0) ==
+              values_list_in[i].shape().dim_size(0),
           errors::InvalidArgument(
               "Expected size of values to be ",
               indices_list_in[i].shape().dim_size(0), " got ",
diff --git a/tensorflow/core/kernels/sparse_dense_binary_op_shared.cc b/tensorflow/core/kernels/sparse_dense_binary_op_shared.cc
index cc0f86ce05e613767b22d51875f90e8391504bdb..ac48202ada2204ea36478257630f20f7892be50b 100644
--- a/tensorflow/core/kernels/sparse_dense_binary_op_shared.cc
+++ b/tensorflow/core/kernels/sparse_dense_binary_op_shared.cc
@@ -70,8 +70,9 @@ class SparseDenseBinaryOpShared : public OpKernel {
                 errors::InvalidArgument(
                     "Input sp_indices should be a matrix but received shape: ",
                     indices_t->shape().DebugString()));
-    OP_REQUIRES(ctx, TensorShapeUtils::IsVector(values_t->shape()) &&
-                         TensorShapeUtils::IsVector(shape_t->shape()),
+    OP_REQUIRES(ctx,
+                TensorShapeUtils::IsVector(values_t->shape()) &&
+                    TensorShapeUtils::IsVector(shape_t->shape()),
                 errors::InvalidArgument(
                     "Inputs sp_values and sp_shape should be vectors "
                     "but received shapes: ",
@@ -150,8 +151,9 @@ class SparseDenseBinaryOpShared : public OpKernel {
       CASE(4);
       CASE(5);
       default:
-        OP_REQUIRES(ctx, false, errors::InvalidArgument(
-                                    "Only tensors with ranks between 1 and 5 "
+        OP_REQUIRES(
+            ctx, false,
+            errors::InvalidArgument("Only tensors with ranks between 1 and 5 "
                                     "are currently supported.  Tensor rank: ",
                                     ndims));
 #undef CASE
diff --git a/tensorflow/core/kernels/sparse_dense_binary_op_shared_test.cc b/tensorflow/core/kernels/sparse_dense_binary_op_shared_test.cc
index eaf1884243ec19689af783e29adaee886e7498d6..fe198af7e6c131ab19daf877063a2a6838d1f2c7 100644
--- a/tensorflow/core/kernels/sparse_dense_binary_op_shared_test.cc
+++ b/tensorflow/core/kernels/sparse_dense_binary_op_shared_test.cc
@@ -96,9 +96,9 @@ TEST_F(SparseDenseCDivTest, SameShape) {
   // [2    ]  cdiv [dense: same shape, all 1's]
   // [3   4]
   const auto indices_shape = TensorShape({4, 2});
-  std::initializer_list<int64> in{ 0, 1, 1, 0, 2, 0, 2, 1 };
+  std::initializer_list<int64> in{0, 1, 1, 0, 2, 0, 2, 1};
   const gtl::ArraySlice<int64> indices(in);
-  std::initializer_list<int64> sh{ 3, 2 };
+  std::initializer_list<int64> sh{3, 2};
   const gtl::ArraySlice<int64> shape(sh);
 
   // Tensor dense(DT_FLOAT, TensorShape({3, 1}));
@@ -125,9 +125,9 @@ TEST_F(SparseDenseCDivTest, BroadcastDenseSameDims) {
   // [2    ]  cdiv [dense: shape [3,1], all 1's]
   // [3   4]
   const auto indices_shape = TensorShape({4, 2});
-  std::initializer_list<int64> in{ 0, 1, 1, 0, 2, 0, 2, 1 };
+  std::initializer_list<int64> in{0, 1, 1, 0, 2, 0, 2, 1};
   const gtl::ArraySlice<int64> indices(in);
-  std::initializer_list<int64> sh{ 3, 2 };
+  std::initializer_list<int64> sh{3, 2};
   const gtl::ArraySlice<int64> shape(sh);
 
   Tensor dense(DT_FLOAT, TensorShape({3, 1}));
@@ -152,9 +152,9 @@ TEST_F(SparseDenseCDivTest, BroadcastDenseFewerDims) {
   // [2    ]  cdiv [dense: shape [2]]
   // [3   4]
   const auto indices_shape = TensorShape({4, 2});
-  std::initializer_list<int64> in{ 0, 1, 1, 0, 2, 0, 2, 1 };
+  std::initializer_list<int64> in{0, 1, 1, 0, 2, 0, 2, 1};
   const gtl::ArraySlice<int64> indices(in);
-  std::initializer_list<int64> sh{ 3, 2 };
+  std::initializer_list<int64> sh{3, 2};
   const gtl::ArraySlice<int64> shape(sh);
 
   Tensor dense(DT_FLOAT, TensorShape({2}));
@@ -184,9 +184,9 @@ TEST_F(SparseDenseCMulTest, BroadcastDense) {
   // [1   ?]  where ? remains implicitly zero.
   // [1.5 0]
   const auto indices_shape = TensorShape({4, 2});
-  std::initializer_list<int64> in{ 0, 1, 1, 0, 2, 0, 2, 1 };
+  std::initializer_list<int64> in{0, 1, 1, 0, 2, 0, 2, 1};
   const gtl::ArraySlice<int64> indices(in);
-  std::initializer_list<int64> sh{ 3, 2 };
+  std::initializer_list<int64> sh{3, 2};
   const gtl::ArraySlice<int64> shape(sh);
 
   Tensor dense(DT_FLOAT, TensorShape({2}));
diff --git a/tensorflow/core/kernels/sparse_matmul_op.cc b/tensorflow/core/kernels/sparse_matmul_op.cc
index 8ab23b64d3d94c604ae027bbfd75357a4e2e284b..a1f9667b783ca5f455523874bc4e342f1368d4f3 100644
--- a/tensorflow/core/kernels/sparse_matmul_op.cc
+++ b/tensorflow/core/kernels/sparse_matmul_op.cc
@@ -159,8 +159,8 @@ struct SparseSlice {
 
 template <typename T>
 template <bool Transpose>
-void SparseSlice<T>::Initialize(const typename SparseSlice<T>::ConstMatrixMap& mat,
-                                int col_offset) {
+void SparseSlice<T>::Initialize(
+    const typename SparseSlice<T>::ConstMatrixMap& mat, int col_offset) {
   const int mat_rows = Transpose ? mat.dimension(1) : mat.dimension(0);
   const int mat_cols = Transpose ? mat.dimension(0) : mat.dimension(1);
   DCHECK_LE(num_rows, mat_rows);
@@ -278,9 +278,9 @@ ALWAYS_INLINE float ConvertBfloat16ToFloat(const bfloat16* src) {
   float out = 0;
   auto tmp = reinterpret_cast<bfloat16*>(&out);
 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
-    tmp[0] = *src;
+  tmp[0] = *src;
 #else
-    tmp[1] = *src;
+  tmp[1] = *src;
 #endif
   return out;
 }
@@ -970,9 +970,9 @@ class SparseMatMulOp : public OpKernel {
     const int k2 = transpose_b_ ? b.dim_size(1) : b.dim_size(0);
 
     OP_REQUIRES(ctx, k == k2,
-                errors::InvalidArgument("Matrix size incompatible: a: ",
-                                        a.shape().DebugString(), ", b: ",
-                                        b.shape().DebugString()));
+                errors::InvalidArgument(
+                    "Matrix size incompatible: a: ", a.shape().DebugString(),
+                    ", b: ", b.shape().DebugString()));
     Tensor* output = nullptr;
     OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({m, n}), &output));
 
@@ -1224,8 +1224,9 @@ ALWAYS_INLINE void CopyAndMayBeInterleave(void* dst, const void* src,
 
 template <typename TL, typename TR>
 inline BlockingCounter* SparseMatMul<TL, TR>::ShuffleMatrix(
-    const typename SparseMatMul<TL, TR>::ConstMatrixMapR& mat, int slice_row_start,
-    int slice_num_rows, int slice_col_start, int slice_num_cols, const int N,
+    const typename SparseMatMul<TL, TR>::ConstMatrixMapR& mat,
+    int slice_row_start, int slice_num_rows, int slice_col_start,
+    int slice_num_cols, const int N,
     const DeviceBase::CpuWorkerThreads* thread_pool, MatrixR* buffer) {
   DCHECK_EQ(N % 2, 0);
   DCHECK_LE(kNumOperands * sizeof(float) / sizeof(TR), N);
@@ -1306,8 +1307,9 @@ inline std::unique_ptr<BlockingCounter> SparseMatMul<TL, TR>::CreateDenseSlices(
 template <typename TL, typename TR>
 inline void SparseMatMul<TL, TR>::ComputeBlockSizes(
     const typename SparseMatMul<TL, TR>::ConstMatrixMapL& left,
-    const typename SparseMatMul<TL, TR>::ConstMatrixMapR& right, bool transpose_left,
-    int num_threads, int* KR, int* NR, int* KL, int* JB, int* IB) {
+    const typename SparseMatMul<TL, TR>::ConstMatrixMapR& right,
+    bool transpose_left, int num_threads, int* KR, int* NR, int* KL, int* JB,
+    int* IB) {
   // Heuristics for calculating block sizes
   // Assume two hyperthreads per core.
   const int est_num_cores = std::max(1, (num_threads + 1) / 2);
diff --git a/tensorflow/core/kernels/sparse_matmul_op.h b/tensorflow/core/kernels/sparse_matmul_op.h
index cca52558ae25a7a0840d8551440f68ccc5ec2277..14ef2ed7044a796dff67e287230d955e32ca62cd 100644
--- a/tensorflow/core/kernels/sparse_matmul_op.h
+++ b/tensorflow/core/kernels/sparse_matmul_op.h
@@ -159,25 +159,25 @@ EIGEN_STRONG_INLINE Packet4f pload2bf16<Packet4f>(const float* from) {
 // Return a packet with the first value of the input Packet replicated
 template <>
 EIGEN_STRONG_INLINE Packet4f pbroadcast_first<Packet4f>(const Packet4f& a) {
-  return vec_splat (a, 0);
+  return vec_splat(a, 0);
 }
 
 // Return a packet with the second value of the input Packet replicated
 template <>
 EIGEN_STRONG_INLINE Packet4f pbroadcast_second<Packet4f>(const Packet4f& a) {
-  return vec_splat (a, 1);
+  return vec_splat(a, 1);
 }
 
 // Return a packet with the third value of the input Packet replicated
 template <>
 EIGEN_STRONG_INLINE Packet4f pbroadcast_third<Packet4f>(const Packet4f& a) {
-  return vec_splat (a, 2);
+  return vec_splat(a, 2);
 }
 
 // Return a packet with the fourth value of the input Packet replicated
 template <>
 EIGEN_STRONG_INLINE Packet4f pbroadcast_fourth<Packet4f>(const Packet4f& a) {
-  return vec_splat (a, 3);
+  return vec_splat(a, 3);
 }
 #endif
 
diff --git a/tensorflow/core/kernels/sparse_matmul_op_test.cc b/tensorflow/core/kernels/sparse_matmul_op_test.cc
index f815ca9e344664c4c95befccb88e750eb99d0eaf..ebc6d8fa4ec5422925e57c25856e0007702299b1 100644
--- a/tensorflow/core/kernels/sparse_matmul_op_test.cc
+++ b/tensorflow/core/kernels/sparse_matmul_op_test.cc
@@ -284,11 +284,11 @@ class SparseMatmulOpTest : public ::testing::Test {
       uint16_t* data3_bfloat16_p =
           reinterpret_cast<uint16_t*>(data3_bfloat16) + i;
 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
-            data3_p[1] = 0;
-            data3_bfloat16_p[0] = data3_p[0];
+      data3_p[1] = 0;
+      data3_bfloat16_p[0] = data3_p[0];
 #else
-            data3_p[0] = 0;
-            data3_bfloat16_p[0] = data3_p[1];
+      data3_p[0] = 0;
+      data3_bfloat16_p[0] = data3_p[1];
 #endif
     }
   }
diff --git a/tensorflow/core/kernels/sparse_reduce_sum_op_test.cc b/tensorflow/core/kernels/sparse_reduce_sum_op_test.cc
index 110376be42573fe31cc1a13306c80e5050477f03..96246c7a71272bf638523fafb548b7e802f09039 100644
--- a/tensorflow/core/kernels/sparse_reduce_sum_op_test.cc
+++ b/tensorflow/core/kernels/sparse_reduce_sum_op_test.cc
@@ -51,9 +51,9 @@ TEST_F(SparseReduceSumOpTest, SimpleReduce) {
   // [3   4]
 
   const auto indices_shape = TensorShape({4, 2});
-  std::initializer_list<int64> in{ 0, 1, 1, 0, 2, 0, 2, 1 };
+  std::initializer_list<int64> in{0, 1, 1, 0, 2, 0, 2, 1};
   const gtl::ArraySlice<int64> indices(in);
-  std::initializer_list<int64> sh{ 3, 2 };
+  std::initializer_list<int64> sh{3, 2};
   const gtl::ArraySlice<int64> shape(sh);
 
   AddInputFromArray<int64>(indices_shape, indices);
@@ -93,9 +93,9 @@ TEST_F(SparseReduceSumSparseOpTest, SimpleReduce) {
   // [3   4]
 
   const auto indices_shape = TensorShape({4, 2});
-  std::initializer_list<int64> in{ 0, 1, 1, 0, 2, 0, 2, 1 };
+  std::initializer_list<int64> in{0, 1, 1, 0, 2, 0, 2, 1};
   const gtl::ArraySlice<int64> indices(in);
-  std::initializer_list<int64> sh{ 3, 2 };
+  std::initializer_list<int64> sh{3, 2};
   const gtl::ArraySlice<int64> shape(sh);
 
   AddInputFromArray<int64>(indices_shape, indices);
diff --git a/tensorflow/core/kernels/sparse_softmax_op.cc b/tensorflow/core/kernels/sparse_softmax_op.cc
index 327a94b8a12e1d8568c5ca79263cc6eb78501d15..444a5f657a969290d9cc67d88c500a49a0971282 100644
--- a/tensorflow/core/kernels/sparse_softmax_op.cc
+++ b/tensorflow/core/kernels/sparse_softmax_op.cc
@@ -50,8 +50,9 @@ class SparseSoftmaxOp : public OpKernel {
                 errors::InvalidArgument(
                     "Input sp_indices should be a matrix but received shape: ",
                     indices_t->shape().DebugString()));
-    OP_REQUIRES(context, TensorShapeUtils::IsVector(values_t->shape()) &&
-                             TensorShapeUtils::IsVector(shape_t->shape()),
+    OP_REQUIRES(context,
+                TensorShapeUtils::IsVector(values_t->shape()) &&
+                    TensorShapeUtils::IsVector(shape_t->shape()),
                 errors::InvalidArgument(
                     "Inputs sp_values and sp_shape should be vectors "
                     "but received shapes: ",
diff --git a/tensorflow/core/kernels/sparse_sparse_binary_op_shared.cc b/tensorflow/core/kernels/sparse_sparse_binary_op_shared.cc
index b027adba6b384c63d119387b5b13122fb1c25b12..09cb2a6a71c7c0f0ebc9cbc2e7b1951705890a41 100644
--- a/tensorflow/core/kernels/sparse_sparse_binary_op_shared.cc
+++ b/tensorflow/core/kernels/sparse_sparse_binary_op_shared.cc
@@ -132,14 +132,16 @@ class SparseSparseBinaryOpShared : public OpKernel {
 
     // Validations.
     OP_REQUIRES(
-        ctx, TensorShapeUtils::IsMatrix(a_indices_t->shape()) &&
-                 TensorShapeUtils::IsMatrix(b_indices_t->shape()),
+        ctx,
+        TensorShapeUtils::IsMatrix(a_indices_t->shape()) &&
+            TensorShapeUtils::IsMatrix(b_indices_t->shape()),
         errors::InvalidArgument("Inputs a_indices and b_indices should be "
                                 "matrices but received shapes: ",
                                 a_indices_t->shape().DebugString(), ", ",
                                 b_indices_t->shape().DebugString()));
-    OP_REQUIRES(ctx, TensorShapeUtils::IsVector(a_values_t->shape()) &&
-                         TensorShapeUtils::IsVector(b_values_t->shape()),
+    OP_REQUIRES(ctx,
+                TensorShapeUtils::IsVector(a_values_t->shape()) &&
+                    TensorShapeUtils::IsVector(b_values_t->shape()),
                 errors::InvalidArgument(
                     "Inputs a_values and b_values should be vectors "
                     "but received shapes: ",
@@ -157,8 +159,9 @@ class SparseSparseBinaryOpShared : public OpKernel {
                                 " non-empty input values, got ",
                                 a_values.size(), " and ", b_values.size()));
 
-    OP_REQUIRES(ctx, TensorShapeUtils::IsVector(a_shape_t->shape()) &&
-                         TensorShapeUtils::IsVector(b_shape_t->shape()),
+    OP_REQUIRES(ctx,
+                TensorShapeUtils::IsVector(a_shape_t->shape()) &&
+                    TensorShapeUtils::IsVector(b_shape_t->shape()),
                 errors::InvalidArgument(
                     "Input shapes should be a vector but received shapes ",
                     a_shape_t->shape().DebugString(), " and ",
diff --git a/tensorflow/core/kernels/sparse_split_op.cc b/tensorflow/core/kernels/sparse_split_op.cc
index 6171b532aa243e6a3d8b42e5c8856aaa1c7ad207..67dcf05a6ced17fa2dbd44fb03dca21a032bcc5b 100644
--- a/tensorflow/core/kernels/sparse_split_op.cc
+++ b/tensorflow/core/kernels/sparse_split_op.cc
@@ -48,18 +48,20 @@ class SparseSplitOp : public OpKernel {
                     "Input shape should be a vector but received shape ",
                     input_shape.shape().DebugString()));
 
-    OP_REQUIRES(context, input_shape.dim_size(0) &&
-                             split_dim < input_shape.vec<int64>().size(),
-                errors::InvalidArgument(
-                    "Input split_dim should be between 0 and rank (",
-                    input_shape.vec<int64>().size(), "), got ", split_dim));
-
-    OP_REQUIRES(context, num_split_ >= 1 &&
-                             num_split_ <= input_shape.vec<int64>()(split_dim),
-                errors::InvalidArgument("Input num_split should be between 1 "
-                                        "and the splitting dimension size (",
-                                        input_shape.vec<int64>()(split_dim),
-                                        "), got ", num_split_));
+    OP_REQUIRES(
+        context,
+        input_shape.dim_size(0) && split_dim < input_shape.vec<int64>().size(),
+        errors::InvalidArgument(
+            "Input split_dim should be between 0 and rank (",
+            input_shape.vec<int64>().size(), "), got ", split_dim));
+
+    OP_REQUIRES(
+        context,
+        num_split_ >= 1 && num_split_ <= input_shape.vec<int64>()(split_dim),
+        errors::InvalidArgument("Input num_split should be between 1 "
+                                "and the splitting dimension size (",
+                                input_shape.vec<int64>()(split_dim), "), got ",
+                                num_split_));
 
     sparse::SparseTensor sparse_tensor(input_indices, input_values,
                                        TensorShape(input_shape.vec<int64>()));
diff --git a/tensorflow/core/kernels/sparse_to_dense_op.cc b/tensorflow/core/kernels/sparse_to_dense_op.cc
index 6a6cc3d81382a783aa9e34c841cb7be650dd7c87..ba3da21a4331562354e7dfce3348954fda3d46ad 100644
--- a/tensorflow/core/kernels/sparse_to_dense_op.cc
+++ b/tensorflow/core/kernels/sparse_to_dense_op.cc
@@ -73,8 +73,9 @@ class SparseToDense : public OpKernel {
     // sparse_values
     const Tensor& sparse_values = c->input(2);
     const int64 num_values = sparse_values.NumElements();
-    OP_REQUIRES(c, sparse_values.dims() == 0 ||
-                       (sparse_values.dims() == 1 && num_values == num_elems),
+    OP_REQUIRES(c,
+                sparse_values.dims() == 0 ||
+                    (sparse_values.dims() == 1 && num_values == num_elems),
                 errors::InvalidArgument("sparse_values has incorrect shape ",
                                         sparse_values.shape().DebugString(),
                                         ", should be [] or [", num_elems, "]"));
diff --git a/tensorflow/core/kernels/sparse_to_dense_op_test.cc b/tensorflow/core/kernels/sparse_to_dense_op_test.cc
index f0d19da8046e7cb3c243f1e4e6c3266a0f96d921..d8b0f93082453bab574fe5fd5edbb78041efad54 100644
--- a/tensorflow/core/kernels/sparse_to_dense_op_test.cc
+++ b/tensorflow/core/kernels/sparse_to_dense_op_test.cc
@@ -38,7 +38,6 @@ namespace {
 
 class SparseToDenseTest : public OpsTestBase {
  protected:
-
   void MakeOp(int dim, DataType index_type, DataType value_type) {
     TF_ASSERT_OK(NodeDefBuilder("sparsetodense", "SparseToDense")
                      .Input(FakeInput(index_type))
diff --git a/tensorflow/core/kernels/sparse_xent_op.cc b/tensorflow/core/kernels/sparse_xent_op.cc
index c35ba42db2915216fe74a1f82d403e9b6803f63a..f84ffd53238f7753c1b4562268be9058c6c03e6d 100644
--- a/tensorflow/core/kernels/sparse_xent_op.cc
+++ b/tensorflow/core/kernels/sparse_xent_op.cc
@@ -39,10 +39,10 @@ Status CheckInvalidLabelIndex(const Tensor& labels, int64 max_index) {
   if (*min_max_dim_value.first < 0 || *min_max_dim_value.second >= max_index) {
     bad_index = (*min_max_dim_value.first < 0) ? *min_max_dim_value.first
                                                : *min_max_dim_value.second;
-    return errors::InvalidArgument("Received a label value of ", bad_index,
-                                   " which is outside the valid range of [0, ",
-                                   max_index, ").  Label values: ",
-                                   labels.SummarizeValue(labels.NumElements()));
+    return errors::InvalidArgument(
+        "Received a label value of ", bad_index,
+        " which is outside the valid range of [0, ", max_index,
+        ").  Label values: ", labels.SummarizeValue(labels.NumElements()));
   }
   return Status::OK();
 }
diff --git a/tensorflow/core/kernels/sparse_xent_op_test.cc b/tensorflow/core/kernels/sparse_xent_op_test.cc
index b8ea0d2d7e279bc089aeb5574fc58c1af1686ca9..afb0bf76267f24ba1e2142954abfdcb41356cb96 100644
--- a/tensorflow/core/kernels/sparse_xent_op_test.cc
+++ b/tensorflow/core/kernels/sparse_xent_op_test.cc
@@ -41,10 +41,10 @@ static Graph* SparseXent(int batch_size, int num_classes) {
   return g;
 }
 
-#define BM_SparseXentDev(BATCH, CLASS, DEVICE)                                \
-  static void BM_SparseXent##_##BATCH##_##CLASS##_##DEVICE(int iters) {       \
+#define BM_SparseXentDev(BATCH, CLASS, DEVICE)                          \
+  static void BM_SparseXent##_##BATCH##_##CLASS##_##DEVICE(int iters) { \
     testing::ItemsProcessed(static_cast<int64>(iters) * BATCH * CLASS); \
-    test::Benchmark(#DEVICE, SparseXent(BATCH, CLASS)).Run(iters);            \
+    test::Benchmark(#DEVICE, SparseXent(BATCH, CLASS)).Run(iters);      \
   }                                                                     \
   BENCHMARK(BM_SparseXent##_##BATCH##_##CLASS##_##DEVICE);
 
diff --git a/tensorflow/core/kernels/split_lib.h b/tensorflow/core/kernels/split_lib.h
index ff92ffeeb38a964dcd068b54f9558ca8da7c969e..a08949e626cc8e5d4c3707b75a902d82b46c3376 100644
--- a/tensorflow/core/kernels/split_lib.h
+++ b/tensorflow/core/kernels/split_lib.h
@@ -57,7 +57,7 @@ struct Split<Eigen::SyclDevice, T> {
                   const Eigen::DSizes<Eigen::DenseIndex, 3>& slice_indices,
                   const Eigen::DSizes<Eigen::DenseIndex, 3>& slice_sizes);
 };
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace functor
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/split_lib_cpu.cc b/tensorflow/core/kernels/split_lib_cpu.cc
index 25026208d1ee78cb614e4ad41dccb7a0fa0f7817..771c633b156edf7c7d9944fe95703a0e0cd9e981 100644
--- a/tensorflow/core/kernels/split_lib_cpu.cc
+++ b/tensorflow/core/kernels/split_lib_cpu.cc
@@ -49,13 +49,13 @@ void Split<Eigen::SyclDevice, T>::operator()(
     typename TTypes<T, 3>::ConstTensor input,
     const Eigen::DSizes<Eigen::DenseIndex, 3>& slice_indices,
     const Eigen::DSizes<Eigen::DenseIndex, 3>& slice_sizes) {
-    output.device(d) = input.slice(slice_indices, slice_sizes);
+  output.device(d) = input.slice(slice_indices, slice_sizes);
 }
 
 #define DEFINE_SYCL_KERNELS(T) template struct Split<Eigen::SyclDevice, T>;
 
 TF_CALL_GPU_NUMBER_TYPES_NO_HALF(DEFINE_SYCL_KERNELS);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace functor
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/split_op.cc b/tensorflow/core/kernels/split_op.cc
index 78badde27e5c4ca33faa00073e7b412e85d82970..85f529326dbf5d9d5ae72156da05f08f805d1271 100644
--- a/tensorflow/core/kernels/split_op.cc
+++ b/tensorflow/core/kernels/split_op.cc
@@ -39,7 +39,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 template <typename Device, typename T>
 class SplitOpBase : public OpKernel {
@@ -142,8 +142,9 @@ class SplitOpCPU : public SplitOpBase<CPUDevice, T> {
 
     // Android also uses int32 indexing, so check here also.
     OP_REQUIRES(
-        context, FastBoundsCheck(input.NumElements(),
-                                 std::numeric_limits<Eigen::DenseIndex>::max()),
+        context,
+        FastBoundsCheck(input.NumElements(),
+                        std::numeric_limits<Eigen::DenseIndex>::max()),
         errors::InvalidArgument("Split requires input size < ",
                                 std::numeric_limits<Eigen::DenseIndex>::max()));
 
@@ -245,10 +246,11 @@ class SplitOpGPU : public SplitOpBase<GPUDevice, T> {
     const int32 split_dim =
         split_dim_orig < 0 ? split_dim_orig + input.dims() : split_dim_orig;
     const int32 num_split = Base::num_outputs();
-    OP_REQUIRES(context, FastBoundsCheck(input.NumElements(),
-                                         std::numeric_limits<int32>::max()),
-                errors::InvalidArgument("Split on GPU requires input size "
-                                        "< max int32"));
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(input.NumElements(), std::numeric_limits<int32>::max()),
+        errors::InvalidArgument("Split on GPU requires input size "
+                                "< max int32"));
     int32 prefix_dim_size;
     int32 split_dim_size;
     int32 suffix_dim_size;
@@ -304,8 +306,9 @@ class SplitOpSYCL : public SplitOpBase<SYCLDevice, T> {
 
     // Android also uses int32 indexing, so check here also.
     OP_REQUIRES(
-        context, FastBoundsCheck(input.NumElements(),
-                                 std::numeric_limits<Eigen::DenseIndex>::max()),
+        context,
+        FastBoundsCheck(input.NumElements(),
+                        std::numeric_limits<Eigen::DenseIndex>::max()),
         errors::InvalidArgument("Split requires input size < ",
                                 std::numeric_limits<Eigen::DenseIndex>::max()));
 
@@ -342,14 +345,14 @@ class SplitOpSYCL : public SplitOpBase<SYCLDevice, T> {
             {prefix_dim_size, split_dim_output_size, suffix_dim_size});
 
         functor::Split<SYCLDevice, T>()(context->eigen_device<SYCLDevice>(),
-                                       result_shaped, input_reshaped,
-                                       slice_indices, slice_sizes);
+                                        result_shaped, input_reshaped,
+                                        slice_indices, slice_sizes);
       }
       indices[1] += split_dim_output_size;
     }
   }
 };
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 #define REGISTER_SPLIT(type)                             \
   REGISTER_KERNEL_BUILDER(Name("Split")                  \
@@ -381,11 +384,11 @@ REGISTER_GPU(bfloat16);
 #endif  // GOOGLE_CUDA
 
 #ifdef TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL(type)                               \
-  REGISTER_KERNEL_BUILDER(Name("Split")                   \
-                              .Device(DEVICE_SYCL)        \
-                              .TypeConstraint<type>("T")  \
-                              .HostMemory("split_dim"),   \
+#define REGISTER_SYCL(type)                              \
+  REGISTER_KERNEL_BUILDER(Name("Split")                  \
+                              .Device(DEVICE_SYCL)       \
+                              .TypeConstraint<type>("T") \
+                              .HostMemory("split_dim"),  \
                           SplitOpSYCL<type>)
 
 TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL);
diff --git a/tensorflow/core/kernels/split_v_op.cc b/tensorflow/core/kernels/split_v_op.cc
index f1078ac349c979bb14f3949c05a7c493c9355567..7ff5df47d70fa8e47aabfb24e82874c146708ef1 100644
--- a/tensorflow/core/kernels/split_v_op.cc
+++ b/tensorflow/core/kernels/split_v_op.cc
@@ -197,8 +197,9 @@ class SplitVOpCPU : public SplitVOpBase<CPUDevice, T, Tlen> {
 
     // Android also uses int32 indexing, so check here also.
     OP_REQUIRES(
-        context, FastBoundsCheck(input.NumElements(),
-                                 std::numeric_limits<Eigen::DenseIndex>::max()),
+        context,
+        FastBoundsCheck(input.NumElements(),
+                        std::numeric_limits<Eigen::DenseIndex>::max()),
         errors::InvalidArgument("Split requires input size < ",
                                 std::numeric_limits<Eigen::DenseIndex>::max()));
 
@@ -305,10 +306,11 @@ class SplitVOpGPU : public SplitVOpBase<GPUDevice, T, Tlen> {
     const int32 split_dim_orig = context->input(2).flat<int32>()(0);
     const int32 split_dim =
         split_dim_orig < 0 ? split_dim_orig + input.dims() : split_dim_orig;
-    OP_REQUIRES(context, FastBoundsCheck(input.NumElements(),
-                                         std::numeric_limits<int32>::max()),
-                errors::InvalidArgument("Split on GPU requires input size "
-                                        "< max int32"));
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(input.NumElements(), std::numeric_limits<int32>::max()),
+        errors::InvalidArgument("Split on GPU requires input size "
+                                "< max int32"));
 
     int32 prefix_dim_size;
     int32 split_dim_size;
diff --git a/tensorflow/core/kernels/stack_ops.cc b/tensorflow/core/kernels/stack_ops.cc
index affe81a55567d1ef304c7161c65c201021da1363..65296f61fd180e2f57855d4cee1566bf827dd46a 100644
--- a/tensorflow/core/kernels/stack_ops.cc
+++ b/tensorflow/core/kernels/stack_ops.cc
@@ -42,7 +42,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 class Stack : public ResourceBase {
  public:
@@ -242,7 +242,7 @@ REGISTER_KERNEL_BUILDER(Name("StackV2")
                             .HostMemory("max_size")
                             .HostMemory("handle"),
                         StackOp);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 template <typename Device>
 class StackPushOp : public AsyncOpKernel {
@@ -274,11 +274,11 @@ class StackPushOp : public AsyncOpKernel {
     static constexpr int kCopyThreshold = 2048;
     static constexpr double kOccupancy = 0.7;
     if (swap_memory_ && !alloc_attrs.on_host() &&
-        ( std::is_same<Device, GPUDevice>::value
+        (std::is_same<Device, GPUDevice>::value
 #ifdef TENSORFLOW_USE_SYCL
-          || std::is_same<Device, SYCLDevice>::value
-#endif // TENSORFLOW_USE_SYCL
-        ) &&
+         || std::is_same<Device, SYCLDevice>::value
+#endif  // TENSORFLOW_USE_SYCL
+         ) &&
         tensor.TotalBytes() > kCopyThreshold && stack->IsUsefulToSwap(tensor)) {
       DeviceContext* device_ctxt = ctx->op_device_context();
       auto device = static_cast<tensorflow::Device*>(ctx->device());
@@ -391,7 +391,7 @@ REGISTER_SYCL_HOST_KERNEL(int32);
 REGISTER_SYCL_HOST_KERNEL(bool);
 #undef REGISTER_SYCL_KERNEL
 #undef REGISTER_SYCL_HOST_KERNEL
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 class StackPopOp : public AsyncOpKernel {
  public:
@@ -498,7 +498,7 @@ REGISTER_SYCL_HOST_KERNEL(bool);
 
 #undef REGISTER_SYCL_KERNEL
 #undef REGISTER_SYCL_HOST_KERNEL
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 class StackCloseOp : public OpKernel {
  public:
@@ -526,6 +526,6 @@ REGISTER_KERNEL_BUILDER(
 REGISTER_KERNEL_BUILDER(
     Name("StackCloseV2").Device(DEVICE_SYCL).HostMemory("handle"),
     StackCloseOp);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/stage_op.cc b/tensorflow/core/kernels/stage_op.cc
index 0fae46dea61d361bd4ead0afc0fa33711407fc9b..03fc4467a1dcf9d70c90c19809690934b0a7c2f4 100644
--- a/tensorflow/core/kernels/stage_op.cc
+++ b/tensorflow/core/kernels/stage_op.cc
@@ -70,12 +70,11 @@ class Buffer : public ResourceBase {
     return bytes + current_bytes_ > memory_limit_;
   }
 
-  std::size_t GetTupleBytes(const Tuple & tuple)
-  {
+  std::size_t GetTupleBytes(const Tuple& tuple) {
     return std::accumulate(tuple.begin(), tuple.end(), 0,
-      [](const std::size_t & lhs, const Tensor & rhs) {
-        return lhs + rhs.TotalBytes();
-    });
+                           [](const std::size_t& lhs, const Tensor& rhs) {
+                             return lhs + rhs.TotalBytes();
+                           });
   }
 
  public:
@@ -90,19 +89,22 @@ class Buffer : public ResourceBase {
     std::size_t tuple_bytes = GetTupleBytes(*tuple);
 
     // Sanity check so that we don't block for ever below
-    if(memory_limit_ > 0 && tuple_bytes > memory_limit_) {
-      return Status(errors::ResourceExhausted("Attempted to insert "
-        "tensors with combined size of '", tuple_bytes, "' bytes into "
-        "Staging Area with a memory limit of '", memory_limit_, "'."));
+    if (memory_limit_ > 0 && tuple_bytes > memory_limit_) {
+      return Status(
+          errors::ResourceExhausted("Attempted to insert "
+                                    "tensors with combined size of '",
+                                    tuple_bytes,
+                                    "' bytes into "
+                                    "Staging Area with a memory limit of '",
+                                    memory_limit_, "'."));
     }
 
-
     // If buffer capacity is bounded wait until elements have been removed
-    if(IsBounded()) {
+    if (IsBounded()) {
       full_cond_var_.wait(lock, [tuple_bytes, this]() {
         // If there's a memory limit, check if there's space for insertion
-        bool memory_limit_valid = memory_limit_ > 0 ?
-            !WouldExceedMemoryLimit(tuple_bytes) : true;
+        bool memory_limit_valid =
+            memory_limit_ > 0 ? !WouldExceedMemoryLimit(tuple_bytes) : true;
         // If we're configured for capacity check if there's space for insertion
         bool capacity_valid = capacity_ > 0 ? !IsCapacityFull() : true;
 
@@ -186,8 +188,7 @@ Status GetBuffer(OpKernelContext* ctx, const NodeDef& ndef, Buffer** buf) {
   ContainerInfo cinfo;
 
   // Lambda for creating the Staging Area
-  auto create_fn = [&ndef](Buffer** ret) -> Status
-  {
+  auto create_fn = [&ndef](Buffer** ret) -> Status {
     int64 capacity;
     int64 memory_limit;
     TF_RETURN_IF_ERROR(GetNodeAttr(ndef, "capacity", &capacity));
@@ -196,7 +197,6 @@ Status GetBuffer(OpKernelContext* ctx, const NodeDef& ndef, Buffer** buf) {
     return Status::OK();
   };
 
-
   TF_RETURN_IF_ERROR(cinfo.Init(rm, ndef, true /* use name() */));
   TF_RETURN_IF_ERROR(rm->LookupOrCreate<Buffer>(cinfo.container(), cinfo.name(),
                                                 buf, create_fn));
@@ -228,7 +228,7 @@ REGISTER_KERNEL_BUILDER(Name("Stage").Device(DEVICE_GPU), StageOp);
 #endif
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER_KERNEL_BUILDER(Name("Stage").Device(DEVICE_SYCL), StageOp);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 class UnstageOp : public OpKernel {
  public:
@@ -244,7 +244,8 @@ class UnstageOp : public OpKernel {
 
     buf->Get(&tuple);
 
-    OP_REQUIRES(ctx, tuple.size() == (size_t)ctx->num_outputs(),
+    OP_REQUIRES(
+        ctx, tuple.size() == (size_t)ctx->num_outputs(),
         errors::InvalidArgument("Mismatch stage/unstage: ", tuple.size(),
                                 " vs. ", ctx->num_outputs()));
 
@@ -260,7 +261,7 @@ REGISTER_KERNEL_BUILDER(Name("Unstage").Device(DEVICE_GPU), UnstageOp);
 #endif
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER_KERNEL_BUILDER(Name("Unstage").Device(DEVICE_SYCL), UnstageOp);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 class StagePeekOp : public OpKernel {
  public:
@@ -278,7 +279,8 @@ class StagePeekOp : public OpKernel {
 
     OP_REQUIRES_OK(ctx, buf->Peek(index, &tuple));
 
-    OP_REQUIRES(ctx, tuple.size() == (size_t)ctx->num_outputs(),
+    OP_REQUIRES(
+        ctx, tuple.size() == (size_t)ctx->num_outputs(),
         errors::InvalidArgument("Mismatch stage/unstage: ", tuple.size(),
                                 " vs. ", ctx->num_outputs()));
 
@@ -288,17 +290,15 @@ class StagePeekOp : public OpKernel {
   }
 };
 
-REGISTER_KERNEL_BUILDER(Name("StagePeek").Device(DEVICE_CPU),
-                                              StagePeekOp);
+REGISTER_KERNEL_BUILDER(Name("StagePeek").Device(DEVICE_CPU), StagePeekOp);
 #if GOOGLE_CUDA
-REGISTER_KERNEL_BUILDER(Name("StagePeek").HostMemory("index").
-                            Device(DEVICE_GPU), StagePeekOp);
+REGISTER_KERNEL_BUILDER(
+    Name("StagePeek").HostMemory("index").Device(DEVICE_GPU), StagePeekOp);
 #endif
 #ifdef TENSORFLOW_USE_SYCL
-REGISTER_KERNEL_BUILDER(Name("StagePeek").HostMemory("index")
-                          .Device(DEVICE_SYCL), StagePeekOp);
-#endif // TENSORFLOW_USE_SYCL
-
+REGISTER_KERNEL_BUILDER(
+    Name("StagePeek").HostMemory("index").Device(DEVICE_SYCL), StagePeekOp);
+#endif  // TENSORFLOW_USE_SYCL
 
 class StageSizeOp : public OpKernel {
  public:
@@ -312,9 +312,8 @@ class StageSizeOp : public OpKernel {
     core::ScopedUnref scope(buf);
 
     // Allocate size output tensor
-    Tensor * size = nullptr;
-    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}),
-                                                     &size));
+    Tensor* size = nullptr;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &size));
 
     // Set it to the actual size
     size->scalar<int32>().setConstant(buf->Size());
@@ -323,13 +322,13 @@ class StageSizeOp : public OpKernel {
 
 REGISTER_KERNEL_BUILDER(Name("StageSize").Device(DEVICE_CPU), StageSizeOp);
 #if GOOGLE_CUDA
-REGISTER_KERNEL_BUILDER(Name("StageSize").HostMemory("size")
-                        .Device(DEVICE_GPU), StageSizeOp);
+REGISTER_KERNEL_BUILDER(Name("StageSize").HostMemory("size").Device(DEVICE_GPU),
+                        StageSizeOp);
 #endif
 #ifdef TENSORFLOW_USE_SYCL
-REGISTER_KERNEL_BUILDER(Name("StageSize").HostMemory("size")
-                        .Device(DEVICE_SYCL), StageSizeOp);
-#endif // TENSORFLOW_USE_SYCL
+REGISTER_KERNEL_BUILDER(
+    Name("StageSize").HostMemory("size").Device(DEVICE_SYCL), StageSizeOp);
+#endif  // TENSORFLOW_USE_SYCL
 
 class StageClearOp : public OpKernel {
  public:
@@ -352,7 +351,6 @@ REGISTER_KERNEL_BUILDER(Name("StageClear").Device(DEVICE_GPU), StageClearOp);
 #endif
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER_KERNEL_BUILDER(Name("StageClear").Device(DEVICE_SYCL), StageClearOp);
-#endif // TENSORFLOW_USE_SYCL
-
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/strided_slice_op.cc b/tensorflow/core/kernels/strided_slice_op.cc
index 7c213e14d21efd6fcf033d3cd341c35838fe9f7b..e0b85c6d063d5e254973fa4eca58cfa04268a0e5 100644
--- a/tensorflow/core/kernels/strided_slice_op.cc
+++ b/tensorflow/core/kernels/strided_slice_op.cc
@@ -294,6 +294,11 @@ class StridedSliceAssignOp : public OpKernel {
       OP_REQUIRES_OK(context,
                      LookupResource(context, HandleFromInput(context, 0), &v));
       old_lhs = *v->tensor();
+      OP_REQUIRES(context, old_lhs.dtype() == DataTypeToEnum<T>::value,
+                  errors::InvalidArgument(
+                      "l-value dtype ", DataTypeString(old_lhs.dtype()),
+                      " does not match r-value dtype ",
+                      DataTypeString(DataTypeToEnum<T>::value)));
     } else {
       context->forward_ref_input_to_ref_output(0, 0);
       old_lhs = context->mutable_input(0, true);
@@ -386,6 +391,7 @@ class StridedSliceAssignOp : public OpKernel {
                           StridedSliceAssignOp<CPUDevice, type>)
 
 TF_CALL_ALL_TYPES(REGISTER_STRIDED_SLICE);
+TF_CALL_variant(REGISTER_STRIDED_SLICE);
 
 #undef REGISTER_STRIDED_SLICE
 
@@ -541,5 +547,5 @@ REGISTER_KERNEL_BUILDER(Name("ResourceStridedSliceAssign")
                             .HostMemory("strides"),
                         StridedSliceAssignOp<CPUDevice, int32>)
 #undef REGISTER_SYCL
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/strided_slice_op_impl.h b/tensorflow/core/kernels/strided_slice_op_impl.h
index a84ba38ef41486f86f5e37bd95287b8ae6c9bb2e..c3187e49cea71f3ac5a26071d376c7e511e65e57 100644
--- a/tensorflow/core/kernels/strided_slice_op_impl.h
+++ b/tensorflow/core/kernels/strided_slice_op_impl.h
@@ -26,6 +26,8 @@ limitations under the License.
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/register_types_traits.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/variant.h"
+#include "tensorflow/core/framework/variant_encode_decode.h"
 #include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/dense_update_functor.h"
 #include "tensorflow/core/kernels/ops_util.h"
@@ -288,6 +290,7 @@ DECLARE_FOR_N_GPU(int64);
 #endif  // END GOOGLE_CUDA
 
 TF_CALL_ALL_TYPES(DECLARE_FOR_N_CPU);
+TF_CALL_variant(DECLARE_FOR_N_CPU);
 
 #ifdef TENSORFLOW_USE_SYCL
 #define PREVENT_FOR_N_SYCL(T) \
@@ -302,7 +305,7 @@ DECLARE_FOR_N_SYCL(int32);
 DECLARE_FOR_N_SYCL(int64);
 
 #undef DECLARE_FOR_N_SYCL
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 #undef INSTANTIATE
 #undef DECLARE_FOR_N_CPU
diff --git a/tensorflow/core/kernels/string_join_op.cc b/tensorflow/core/kernels/string_join_op.cc
index 721702bec68efa24d4dafef1e9aaa0c5f1b4c849..28cca9f44849b39647ba08c54d9e1f3c108f91fd 100644
--- a/tensorflow/core/kernels/string_join_op.cc
+++ b/tensorflow/core/kernels/string_join_op.cc
@@ -50,9 +50,9 @@ class StringJoinOp : public OpKernel {
         } else {
           OP_REQUIRES(
               context, input_shape == input.shape(),
-              errors::InvalidArgument("Input shapes do not match: ",
-                                      input_shape.DebugString(), " vs. ",
-                                      input.shape().DebugString()));
+              errors::InvalidArgument(
+                  "Input shapes do not match: ", input_shape.DebugString(),
+                  " vs. ", input.shape().DebugString()));
         }
       }
     }
diff --git a/tensorflow/core/kernels/substr_op.cc b/tensorflow/core/kernels/substr_op.cc
index 743f11315042af94cfe41cecf52d145ae69f8209..e29f67297f9ce4a99898b256deda46ba95362904 100644
--- a/tensorflow/core/kernels/substr_op.cc
+++ b/tensorflow/core/kernels/substr_op.cc
@@ -95,9 +95,9 @@ class SubstrOp : public OpKernel {
       // Create BCast helper with shape of input and pos/len
       BCast bcast(BCast::FromShape(input_shape), BCast::FromShape(pos_shape));
       OP_REQUIRES(context, bcast.IsValid(),
-                  errors::InvalidArgument("Incompatible shapes: ",
-                                          input_shape.DebugString(), " vs. ",
-                                          pos_shape.DebugString()));
+                  errors::InvalidArgument(
+                      "Incompatible shapes: ", input_shape.DebugString(),
+                      " vs. ", pos_shape.DebugString()));
       TensorShape output_shape = BCast::ToShape(bcast.result_shape());
       int ndims = output_shape.dims();
       Tensor* output_tensor = nullptr;
diff --git a/tensorflow/core/kernels/summary_image_op.cc b/tensorflow/core/kernels/summary_image_op.cc
index 233b824bcc3bab74d70c990c44389e6df7b10f02..29b21ee7353fe03ce87bc03dad72b05ca8fd4311 100644
--- a/tensorflow/core/kernels/summary_image_op.cc
+++ b/tensorflow/core/kernels/summary_image_op.cc
@@ -54,18 +54,20 @@ class SummaryImageOp : public OpKernel {
     const Tensor& tensor = c->input(1);
     OP_REQUIRES(c, IsLegacyScalar(tags.shape()),
                 errors::InvalidArgument("Tags must be a scalar"));
-    OP_REQUIRES(c, tensor.dims() == 4 &&
-                       (tensor.dim_size(3) == 1 || tensor.dim_size(3) == 3 ||
-                        tensor.dim_size(3) == 4),
+    OP_REQUIRES(c,
+                tensor.dims() == 4 &&
+                    (tensor.dim_size(3) == 1 || tensor.dim_size(3) == 3 ||
+                     tensor.dim_size(3) == 4),
                 errors::InvalidArgument(
                     "Tensor must be 4-D with last dim 1, 3, or 4, not ",
                     tensor.shape().DebugString()));
     const string& base_tag = tags.scalar<string>()();
 
-    OP_REQUIRES(c, tensor.dim_size(0) < (1LL << 31) &&
-                       tensor.dim_size(1) < (1LL << 31) &&
-                       tensor.dim_size(2) < (1LL << 31) &&
-                       (tensor.dim_size(1) * tensor.dim_size(2)) < (1LL << 29),
+    OP_REQUIRES(c,
+                tensor.dim_size(0) < (1LL << 31) &&
+                    tensor.dim_size(1) < (1LL << 31) &&
+                    tensor.dim_size(2) < (1LL << 31) &&
+                    (tensor.dim_size(1) * tensor.dim_size(2)) < (1LL << 29),
                 errors::InvalidArgument("Tensor too large for summary ",
                                         tensor.shape().DebugString()));
 
diff --git a/tensorflow/core/kernels/summary_kernels.cc b/tensorflow/core/kernels/summary_kernels.cc
index 41cbece1d648f3e2dba112375e494d2ed8192db9..d317a8d33db5b69a84a0d193cb6322afaa53dff6 100644
--- a/tensorflow/core/kernels/summary_kernels.cc
+++ b/tensorflow/core/kernels/summary_kernels.cc
@@ -42,11 +42,16 @@ class CreateSummaryFileWriterOp : public OpKernel {
     const int32 flush_millis = tmp->scalar<int32>()();
     OP_REQUIRES_OK(ctx, ctx->input("filename_suffix", &tmp));
     const string filename_suffix = tmp->scalar<string>()();
-    SummaryWriterInterface* s;
-    OP_REQUIRES_OK(ctx,
-                   CreateSummaryFileWriter(max_queue, flush_millis, logdir,
-                                           filename_suffix, ctx->env(), &s));
-    OP_REQUIRES_OK(ctx, CreateResource(ctx, HandleFromInput(ctx, 0), s));
+
+    SummaryWriterInterface* s = nullptr;
+    OP_REQUIRES_OK(ctx, LookupOrCreateResource<SummaryWriterInterface>(
+                            ctx, HandleFromInput(ctx, 0), &s,
+                            [max_queue, flush_millis, logdir, filename_suffix,
+                             ctx](SummaryWriterInterface** s) {
+                              return CreateSummaryFileWriter(
+                                  max_queue, flush_millis, logdir,
+                                  filename_suffix, ctx->env(), s);
+                            }));
   }
 };
 REGISTER_KERNEL_BUILDER(Name("CreateSummaryFileWriter").Device(DEVICE_CPU),
@@ -66,17 +71,23 @@ class CreateSummaryDbWriterOp : public OpKernel {
     const string run_name = tmp->scalar<string>()();
     OP_REQUIRES_OK(ctx, ctx->input("user_name", &tmp));
     const string user_name = tmp->scalar<string>()();
-    SummaryWriterInterface* s;
-    Sqlite* db;
-    OP_REQUIRES_OK(ctx, Sqlite::Open(db_uri,
-                                     SQLITE_OPEN_READWRITE | SQLITE_OPEN_CREATE,
-                                     &db));
-    core::ScopedUnref unref(db);
-    OP_REQUIRES_OK(ctx, SetupTensorboardSqliteDb(db));
+
+    SummaryWriterInterface* s = nullptr;
     OP_REQUIRES_OK(
-        ctx, CreateSummaryDbWriter(db, experiment_name,
-                                   run_name, user_name, ctx->env(), &s));
-    OP_REQUIRES_OK(ctx, CreateResource(ctx, HandleFromInput(ctx, 0), s));
+        ctx,
+        LookupOrCreateResource<SummaryWriterInterface>(
+            ctx, HandleFromInput(ctx, 0), &s,
+            [db_uri, experiment_name, run_name, user_name,
+             ctx](SummaryWriterInterface** s) {
+              Sqlite* db;
+              TF_RETURN_IF_ERROR(Sqlite::Open(
+                  db_uri, SQLITE_OPEN_READWRITE | SQLITE_OPEN_CREATE, &db));
+              core::ScopedUnref unref(db);
+              TF_RETURN_IF_ERROR(SetupTensorboardSqliteDb(db));
+              TF_RETURN_IF_ERROR(CreateSummaryDbWriter(
+                  db, experiment_name, run_name, user_name, ctx->env(), s));
+              return Status::OK();
+            }));
   }
 };
 REGISTER_KERNEL_BUILDER(Name("CreateSummaryDbWriter").Device(DEVICE_CPU),
@@ -267,8 +278,6 @@ class WriteAudioSummaryOp : public OpKernel {
 
  private:
   int max_outputs_;
-  bool has_sample_rate_attr_;
-  float sample_rate_attr_;
 };
 REGISTER_KERNEL_BUILDER(Name("WriteAudioSummary").Device(DEVICE_CPU),
                         WriteAudioSummaryOp);
diff --git a/tensorflow/core/kernels/summary_op.cc b/tensorflow/core/kernels/summary_op.cc
index b818724ec2e895d3995fe19b811327ed0ba112c2..1f4e3418f4826dee789002d4aa688f8ce14e17d2 100644
--- a/tensorflow/core/kernels/summary_op.cc
+++ b/tensorflow/core/kernels/summary_op.cc
@@ -41,11 +41,12 @@ class SummaryScalarOp : public OpKernel {
     const Tensor& values = c->input(1);
 
     OP_REQUIRES(
-        c, tags.IsSameSize(values) ||
-               (IsLegacyScalar(tags.shape()) && IsLegacyScalar(values.shape())),
-        errors::InvalidArgument("tags and values not the same shape: ",
-                                tags.shape().DebugString(), " != ",
-                                values.shape().DebugString(), SingleTag(tags)));
+        c,
+        tags.IsSameSize(values) ||
+            (IsLegacyScalar(tags.shape()) && IsLegacyScalar(values.shape())),
+        errors::InvalidArgument(
+            "tags and values not the same shape: ", tags.shape().DebugString(),
+            " != ", values.shape().DebugString(), SingleTag(tags)));
     auto Ttags = tags.flat<string>();
     auto Tvalues = values.flat<T>();
     Summary s;
diff --git a/tensorflow/core/kernels/svd_op_gpu.cu.cc b/tensorflow/core/kernels/svd_op_gpu.cu.cc
index dedc2da60bab0d0c0613630c384c2f23ddae31e3..8c3a58b108abe66f2b61b5153923bee192246cd1 100644
--- a/tensorflow/core/kernels/svd_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/svd_op_gpu.cu.cc
@@ -63,8 +63,8 @@ __global__ void ComputeValueOfVKernel(Cuda2DLaunchConfig config, int64 m,
                                       int64 ldu, const Scalar* M,
                                       const Scalar* U, const Scalar* S,
                                       Scalar* V) {
-  CUDA_AXIS_KERNEL_LOOP(batch, config.virtual_thread_count, x) {
-    CUDA_AXIS_KERNEL_LOOP(i, config.virtual_thread_count, y) {
+  CUDA_AXIS_KERNEL_LOOP(batch, config.virtual_thread_count.x, X) {
+    CUDA_AXIS_KERNEL_LOOP(i, config.virtual_thread_count.y, Y) {
       Scalar v = M[i + m * batch] * U[ldu * (i + m * batch)] * S[batch];
       CudaAtomicAdd(V + batch, v);
     }
diff --git a/tensorflow/core/kernels/tile_functor_cpu.cc b/tensorflow/core/kernels/tile_functor_cpu.cc
index b2fd669541d32406512c4618fac77604baefedbe..f8144867014eccf04c892d0ce90a2aa280dfd764 100644
--- a/tensorflow/core/kernels/tile_functor_cpu.cc
+++ b/tensorflow/core/kernels/tile_functor_cpu.cc
@@ -15,10 +15,10 @@ limitations under the License.
 
 #define EIGEN_USE_THREADS
 
-#include "tensorflow/core/kernels/tile_functor.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/kernels/tile_functor.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/kernels/tile_ops_cpu_impl.h b/tensorflow/core/kernels/tile_ops_cpu_impl.h
index 054b31ef9e0b4904d8803d1c4542ff805e0a7673..df6a666cd441d9c1306d950bbe0e79bf3dae28d9 100644
--- a/tensorflow/core/kernels/tile_ops_cpu_impl.h
+++ b/tensorflow/core/kernels/tile_ops_cpu_impl.h
@@ -63,7 +63,7 @@ TF_CALL_int64(DEFINE_TYPE);
 
 #undef DEFINE_DIM
 #undef DEFINE_TYPE
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // end namespace functor
 }  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/training_ops.cc b/tensorflow/core/kernels/training_ops.cc
index 38e77ab60fb7126bcdedc09bfe9e2ec7de88c0ad..07befa27bc54631d30e413a15972c560655418e0 100644
--- a/tensorflow/core/kernels/training_ops.cc
+++ b/tensorflow/core/kernels/training_ops.cc
@@ -3279,7 +3279,6 @@ REGISTER_KERNELS(double, int64);
 
 #undef REGISTER_KERNELS
 
-
 template <typename Device, typename T>
 class ApplyAddSignOp : public OpKernel {
  public:
@@ -3362,17 +3361,15 @@ TF_CALL_double(REGISTER_CPU_KERNELS);
 #if GOOGLE_CUDA
 // Forward declarations of the functor specializations for GPU.
 namespace functor {
-#define DECLARE_GPU_SPEC(T)                                               \
-  template <>                                                             \
-  void ApplyAddSign<GPUDevice, T>::operator()(                            \
-      const GPUDevice& d,                                                 \
-      typename TTypes<T>::Flat var,                                       \
-      typename TTypes<T>::Flat m,                                         \
-      typename TTypes<T>::ConstScalar lr,                                 \
-      typename TTypes<T>::ConstScalar alpha,                              \
-      typename TTypes<T>::ConstScalar sign_decay,                         \
-      typename TTypes<T>::ConstScalar beta,                               \
-      typename TTypes<T>::ConstFlat grad);                                \
+#define DECLARE_GPU_SPEC(T)                                           \
+  template <>                                                         \
+  void ApplyAddSign<GPUDevice, T>::operator()(                        \
+      const GPUDevice& d, typename TTypes<T>::Flat var,               \
+      typename TTypes<T>::Flat m, typename TTypes<T>::ConstScalar lr, \
+      typename TTypes<T>::ConstScalar alpha,                          \
+      typename TTypes<T>::ConstScalar sign_decay,                     \
+      typename TTypes<T>::ConstScalar beta,                           \
+      typename TTypes<T>::ConstFlat grad);                            \
   extern template struct ApplyAddSign<GPUDevice, T>;
 DECLARE_GPU_SPEC(Eigen::half);
 DECLARE_GPU_SPEC(float);
@@ -3387,7 +3384,6 @@ REGISTER_KERNELS(GPU, double);
 #undef REGISTER_CPU_KERNELS
 #undef REGISTER_KERNELS
 
-
 template <typename Device, typename T>
 class ApplyPowerSignOp : public OpKernel {
  public:
@@ -3470,17 +3466,15 @@ TF_CALL_double(REGISTER_CPU_KERNELS);
 #if GOOGLE_CUDA
 // Forward declarations of the functor specializations for GPU.
 namespace functor {
-#define DECLARE_GPU_SPEC(T)                                               \
-  template <>                                                             \
-  void ApplyPowerSign<GPUDevice, T>::operator()(                          \
-      const GPUDevice& d,                                                 \
-      typename TTypes<T>::Flat var,                                       \
-      typename TTypes<T>::Flat m,                                         \
-      typename TTypes<T>::ConstScalar lr,                                 \
-      typename TTypes<T>::ConstScalar logbase,                            \
-      typename TTypes<T>::ConstScalar sign_decay,                         \
-      typename TTypes<T>::ConstScalar beta,                               \
-      typename TTypes<T>::ConstFlat grad);                                \
+#define DECLARE_GPU_SPEC(T)                                           \
+  template <>                                                         \
+  void ApplyPowerSign<GPUDevice, T>::operator()(                      \
+      const GPUDevice& d, typename TTypes<T>::Flat var,               \
+      typename TTypes<T>::Flat m, typename TTypes<T>::ConstScalar lr, \
+      typename TTypes<T>::ConstScalar logbase,                        \
+      typename TTypes<T>::ConstScalar sign_decay,                     \
+      typename TTypes<T>::ConstScalar beta,                           \
+      typename TTypes<T>::ConstFlat grad);                            \
   extern template struct ApplyPowerSign<GPUDevice, T>;
 DECLARE_GPU_SPEC(Eigen::half);
 DECLARE_GPU_SPEC(float);
diff --git a/tensorflow/core/kernels/training_ops_gpu.cu.cc b/tensorflow/core/kernels/training_ops_gpu.cu.cc
index d443a6b3c1d0b548e915216adbc05549a66eaeda..0376a3b2c602c13b3082b7762cf61a2b30552199 100644
--- a/tensorflow/core/kernels/training_ops_gpu.cu.cc
+++ b/tensorflow/core/kernels/training_ops_gpu.cu.cc
@@ -17,8 +17,8 @@ limitations under the License.
 
 #define EIGEN_USE_GPU
 
-#include "tensorflow/core/kernels/training_ops.h"
 #include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/kernels/training_ops.h"
 
 namespace tensorflow {
 
@@ -115,13 +115,11 @@ struct ApplyAdam<GPUDevice, T> {
     Eigen::Sizes<1> single;
     const auto one = static_cast<T>(1.0);
     m.device(d) =
-        m +
-        (beta1.constant(one) - beta1).reshape(single).broadcast(bcast) *
-            (grad - m);
+        m + (beta1.constant(one) - beta1).reshape(single).broadcast(bcast) *
+                (grad - m);
     v.device(d) =
-        v +
-        (beta2.constant(one) - beta2).reshape(single).broadcast(bcast) *
-            (grad.square() - v);
+        v + (beta2.constant(one) - beta2).reshape(single).broadcast(bcast) *
+                (grad.square() - v);
 
     if (use_nesterov) {
       var.device(d) -=
@@ -157,9 +155,9 @@ struct ApplyRMSProp<GPUDevice, T> {
     bcast[0] = grad.dimension(0);
     Eigen::Sizes<1> single;
     const auto one = static_cast<T>(1.0);
-    ms.device(d) = ms +
-                   (rho.constant(one) - rho).reshape(single).broadcast(bcast) *
-                       (grad.square() - ms);
+    ms.device(d) =
+        ms + (rho.constant(one) - rho).reshape(single).broadcast(bcast) *
+                 (grad.square() - ms);
     mom.device(d) =
         mom * momentum.reshape(single).broadcast(bcast) +
         lr.reshape(single).broadcast(bcast) * grad /
@@ -212,7 +210,7 @@ struct ApplyAddSign<GPUDevice, T> {
     auto beta_bcast = beta.reshape(single).broadcast(bcast);
     auto one_minus_beta =
         (beta.constant(one) - beta).reshape(single).broadcast(bcast);
-    m.device(d) =  m * beta_bcast + grad * one_minus_beta;
+    m.device(d) = m * beta_bcast + grad * one_minus_beta;
 
     // The following is the GPU equivalent of the CPU version:
     // var.device(d) -= lr() * (alpha() + sign_decay() * sign_gm) * grad;
@@ -244,7 +242,7 @@ struct ApplyPowerSign<GPUDevice, T> {
     auto beta_bcast = beta.reshape(single).broadcast(bcast);
     auto one_minus_beta =
         (beta.constant(one) - beta).reshape(single).broadcast(bcast);
-    m.device(d) =  m * beta_bcast + grad * one_minus_beta;
+    m.device(d) = m * beta_bcast + grad * one_minus_beta;
 
     // The following is the GPU equivalent of the CPU version:
     // auto grad_scale = (logbase() * sign_decay() * sign_gm).exp();
@@ -253,7 +251,7 @@ struct ApplyPowerSign<GPUDevice, T> {
     auto lr_bcast = lr.reshape(single).broadcast(bcast);
     auto logbase_bcast = logbase.reshape(single).broadcast(bcast);
     auto sign_decay_bcast = sign_decay.reshape(single).broadcast(bcast);
-    auto grad_scale =  (logbase_bcast * sign_decay_bcast * sign_gm).exp();
+    auto grad_scale = (logbase_bcast * sign_decay_bcast * sign_gm).exp();
     var.device(d) -= lr_bcast * grad_scale * grad;
   }
 };
diff --git a/tensorflow/core/kernels/training_ops_test.cc b/tensorflow/core/kernels/training_ops_test.cc
index ffa7f87c9efda0e3288b9fb06d0c9d1a3dcba277..2dcc4a500e6c64753c6fde4f88582f914a50089e 100644
--- a/tensorflow/core/kernels/training_ops_test.cc
+++ b/tensorflow/core/kernels/training_ops_test.cc
@@ -176,8 +176,9 @@ static void Adam(int32 n, Graph** init_g, Graph** train_g) {
     auto beta2 = Scalar(g, 0.99);
     auto epsilon = Scalar(g, 1e-8);
     auto grad = Random(g, n);
-    test::graph::Multi(g, "ApplyAdam", {var, m, v, beta1_power, beta2_power, lr,
-                                        beta1, beta2, epsilon, grad});
+    test::graph::Multi(
+        g, "ApplyAdam",
+        {var, m, v, beta1_power, beta2_power, lr, beta1, beta2, epsilon, grad});
     *train_g = g;
   }
 }
diff --git a/tensorflow/core/kernels/transpose_op.cc b/tensorflow/core/kernels/transpose_op.cc
index 2e0d18b634a8aebeaf2b7a0118ea8a9367804086..7177ad78884cae85a847a283017511dcad2e4878 100644
--- a/tensorflow/core/kernels/transpose_op.cc
+++ b/tensorflow/core/kernels/transpose_op.cc
@@ -176,9 +176,10 @@ void TransposeOp::Compute(OpKernelContext* ctx) {
     }
   }
   for (int i = 0; i < dims; ++i) {
-    OP_REQUIRES(ctx, bits[i], errors::InvalidArgument(
-                                  i, " is missing from {",
-                                  str_util::Join(permutation, ","), "}."));
+    OP_REQUIRES(
+        ctx, bits[i],
+        errors::InvalidArgument(i, " is missing from {",
+                                str_util::Join(permutation, ","), "}."));
   }
 
   // 0-D, 1-D, and identity transposes do nothing.
diff --git a/tensorflow/core/kernels/typed_queue.h b/tensorflow/core/kernels/typed_queue.h
index 0d608d9b8799d561141ac2d3378a0f0e3507acfd..43dcb4cef74c568a6bc31abc8c460cff241fc6fa 100644
--- a/tensorflow/core/kernels/typed_queue.h
+++ b/tensorflow/core/kernels/typed_queue.h
@@ -58,9 +58,9 @@ Status TypedQueue<SubQueue>::Initialize() {
   if (!component_shapes_.empty() &&
       component_dtypes_.size() != component_shapes_.size()) {
     return errors::InvalidArgument(
-        "Different number of component types.  ", "Types: ",
-        DataTypeSliceString(component_dtypes_), ", Shapes: ",
-        ShapeListString(component_shapes_));
+        "Different number of component types.  ",
+        "Types: ", DataTypeSliceString(component_dtypes_),
+        ", Shapes: ", ShapeListString(component_shapes_));
   }
 
   mutex_lock lock(mu_);
diff --git a/tensorflow/core/kernels/unpack_op.cc b/tensorflow/core/kernels/unpack_op.cc
index 397bdd56708d766d06e5a68f3b049a5b928195e1..764b6a252adf09c13511a01f95332857f46eee96 100644
--- a/tensorflow/core/kernels/unpack_op.cc
+++ b/tensorflow/core/kernels/unpack_op.cc
@@ -34,7 +34,7 @@ typedef Eigen::GpuDevice GPUDevice;
 
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 template <typename Device, typename T>
 class UnpackOp : public OpKernel {
@@ -65,8 +65,9 @@ class UnpackOp : public OpKernel {
     output_shape.RemoveDim(axis);
     const int64 output_size = output_shape.num_elements();
     OP_REQUIRES(
-        context, FastBoundsCheck(output_size,
-                                 std::numeric_limits<Eigen::DenseIndex>::max()),
+        context,
+        FastBoundsCheck(output_size,
+                        std::numeric_limits<Eigen::DenseIndex>::max()),
         errors::InvalidArgument("output size must fit in Eigen DenseIndex"));
 
 // This optimization is currently not applicable for SYCL devices
diff --git a/tensorflow/core/kernels/unravel_index_op.cc b/tensorflow/core/kernels/unravel_index_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..da9ab01e8db0ff024f6d1dfe1b3e6eb04b394a30
--- /dev/null
+++ b/tensorflow/core/kernels/unravel_index_op.cc
@@ -0,0 +1,120 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/types.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+
+namespace {
+template <typename T>
+struct mod_op {
+  const T operator()(const T& a, const T& b) const { return a % b; }
+};
+}  // namespace
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+
+template <typename Tidx>
+class UnravelIndexOp : public OpKernel {
+ public:
+  explicit UnravelIndexOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor& indices_tensor = ctx->input(0);
+    OP_REQUIRES(ctx, TensorShapeUtils::IsVector(indices_tensor.shape()) ||
+                         TensorShapeUtils::IsScalar(indices_tensor.shape()),
+                errors::InvalidArgument(
+                    "The indices can only be scalar or vector, got \"",
+                    indices_tensor.shape().DebugString(), "\""));
+
+    const Tensor& dims_tensor = ctx->input(1);
+    OP_REQUIRES(
+        ctx, TensorShapeUtils::IsVector(dims_tensor.shape()),
+        errors::InvalidArgument("The indices can only be 1-D, got \"",
+                                dims_tensor.shape().DebugString(), "\""));
+
+    auto dims = dims_tensor.vec<Tidx>();
+
+    Eigen::array<bool, 1> reverse({true});
+
+    Tensor strides_tensor;
+    OP_REQUIRES_OK(ctx,
+                   ctx->allocate_temp(DataTypeToEnum<Tidx>::value,
+                                      TensorShape({dims_tensor.NumElements()}),
+                                      &strides_tensor));
+
+    auto strides = strides_tensor.vec<Tidx>();
+    strides = dims.reverse(reverse)
+                  .scan(0, Eigen::internal::ProdReducer<Tidx>(), false)
+                  .reverse(reverse);
+
+    Tensor strides_shifted_tensor;
+    OP_REQUIRES_OK(ctx,
+                   ctx->allocate_temp(DataTypeToEnum<Tidx>::value,
+                                      TensorShape({dims_tensor.NumElements()}),
+                                      &strides_shifted_tensor));
+
+    auto strides_shifted = strides_shifted_tensor.vec<Tidx>();
+    strides_shifted = dims.reverse(reverse)
+                          .scan(0, Eigen::internal::ProdReducer<Tidx>(), true)
+                          .reverse(reverse);
+
+    Tensor* output_tensor = nullptr;
+    if (TensorShapeUtils::IsScalar(indices_tensor.shape())) {
+      OP_REQUIRES_OK(
+          ctx, ctx->allocate_output(0, TensorShape({dims_tensor.NumElements()}),
+                                    &output_tensor));
+
+      auto output = output_tensor->vec<Tidx>();
+
+      output = output.constant(indices_tensor.scalar<Tidx>()());
+      output = output.binaryExpr(strides, mod_op<Tidx>()) / strides_shifted;
+    } else {
+      OP_REQUIRES_OK(ctx, ctx->allocate_output(
+                              0, TensorShape({dims_tensor.NumElements(),
+                                              indices_tensor.NumElements()}),
+                              &output_tensor));
+
+      auto output = output_tensor->matrix<Tidx>();
+
+      Eigen::array<int64, 2> reshape{{dims_tensor.NumElements(), 1}};
+      Eigen::array<int64, 2> bcast({1, indices_tensor.NumElements()});
+      Eigen::array<int64, 2> indices_reshape{{1, indices_tensor.NumElements()}};
+      Eigen::array<int64, 2> indices_bcast({dims_tensor.NumElements(), 1});
+
+      output = indices_tensor.vec<Tidx>()
+                   .reshape(indices_reshape)
+                   .broadcast(indices_bcast);
+      output = output.binaryExpr(strides.reshape(reshape).broadcast(bcast),
+                                 mod_op<Tidx>()) /
+               strides_shifted.reshape(reshape).broadcast(bcast);
+    }
+  }
+};
+
+#define REGISTER_KERNEL(type)                                               \
+  REGISTER_KERNEL_BUILDER(                                                  \
+      Name("UnravelIndex").Device(DEVICE_CPU).TypeConstraint<type>("Tidx"), \
+      UnravelIndexOp<type>);
+TF_CALL_int32(REGISTER_KERNEL) TF_CALL_int64(REGISTER_KERNEL)
+#undef REGISTER_KERNEL
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/word2vec_kernels.cc b/tensorflow/core/kernels/word2vec_kernels.cc
index 2d05d72bff162f98e8d13e8a3208e4dd00a48fa4..3477445197a961b275e3efb8ce09d5b075342f9e 100644
--- a/tensorflow/core/kernels/word2vec_kernels.cc
+++ b/tensorflow/core/kernels/word2vec_kernels.cc
@@ -188,9 +188,9 @@ class SkipgramOp : public OpKernel {
       ++corpus_size_;
     }
     if (corpus_size_ < window_size_ * 10) {
-      return errors::InvalidArgument("The text file ", filename,
-                                     " contains too little data: ",
-                                     corpus_size_, " words");
+      return errors::InvalidArgument(
+          "The text file ", filename,
+          " contains too little data: ", corpus_size_, " words");
     }
     typedef std::pair<string, int32> WordFreq;
     std::vector<WordFreq> ordered;
diff --git a/tensorflow/core/kernels/xent_op.cc b/tensorflow/core/kernels/xent_op.cc
index 0f8d027caadab2dee04d3041ed515a40f22476f3..a6a71fdfaf126410b26766954c0c2fc5b86d003a 100644
--- a/tensorflow/core/kernels/xent_op.cc
+++ b/tensorflow/core/kernels/xent_op.cc
@@ -30,7 +30,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 template <typename Device, typename T>
 class SoftmaxXentWithLogitsOp : public OpKernel {
@@ -44,8 +44,8 @@ class SoftmaxXentWithLogitsOp : public OpKernel {
     OP_REQUIRES(context, logits_in.IsSameSize(labels_in),
                 errors::InvalidArgument(
                     "logits and labels must be same size: logits_size=",
-                    logits_in.shape().DebugString(), " labels_size=",
-                    labels_in.shape().DebugString()));
+                    logits_in.shape().DebugString(),
+                    " labels_size=", labels_in.shape().DebugString()));
     OP_REQUIRES(context, TensorShapeUtils::IsMatrix(logits_in.shape()),
                 errors::InvalidArgument("logits must be 2-dimensional"));
     // As we already tested that both inputs have the same shape no need to
@@ -72,7 +72,7 @@ class SoftmaxXentWithLogitsOp : public OpKernel {
       functor(context->eigen_device<Device>(), logits_in.matrix<T>(),
               labels_in.matrix<T>(), scratch.matrix<T>(), loss_out->vec<T>(),
               back_out->matrix<T>());
-      }
+    }
   }
 };
 
@@ -87,7 +87,7 @@ struct XentFunctorBase {
                   typename TTypes<T>::Vec loss,
                   typename TTypes<T>::Matrix backprop) {
     XentEigenImpl<Device, T>::Compute(d, logits, labels, scratch, loss,
-                                         backprop);
+                                      backprop);
   }
 };
 
@@ -97,7 +97,7 @@ struct XentFunctor<CPUDevice, T> : XentFunctorBase<CPUDevice, T> {};
 #ifdef TENSORFLOW_USE_SYCL
 template <typename T>
 struct XentFunctor<SYCLDevice, T> : XentFunctorBase<SYCLDevice, T> {};
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace functor
 
 #define REGISTER_CPU(T)                                         \
@@ -129,6 +129,6 @@ REGISTER_KERNEL_BUILDER(Name("SoftmaxCrossEntropyWithLogits")
                             .Device(DEVICE_SYCL)
                             .TypeConstraint<float>("T"),
                         SoftmaxXentWithLogitsOp<SYCLDevice, float>);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/xsmm_conv2d_test.cc b/tensorflow/core/kernels/xsmm_conv2d_test.cc
index e29470124674636a0e125a5cd1b856a467f4c6f0..481f3b7ba46bac42a276d46e60c11f34bc163e3b 100644
--- a/tensorflow/core/kernels/xsmm_conv2d_test.cc
+++ b/tensorflow/core/kernels/xsmm_conv2d_test.cc
@@ -13,18 +13,17 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/kernels/conv_ops.h"
-#include "tensorflow/core/platform/test.h"
+#include "include/libxsmm.h"
+#include "tensorflow/core/framework/fake_input.h"
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/graph/node_builder.h"
+#include "tensorflow/core/kernels/conv_ops.h"
 #include "tensorflow/core/kernels/ops_testutil.h"
-#include "include/libxsmm.h"
-#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
 namespace {
 
-
 typedef struct {
   int nImg;
   int nIfm;
@@ -49,45 +48,41 @@ typedef struct {
   int stride_w;
 } naive_conv_t;
 
-
-LIBXSMM_INLINE void naive_copy_NCHW_to_NHWC(const float* nchw, Tensor &nhwc, int N, int H, int W, int C)
-{
-  LIBXSMM_VLA_DECL(4, const float,  input, nchw, C, H, W);
+LIBXSMM_INLINE void naive_copy_NCHW_to_NHWC(const float* nchw, Tensor& nhwc,
+                                            int N, int H, int W, int C) {
+  LIBXSMM_VLA_DECL(4, const float, input, nchw, C, H, W);
   int n, h, w, c;
-  auto output =  nhwc.flat<float>();
-  for ( n = 0; n < N; n++ ) {
-    for ( h = 0; h < H; h++ ) {
-      for ( w = 0; w < W; w++ ) {
-        for ( c = 0; c < C; c++ ) {
-          output(n*H*W*C + h*W*C +w*C + c)  =
-          LIBXSMM_VLA_ACCESS(4,  input, n, c, h, w, C, H, W);
+  auto output = nhwc.flat<float>();
+  for (n = 0; n < N; n++) {
+    for (h = 0; h < H; h++) {
+      for (w = 0; w < W; w++) {
+        for (c = 0; c < C; c++) {
+          output(n * H * W * C + h * W * C + w * C + c) =
+              LIBXSMM_VLA_ACCESS(4, input, n, c, h, w, C, H, W);
         }
       }
     }
   }
 }
 
-
-LIBXSMM_INLINE void naive_copy_KCRS_to_RSCK(const float* kcrs, Tensor  &rsck, int R, int S, int C, int K)
-{
-  LIBXSMM_VLA_DECL(4, const float,  input, kcrs, C, R, S);
+LIBXSMM_INLINE void naive_copy_KCRS_to_RSCK(const float* kcrs, Tensor& rsck,
+                                            int R, int S, int C, int K) {
+  LIBXSMM_VLA_DECL(4, const float, input, kcrs, C, R, S);
   int r, s, c, k;
-  auto output =  rsck.flat<float>();
-
-  for ( r = 0; r < R; r++ ) {
-    for ( s = 0; s < S; s++ ) {
-      for ( c = 0; c < C; c++ ) {
-        for ( k = 0; k < K; k++ ) {
-          output(r*S*C*K + s*C*K + c*K + k) =
-          LIBXSMM_VLA_ACCESS(4,  input, k, c, r, s, C, R, S);
+  auto output = rsck.flat<float>();
+
+  for (r = 0; r < R; r++) {
+    for (s = 0; s < S; s++) {
+      for (c = 0; c < C; c++) {
+        for (k = 0; k < K; k++) {
+          output(r * S * C * K + s * C * K + c * K + k) =
+              LIBXSMM_VLA_ACCESS(4, input, k, c, r, s, C, R, S);
         }
       }
     }
   }
 }
 
-
-
 LIBXSMM_INLINE void zero_buf(float* buf, long size) {
   int i;
   for (i = 0; i < size; ++i) {
@@ -95,52 +90,53 @@ LIBXSMM_INLINE void zero_buf(float* buf, long size) {
   }
 }
 
-LIBXSMM_INLINE void copy_buf(Tensor &dst,float *src,long size) {
-  long  i;
-  auto output =  dst.flat<float>();
-  for (i = 0; i < size; ++i)
-          output(i) = src[i];
+LIBXSMM_INLINE void copy_buf(Tensor& dst, float* src, long size) {
+  long i;
+  auto output = dst.flat<float>();
+  for (i = 0; i < size; ++i) output(i) = src[i];
 }
 
-LIBXSMM_INLINE void init_buf(float* buf, long size, int initPos, int initOne)
-{
+LIBXSMM_INLINE void init_buf(float* buf, long size, int initPos, int initOne) {
   int i;
   zero_buf(buf, size);
   for (i = 0; i < size; ++i) {
-    buf[i] = (float)((initOne != 0) ? 1.0 : ((initPos != 0) ? drand48() : (0.05 - drand48()/10.0)));
+    buf[i] =
+        (float)((initOne != 0)
+                    ? 1.0
+                    : ((initPos != 0) ? drand48() : (0.05 - drand48() / 10.0)));
   }
 }
 
-
-
-LIBXSMM_INLINE void naive_conv_fp(naive_conv_t* param, const float* input, float* output, const float* filter)
-{
-  int nImg      = param->nImg;
-  int nIfm      = param->nIfm;
-  int nOfm      = param->nOfm;
-  int ifhp      = param->ifhp;
-  int ifwp      = param->ifwp;
-  int ofhp      = param->ofhp;
-  int ofwp      = param->ofwp;
-  int ifh       = param->ifh;
-  int ifw       = param->ifw;
-  int ofh       = param->ofh;
-  int ofw       = param->ofw;
-  int pad_h     = param->pad_h;
-  int pad_w     = param->pad_w;
-  int pad_h_in  = param->pad_h_in;
-  int pad_w_in  = param->pad_w_in;
+LIBXSMM_INLINE void naive_conv_fp(naive_conv_t* param, const float* input,
+                                  float* output, const float* filter) {
+  int nImg = param->nImg;
+  int nIfm = param->nIfm;
+  int nOfm = param->nOfm;
+  int ifhp = param->ifhp;
+  int ifwp = param->ifwp;
+  int ofhp = param->ofhp;
+  int ofwp = param->ofwp;
+  int ifh = param->ifh;
+  int ifw = param->ifw;
+  int ofh = param->ofh;
+  int ofw = param->ofw;
+  int pad_h = param->pad_h;
+  int pad_w = param->pad_w;
+  int pad_h_in = param->pad_h_in;
+  int pad_w_in = param->pad_w_in;
   int pad_h_out = param->pad_h_out;
   int pad_w_out = param->pad_w_out;
-  int kh        = param->kh;
-  int kw        = param->kw;
-  int stride_h  = param->stride_h;
-  int stride_w  = param->stride_w;
+  int kh = param->kh;
+  int kw = param->kw;
+  int stride_h = param->stride_h;
+  int stride_w = param->stride_w;
   /* loop counters */
   int img, ofm, ifm, oj, oi, ij, ii, kj, ki;
 
-  LIBXSMM_VLA_DECL(4,       float, output_t, output + (pad_w_out * ofwp + pad_h_out), nOfm, ofhp, ofwp);
-  LIBXSMM_VLA_DECL(4, const float,  input_t,  input + (pad_w_in * ifwp + pad_h_in), nIfm, ifhp, ifwp);
+  LIBXSMM_VLA_DECL(4, float, output_t, output + (pad_w_out * ofwp + pad_h_out),
+                   nOfm, ofhp, ofwp);
+  LIBXSMM_VLA_DECL(4, const float, input_t,
+                   input + (pad_w_in * ifwp + pad_h_in), nIfm, ifhp, ifwp);
   LIBXSMM_VLA_DECL(4, const float, filter_t, filter, nIfm, kh, kw);
 
   for (img = 0; img < nImg; ++img) {
@@ -151,12 +147,15 @@ LIBXSMM_INLINE void naive_conv_fp(naive_conv_t* param, const float* input, float
           for (oi = 0; oi < ofw; ++oi) {
             ii = oi * stride_w - pad_w;
             for (kj = 0; kj < kh; ++kj) {
-              if(ij+kj < 0 || ij+kj >= ifh) continue;
+              if (ij + kj < 0 || ij + kj >= ifh) continue;
               for (ki = 0; ki < kw; ++ki) {
-                if(ii+ki < 0 || ii+ki >= ifw) continue;
-                LIBXSMM_VLA_ACCESS(  4, output_t, img, ofm, oj, oi, nOfm, ofhp, ofwp) +=
-                  LIBXSMM_VLA_ACCESS(4,  input_t, img, ifm, ij + kj, ii + ki, nIfm, ifhp, ifwp)
-                * LIBXSMM_VLA_ACCESS(4, filter_t, ofm, ifm, kj, ki, nIfm, kh, kw);
+                if (ii + ki < 0 || ii + ki >= ifw) continue;
+                LIBXSMM_VLA_ACCESS(4, output_t, img, ofm, oj, oi, nOfm, ofhp,
+                                   ofwp) +=
+                    LIBXSMM_VLA_ACCESS(4, input_t, img, ifm, ij + kj, ii + ki,
+                                       nIfm, ifhp, ifwp) *
+                    LIBXSMM_VLA_ACCESS(4, filter_t, ofm, ifm, kj, ki, nIfm, kh,
+                                       kw);
               }
             }
           }
@@ -168,134 +167,118 @@ LIBXSMM_INLINE void naive_conv_fp(naive_conv_t* param, const float* input, float
 
 void RunXsmmVsGeneric() {}
 
-
 class XsmmConv2DTest : public OpsTestBase {
  protected:
   void MakeOp(int stride) {
-
     TF_CHECK_OK(NodeDefBuilder("xsmm", "Conv2D")
-                      .Input(FakeInput(DT_FLOAT))
-                      .Input(FakeInput(DT_FLOAT))
-                      .Attr("strides", {1, stride,stride, 1})
-                      .Attr("padding", "VALID" )
-                      .Finalize(node_def()));
-
+                    .Input(FakeInput(DT_FLOAT))
+                    .Input(FakeInput(DT_FLOAT))
+                    .Attr("strides", {1, stride, stride, 1})
+                    .Attr("padding", "VALID")
+                    .Finalize(node_def()));
 
     TF_ASSERT_OK(InitOp());
   }
 };
 
 TEST_F(XsmmConv2DTest, Basic) {
-     MakeOp(1);
+  MakeOp(1);
 
-     // setup scoped allocator, which uses cpu_allocator() for this scope
-     const libxsmm_tf_allocator<libxsmm_scratch_allocator> tf_allocator;
+  // setup scoped allocator, which uses cpu_allocator() for this scope
+  const libxsmm_tf_allocator<libxsmm_scratch_allocator> tf_allocator;
 
-     int ifw = 14;           /* input width, "W" */
-     int ifh = 14;           /* input height, "H" */
-     int nImg = 32;          /* mini-batch size, "N" */
-     int nIfm = 64;         /* number of input feature maps, "C" */
-     int nOfm = 64;         /* number of output feature maps, "K" */
-     int kh = 3;             /* filter height, "R" */
-     int kw = 3;             /* filter width, "S" */
-     int pad = 0;            /* padding in output */
-     int stride = 1;         /* stride when accessing inputs */
+  int ifw = 14;   /* input width, "W" */
+  int ifh = 14;   /* input height, "H" */
+  int nImg = 32;  /* mini-batch size, "N" */
+  int nIfm = 64;  /* number of input feature maps, "C" */
+  int nOfm = 64;  /* number of output feature maps, "K" */
+  int kh = 3;     /* filter height, "R" */
+  int kw = 3;     /* filter width, "S" */
+  int pad = 0;    /* padding in output */
+  int stride = 1; /* stride when accessing inputs */
 
+  int stride_w = stride;
+  int stride_h = stride;
+  int pad_h = pad;
+  int pad_w = pad;
 
-     int stride_w = stride;
-     int stride_h = stride;
-     int pad_h = pad;
-     int pad_w = pad;
+  int pad_h_in = pad_h;
+  int pad_w_in = pad_w;
 
-     int pad_h_in = pad_h;
-     int pad_w_in = pad_w;
-
-     int pad_h_out = 0;
-     int pad_w_out = 0;
+  int pad_h_out = 0;
+  int pad_w_out = 0;
 
   /* deriving some values for naive code */
-     int ofh = (ifh + 2 * pad_h - kh) / stride_h + 1;
-     int ofw = (ifw + 2 * pad_w - kw) / stride_w + 1;
-     int ifhp = ifh + 2 * pad_h_in;
-     int ifwp = ifw + 2 * pad_w_in;
-     int ofhp = ofh + 2 * pad_h_out;
-     int ofwp = ofw + 2 * pad_w_out;
-
-
-    //Initialization of Filter and Image
-
-    /* allocate data */
-     float *naive_input           = (float*)libxsmm_aligned_scratch( nImg*nIfm*ifhp*ifwp*sizeof(float), 2097152);
-     float *naive_output          = (float*)libxsmm_aligned_scratch( nImg*nOfm*ofhp*ofwp*sizeof(float), 2097152);
-     float *naive_filter          = (float*)libxsmm_aligned_scratch( nOfm*nIfm*kh*kw*    sizeof(float), 2097152);
-     /* initialize data */
-     init_buf(naive_input,          nImg*nIfm*ifhp*ifwp, 0, 0);
-     zero_buf(naive_output,         nImg*nOfm*ofhp*ofwp);
-     init_buf(naive_filter,         nOfm*nIfm*kh*kw, 0, 0);
-
-
-     Tensor image(DT_FLOAT,
-                 {nImg, ifhp, ifwp, nIfm});
-
-
-     Tensor filter(DT_FLOAT, {kh,kw,nIfm,nOfm});
-
-
-     naive_copy_NCHW_to_NHWC(naive_input, image, nImg, ifhp, ifwp, nIfm);
-     naive_copy_KCRS_to_RSCK(naive_filter, filter, kh, kw, nIfm, nOfm);
-
-
-    //Run naive convolution
-
-     naive_conv_t naive_param;
-
-     naive_param.nImg = nImg;
-     naive_param.nIfm = nIfm;
-     naive_param.nOfm = nOfm;
-     naive_param.ifhp = ifhp;
-     naive_param.ifwp = ifwp;
-     naive_param.ofhp = ofhp;
-     naive_param.ofwp = ofwp;
-     naive_param.ifh = ifh;
-     naive_param.ifw = ifw;
-     naive_param.ofh = ofh;
-     naive_param.ofw = ofw;
-     naive_param.pad_h = pad_h;
-     naive_param.pad_w = pad_w;
-     naive_param.pad_h_in = pad_h_in;
-     naive_param.pad_w_in = pad_w_in;
-     naive_param.pad_h_out = pad_h_out;
-     naive_param.pad_w_out = pad_w_out;
-     naive_param.kh = kh;
-     naive_param.kw = kw;
-     naive_param.stride_h = stride_h;
-     naive_param.stride_w = stride_w;
-
-
-     naive_conv_fp(&naive_param, naive_input, naive_output, naive_filter);
-
-
-
-     AddInputFromArray<float>(image.shape(), image.flat<float>());
-     AddInputFromArray<float>(filter.shape(), filter.flat<float>());
-
-
-
-     //Run Op (TF)
-     TF_ASSERT_OK(RunOpKernel());
-
-     // Check the output.
-     Tensor expected(DT_FLOAT, {nImg,ofhp,ofwp, nOfm});
-     naive_copy_NCHW_to_NHWC(naive_output, expected, nImg, ofhp, ofwp, nOfm);
-
-
-     test::ExpectTensorNear<float>(expected, *GetOutput(0), 1e-5);
-     libxsmm_free(naive_input);
-     libxsmm_free(naive_output);
-     libxsmm_free(naive_filter);
-
-
-
+  int ofh = (ifh + 2 * pad_h - kh) / stride_h + 1;
+  int ofw = (ifw + 2 * pad_w - kw) / stride_w + 1;
+  int ifhp = ifh + 2 * pad_h_in;
+  int ifwp = ifw + 2 * pad_w_in;
+  int ofhp = ofh + 2 * pad_h_out;
+  int ofwp = ofw + 2 * pad_w_out;
+
+  // Initialization of Filter and Image
+
+  /* allocate data */
+  float* naive_input = (float*)libxsmm_aligned_scratch(
+      nImg * nIfm * ifhp * ifwp * sizeof(float), 2097152);
+  float* naive_output = (float*)libxsmm_aligned_scratch(
+      nImg * nOfm * ofhp * ofwp * sizeof(float), 2097152);
+  float* naive_filter = (float*)libxsmm_aligned_scratch(
+      nOfm * nIfm * kh * kw * sizeof(float), 2097152);
+  /* initialize data */
+  init_buf(naive_input, nImg * nIfm * ifhp * ifwp, 0, 0);
+  zero_buf(naive_output, nImg * nOfm * ofhp * ofwp);
+  init_buf(naive_filter, nOfm * nIfm * kh * kw, 0, 0);
+
+  Tensor image(DT_FLOAT, {nImg, ifhp, ifwp, nIfm});
+
+  Tensor filter(DT_FLOAT, {kh, kw, nIfm, nOfm});
+
+  naive_copy_NCHW_to_NHWC(naive_input, image, nImg, ifhp, ifwp, nIfm);
+  naive_copy_KCRS_to_RSCK(naive_filter, filter, kh, kw, nIfm, nOfm);
+
+  // Run naive convolution
+
+  naive_conv_t naive_param;
+
+  naive_param.nImg = nImg;
+  naive_param.nIfm = nIfm;
+  naive_param.nOfm = nOfm;
+  naive_param.ifhp = ifhp;
+  naive_param.ifwp = ifwp;
+  naive_param.ofhp = ofhp;
+  naive_param.ofwp = ofwp;
+  naive_param.ifh = ifh;
+  naive_param.ifw = ifw;
+  naive_param.ofh = ofh;
+  naive_param.ofw = ofw;
+  naive_param.pad_h = pad_h;
+  naive_param.pad_w = pad_w;
+  naive_param.pad_h_in = pad_h_in;
+  naive_param.pad_w_in = pad_w_in;
+  naive_param.pad_h_out = pad_h_out;
+  naive_param.pad_w_out = pad_w_out;
+  naive_param.kh = kh;
+  naive_param.kw = kw;
+  naive_param.stride_h = stride_h;
+  naive_param.stride_w = stride_w;
+
+  naive_conv_fp(&naive_param, naive_input, naive_output, naive_filter);
+
+  AddInputFromArray<float>(image.shape(), image.flat<float>());
+  AddInputFromArray<float>(filter.shape(), filter.flat<float>());
+
+  // Run Op (TF)
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output.
+  Tensor expected(DT_FLOAT, {nImg, ofhp, ofwp, nOfm});
+  naive_copy_NCHW_to_NHWC(naive_output, expected, nImg, ofhp, ofwp, nOfm);
+
+  test::ExpectTensorNear<float>(expected, *GetOutput(0), 1e-5);
+  libxsmm_free(naive_input);
+  libxsmm_free(naive_output);
+  libxsmm_free(naive_filter);
 }
 
 /*
@@ -325,7 +308,8 @@ TEST(XsmmConv2DTest, Basic) {
     desc.threads = num_threads;
     desc.algo = LIBXSMM_DNN_CONV_ALGO_DIRECT;
     desc.buffer_format = LIBXSMM_DNN_TENSOR_FORMAT_NHWC;
-    desc.filter_format = LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM;//LIBXSMM_DNN_TENSOR_FORMAT_RSCK;
+    desc.filter_format =
+LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM;//LIBXSMM_DNN_TENSOR_FORMAT_RSCK;
     desc.fuse_ops = LIBXSMM_DNN_CONV_FUSE_NONE;
     desc.options = LIBXSMM_DNN_CONV_OPTION_NONE;
     desc.datatype = LIBXSMM_DNN_DATATYPE_F32;
diff --git a/tensorflow/core/lib/core/status.h b/tensorflow/core/lib/core/status.h
index 58a50a70c26a63a9edd55349e2253a9ace16f1f2..49f74ff47fbc839c84465ba86e85b38cb3bd38ec 100644
--- a/tensorflow/core/lib/core/status.h
+++ b/tensorflow/core/lib/core/status.h
@@ -131,7 +131,7 @@ inline tensorflow::string* TfCheckOpHelper(::tensorflow::Status v,
   while (auto _result = ::tensorflow::TfCheckOpHelper(val, #val)) \
   LOG(level) << *(_result)
 
-#define TF_CHECK_OK(val)  TF_DO_CHECK_OK(val, FATAL)
+#define TF_CHECK_OK(val) TF_DO_CHECK_OK(val, FATAL)
 #define TF_QCHECK_OK(val) TF_DO_CHECK_OK(val, QFATAL)
 
 // DEBUG only version of TF_CHECK_OK.  Compiler still parses 'val' even in opt
diff --git a/tensorflow/core/lib/core/threadpool.cc b/tensorflow/core/lib/core/threadpool.cc
index 2b10ebeaf7cbed4a8466a69898d6d4d6660ed5cb..e55ed79d36cd2db7a6f6b19f3579f47e73b4b2d9 100644
--- a/tensorflow/core/lib/core/threadpool.cc
+++ b/tensorflow/core/lib/core/threadpool.cc
@@ -66,7 +66,9 @@ struct EigenEnvironment {
     }
     return Task{
         std::unique_ptr<TaskImpl>(new TaskImpl{
-            std::move(f), Context(ContextKind::kThread), id,
+            std::move(f),
+            Context(ContextKind::kThread),
+            id,
         }),
     };
   }
diff --git a/tensorflow/core/lib/core/threadpool_test.cc b/tensorflow/core/lib/core/threadpool_test.cc
index 49ddb16645c32a82d90eafa5f550b8887ac84b79..627ef5a892a35ec43d0c31220dcf046b4b8eda55 100644
--- a/tensorflow/core/lib/core/threadpool_test.cc
+++ b/tensorflow/core/lib/core/threadpool_test.cc
@@ -97,8 +97,8 @@ TEST(ThreadPool, ParallelForWithWorkerId) {
     }
     pool.ParallelForWithWorkerId(
         kWorkItems, kHugeCost,
-        [&threads_running, &work, num_threads](
-            int64 begin, int64 end, int64 id) {
+        [&threads_running, &work, num_threads](int64 begin, int64 end,
+                                               int64 id) {
           // Store true for the current thread, and assert that another thread
           // is not running with the same id.
           ASSERT_LE(0, id);
diff --git a/tensorflow/core/lib/db/sqlite.h b/tensorflow/core/lib/db/sqlite.h
index 0faa458f1d692a103099d5b05d0400944ffdaad7..efe97f78d259199a74bf5e830f70de657d1cd679 100644
--- a/tensorflow/core/lib/db/sqlite.h
+++ b/tensorflow/core/lib/db/sqlite.h
@@ -18,12 +18,12 @@ limitations under the License.
 #include <mutex>
 
 #include "sqlite3.h"
+#include "tensorflow/core/lib/core/refcount.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/thread_annotations.h"
 #include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/lib/core/refcount.h"
 
 /// TensorFlow SQLite Veneer
 ///
@@ -121,10 +121,7 @@ class LOCKABLE Sqlite : public core::RefCounted {
 
   Sqlite(sqlite3* db, sqlite3_stmt* begin, sqlite3_stmt* commit,
          sqlite3_stmt* rollback) noexcept
-      : db_(db),
-        begin_(begin),
-        commit_(commit),
-        rollback_(rollback) {}
+      : db_(db), begin_(begin), commit_(commit), rollback_(rollback) {}
 
   sqlite3* const db_;
   sqlite3_stmt* const begin_;
@@ -233,7 +230,8 @@ class SqliteStatement {
   /// freed until this statement is Reset() or finalized.
   void BindText(int parameter, const StringPiece& text) {
     Update(sqlite3_bind_text64(stmt_, parameter, text.data(), text.size(),
-                               SQLITE_TRANSIENT, SQLITE_UTF8), parameter);
+                               SQLITE_TRANSIENT, SQLITE_UTF8),
+           parameter);
     size_ += text.size();
   }
   void BindText(const char* parameter, const StringPiece& text) {
@@ -241,7 +239,8 @@ class SqliteStatement {
   }
   void BindTextUnsafe(int parameter, const StringPiece& text) {
     Update(sqlite3_bind_text64(stmt_, parameter, text.data(), text.size(),
-                               SQLITE_STATIC, SQLITE_UTF8), parameter);
+                               SQLITE_STATIC, SQLITE_UTF8),
+           parameter);
     size_ += text.size();
   }
   void BindTextUnsafe(const char* parameter, const StringPiece& text) {
@@ -254,7 +253,8 @@ class SqliteStatement {
   /// freed until this statement is Reset() or finalized.
   void BindBlob(int parameter, const StringPiece& blob) {
     Update(sqlite3_bind_blob64(stmt_, parameter, blob.data(), blob.size(),
-                               SQLITE_TRANSIENT), parameter);
+                               SQLITE_TRANSIENT),
+           parameter);
     size_ += blob.size();
   }
   void BindBlob(const char* parameter, const StringPiece& blob) {
@@ -262,7 +262,8 @@ class SqliteStatement {
   }
   void BindBlobUnsafe(int parameter, const StringPiece& blob) {
     Update(sqlite3_bind_blob64(stmt_, parameter, blob.data(), blob.size(),
-                               SQLITE_STATIC), parameter);
+                               SQLITE_STATIC),
+           parameter);
     size_ += blob.size();
   }
   void BindBlobUnsafe(const char* parameter, const StringPiece& text) {
@@ -320,9 +321,7 @@ class SqliteStatement {
 
   /// \brief Move constructor, after which <other> is reset to empty.
   SqliteStatement(SqliteStatement&& other) noexcept
-      : db_(other.db_),
-        stmt_(other.stmt_),
-        bind_error_(other.bind_error_) {
+      : db_(other.db_), stmt_(other.stmt_), bind_error_(other.bind_error_) {
     other.db_ = nullptr;
     other.stmt_ = nullptr;
     other.bind_error_ = SQLITE_OK;
diff --git a/tensorflow/core/lib/db/sqlite_test.cc b/tensorflow/core/lib/db/sqlite_test.cc
index c9c76ea5f2cd30b8abe7e3c9766ce4946ca25200..1e88323d017bec4b2705c6dbb19005efb8adbaa9 100644
--- a/tensorflow/core/lib/db/sqlite_test.cc
+++ b/tensorflow/core/lib/db/sqlite_test.cc
@@ -33,9 +33,7 @@ class SqliteTest : public ::testing::Test {
     db_->PrepareOrDie("CREATE TABLE T (a BLOB, b BLOB)").StepAndResetOrDie();
   }
 
-  void TearDown() override {
-    db_->Unref();
-  }
+  void TearDown() override { db_->Unref(); }
 
   Sqlite* db_;
   bool is_done_;
@@ -213,7 +211,7 @@ TEST_F(SqliteTest, BindFailed) {
   Status s = stmt.StepOnce();
   EXPECT_NE(string::npos,
             s.error_message().find("INSERT INTO T (a) VALUES (123)"))
-            << s.error_message();
+      << s.error_message();
 }
 
 TEST_F(SqliteTest, SnappyExtension) {
@@ -226,7 +224,7 @@ TEST_F(SqliteTest, SnappyBinaryCompatibility) {
   EXPECT_EQ(
       "today is the end of the republic",
       db_->PrepareOrDie("SELECT UNSNAP(X'03207C746F6461792069732074686520656E64"
-                            "206F66207468652072657075626C6963')")
+                        "206F66207468652072657075626C6963')")
           .StepOnceOrDie()
           .ColumnString(0));
 }
diff --git a/tensorflow/core/lib/gif/gif_io.cc b/tensorflow/core/lib/gif/gif_io.cc
index 0f6999c88fca3fd7ab91d2f3e28348e22d106f45..e5deb2b873e22249cc52323b1b29518e4255d48a 100644
--- a/tensorflow/core/lib/gif/gif_io.cc
+++ b/tensorflow/core/lib/gif/gif_io.cc
@@ -44,6 +44,14 @@ int input_callback(GifFileType* gif_file, GifByteType* buf, int size) {
   return 0;
 }
 
+static const char* GifErrorStringNonNull(int error_code) {
+  const char* error_string = GifErrorString(error_code);
+  if (error_string == nullptr) {
+    return "Unknown error";
+  }
+  return error_string;
+}
+
 uint8* Decode(const void* srcdata, int datasize,
               const std::function<uint8*(int, int, int, int)>& allocate_output,
               string* error_string) {
@@ -55,17 +63,17 @@ uint8* Decode(const void* srcdata, int datasize,
     int error_code = D_GIF_SUCCEEDED;
     if (gif_file && DGifCloseFile(gif_file, &error_code) != GIF_OK) {
       LOG(WARNING) << "Fail to close gif file, reason: "
-                   << GifErrorString(error_code);
+                   << GifErrorStringNonNull(error_code);
     }
   });
   if (error_code != D_GIF_SUCCEEDED) {
     *error_string = strings::StrCat("failed to open gif file: ",
-                                    GifErrorString(error_code));
+                                    GifErrorStringNonNull(error_code));
     return nullptr;
   }
   if (DGifSlurp(gif_file) != GIF_OK) {
     *error_string = strings::StrCat("failed to slurp gif file: ",
-                                    GifErrorString(gif_file->Error));
+                                    GifErrorStringNonNull(gif_file->Error));
     return nullptr;
   }
   if (gif_file->ImageCount <= 0) {
diff --git a/tensorflow/core/lib/gtl/cleanup.h b/tensorflow/core/lib/gtl/cleanup.h
index 6053e986402598568299d1756d23068693c193c8..6bd60ca482430cf13f4f076badf460cf2e1d593b 100644
--- a/tensorflow/core/lib/gtl/cleanup.h
+++ b/tensorflow/core/lib/gtl/cleanup.h
@@ -55,22 +55,21 @@ namespace gtl {
 template <typename F>
 class Cleanup {
  public:
-  Cleanup()
-      : released_(true), f_() {}
+  Cleanup() : released_(true), f_() {}
 
   template <typename G>
-  explicit Cleanup(G&& f)  // NOLINT
+  explicit Cleanup(G&& f)          // NOLINT
       : f_(std::forward<G>(f)) {}  // NOLINT(build/c++11)
 
   Cleanup(Cleanup&& src)  // NOLINT
-      : released_(src.is_released()), f_(src.release()) { }
+      : released_(src.is_released()), f_(src.release()) {}
 
   // Implicitly move-constructible from any compatible Cleanup<G>.
   // The source will be released as if src.release() were called.
   // A moved-from Cleanup can be safely destroyed or reassigned.
   template <typename G>
   Cleanup(Cleanup<G>&& src)  // NOLINT
-      : released_(src.is_released()), f_(src.release()) { }
+      : released_(src.is_released()), f_(src.release()) {}
 
   // Assignment to a Cleanup object behaves like destroying it
   // and making a new one in its place, analogous to unique_ptr
@@ -102,8 +101,8 @@ class Cleanup {
   F f_;
 };
 
-template <int&... ExplicitParameterBarrier,
-          typename F, typename DecayF = typename std::decay<F>::type>
+template <int&... ExplicitParameterBarrier, typename F,
+          typename DecayF = typename std::decay<F>::type>
 TF_MUST_USE_RESULT Cleanup<DecayF> MakeCleanup(F&& f) {
   return Cleanup<DecayF>(std::forward<F>(f));
 }
diff --git a/tensorflow/core/lib/gtl/cleanup_test.cc b/tensorflow/core/lib/gtl/cleanup_test.cc
index bd151cb2ab1c8a830eb1bd9546ab452d05c6c20c..a86ffd5fe284485f15fa824026e8d79f5191a384 100644
--- a/tensorflow/core/lib/gtl/cleanup_test.cc
+++ b/tensorflow/core/lib/gtl/cleanup_test.cc
@@ -65,15 +65,14 @@ TEST(CleanupTest, Release) {
 TEST(FinallyTest, TypeErasedWithoutFactory) {
   string s = "active";
   {
-    AnyCleanup s_cleaner([&s]{ s.append(" clean"); });
+    AnyCleanup s_cleaner([&s] { s.append(" clean"); });
     EXPECT_EQ("active", s);
   }
   EXPECT_EQ("active clean", s);
 }
 
 struct Appender {
-  Appender(string* s, const string& msg)
-      : s_(s), msg_(msg) {}
+  Appender(string* s, const string& msg) : s_(s), msg_(msg) {}
   void operator()() const { s_->append(msg_); }
   string* s_;
   string msg_;
@@ -163,7 +162,12 @@ class CleanupReferenceTest : public ::testing::Test {
     int* i;
     F(int* cp, int* i) : cp(cp), i(i) {}
     F(const F& o) : cp(o.cp), i(o.i) { ++*cp; }
-    F& operator=(const F& o) { cp = o.cp; i = o.i; ++*cp; return *this; }
+    F& operator=(const F& o) {
+      cp = o.cp;
+      i = o.i;
+      ++*cp;
+      return *this;
+    }
     F(F&&) = default;
     F& operator=(F&&) = default;
     void operator()() const { ++*i; }
@@ -279,7 +283,7 @@ BENCHMARK(BM_AnyCleanup);
 
 void BM_AnyCleanupNoFactory(int iters) {
   while (iters--) {
-    AnyCleanup fin([]{Incr();});
+    AnyCleanup fin([] { Incr(); });
   }
 }
 BENCHMARK(BM_AnyCleanupNoFactory);
diff --git a/tensorflow/core/lib/gtl/inlined_vector.h b/tensorflow/core/lib/gtl/inlined_vector.h
index d6e5d9effa794c46b7aa98691bb993dbd7e764c8..6e3cb2206d9658a3b0bc24b506049f503ae304ed 100644
--- a/tensorflow/core/lib/gtl/inlined_vector.h
+++ b/tensorflow/core/lib/gtl/inlined_vector.h
@@ -31,12 +31,12 @@ limitations under the License.
 #ifndef TENSORFLOW_LIB_GTL_INLINED_VECTOR_H_
 #define TENSORFLOW_LIB_GTL_INLINED_VECTOR_H_
 
-#include <cstddef>
 #include <stddef.h>
 #include <stdlib.h>
 #include <string.h>
 #include <sys/types.h>
 #include <algorithm>
+#include <cstddef>
 #include <iterator>
 #include <memory>
 #include <type_traits>
@@ -407,7 +407,7 @@ class InlinedVector {
   };
   // 2) Construct a T with args at not-yet-initialized memory pointed by dst.
   struct Construct {
-    template<class... Args>
+    template <class... Args>
     void operator()(T* dst, Args&&... args) const {
       new (dst) T(std::forward<Args>(args)...);
     }
diff --git a/tensorflow/core/lib/gtl/int_type.h b/tensorflow/core/lib/gtl/int_type.h
index 647fc81aa7e4925d1d2b74b82146b18b0c17a4a9..af3e50ad78ff9d07bc0e8e79a5ff7cb3d1aacbfe 100644
--- a/tensorflow/core/lib/gtl/int_type.h
+++ b/tensorflow/core/lib/gtl/int_type.h
@@ -255,13 +255,13 @@ class IntType {
     value_ op arg_value;                             \
     return *this;                                    \
   }
-  INT_TYPE_ASSIGNMENT_OP(+= );
-  INT_TYPE_ASSIGNMENT_OP(-= );
-  INT_TYPE_ASSIGNMENT_OP(*= );
-  INT_TYPE_ASSIGNMENT_OP(/= );
-  INT_TYPE_ASSIGNMENT_OP(<<= );  // NOLINT
-  INT_TYPE_ASSIGNMENT_OP(>>= );  // NOLINT
-  INT_TYPE_ASSIGNMENT_OP(%= );
+  INT_TYPE_ASSIGNMENT_OP(+=);
+  INT_TYPE_ASSIGNMENT_OP(-=);
+  INT_TYPE_ASSIGNMENT_OP(*=);
+  INT_TYPE_ASSIGNMENT_OP(/=);
+  INT_TYPE_ASSIGNMENT_OP(<<=);  // NOLINT
+  INT_TYPE_ASSIGNMENT_OP(>>=);  // NOLINT
+  INT_TYPE_ASSIGNMENT_OP(%=);
 #undef INT_TYPE_ASSIGNMENT_OP
 
   ThisType& operator=(ValueType arg_value) {
@@ -314,10 +314,10 @@ std::ostream& operator<<(std::ostream& os,  // NOLINT
 INT_TYPE_ARITHMETIC_OP(+);
 INT_TYPE_ARITHMETIC_OP(-);
 INT_TYPE_ARITHMETIC_OP(*);
-INT_TYPE_ARITHMETIC_OP(/ );
-INT_TYPE_ARITHMETIC_OP(<< );  // NOLINT
-INT_TYPE_ARITHMETIC_OP(>> );  // NOLINT
-INT_TYPE_ARITHMETIC_OP(% );
+INT_TYPE_ARITHMETIC_OP(/);
+INT_TYPE_ARITHMETIC_OP(<<);  // NOLINT
+INT_TYPE_ARITHMETIC_OP(>>);  // NOLINT
+INT_TYPE_ARITHMETIC_OP(%);
 #undef INT_TYPE_ARITHMETIC_OP
 
 // -- NON-MEMBER COMPARISON OPERATORS ------------------------------------------
@@ -345,12 +345,12 @@ INT_TYPE_ARITHMETIC_OP(% );
       IntType<IntTypeName, ValueType> id) {                      \
     return val op id.value();                                    \
   }
-INT_TYPE_COMPARISON_OP(== );  // NOLINT
-INT_TYPE_COMPARISON_OP(!= );  // NOLINT
-INT_TYPE_COMPARISON_OP(< );   // NOLINT
-INT_TYPE_COMPARISON_OP(<= );  // NOLINT
-INT_TYPE_COMPARISON_OP(> );   // NOLINT
-INT_TYPE_COMPARISON_OP(>= );  // NOLINT
+INT_TYPE_COMPARISON_OP(==);  // NOLINT
+INT_TYPE_COMPARISON_OP(!=);  // NOLINT
+INT_TYPE_COMPARISON_OP(<);   // NOLINT
+INT_TYPE_COMPARISON_OP(<=);  // NOLINT
+INT_TYPE_COMPARISON_OP(>);   // NOLINT
+INT_TYPE_COMPARISON_OP(>=);  // NOLINT
 #undef INT_TYPE_COMPARISON_OP
 
 }  // namespace gtl
diff --git a/tensorflow/core/lib/gtl/int_type_test.cc b/tensorflow/core/lib/gtl/int_type_test.cc
index d3c405d9acdb221f465e98d957ba55ba6bc63f57..61d364017cb90933e8e9af7e800db4a6988d8442 100644
--- a/tensorflow/core/lib/gtl/int_type_test.cc
+++ b/tensorflow/core/lib/gtl/int_type_test.cc
@@ -42,7 +42,8 @@ class IntTypeTest : public ::testing::Test {
 
 // All tests below will be executed on all supported IntTypes.
 typedef ::testing::Types<Int8_IT, UInt8_IT, Int16_IT, UInt16_IT, Int32_IT,
-                         Int64_IT, UInt64_IT, Long_IT> SupportedIntTypes;
+                         Int64_IT, UInt64_IT, Long_IT>
+    SupportedIntTypes;
 
 TYPED_TEST_CASE(IntTypeTest, SupportedIntTypes);
 
@@ -232,7 +233,8 @@ TYPED_TEST(IntTypeTest, TestOperators) {
 
 TYPED_TEST(IntTypeTest, TestHashFunctor) {
   std::unordered_map<typename TestFixture::T, char,
-                     typename TestFixture::T::Hasher> map;
+                     typename TestFixture::T::Hasher>
+      map;
   typename TestFixture::T a(0);
   map[a] = 'c';
   EXPECT_EQ('c', map[a]);
diff --git a/tensorflow/core/lib/gtl/optional.h b/tensorflow/core/lib/gtl/optional.h
index 2ff8b9c7d1adbbc206e0429142389e9730efa33c..fa33c24c0c006aa5d3fed5102980da865e12696a 100644
--- a/tensorflow/core/lib/gtl/optional.h
+++ b/tensorflow/core/lib/gtl/optional.h
@@ -593,12 +593,12 @@ class optional : private internal_optional::optional_data<T>,
     assert(this->engaged_);
     return this->pointer();
   }
-  constexpr const T& operator*() const & { return reference(); }
+  constexpr const T& operator*() const& { return reference(); }
   T& operator*() & {
     assert(this->engaged_);
     return reference();
   }
-  constexpr const T&& operator*() const && { return std::move(reference()); }
+  constexpr const T&& operator*() const&& { return std::move(reference()); }
   T&& operator*() && {
     assert(this->engaged_);
     return std::move(reference());
@@ -621,7 +621,7 @@ class optional : private internal_optional::optional_data<T>,
   // Use `opt.value()` to get a reference to underlying value.  The constness
   // and lvalue/rvalue-ness of `opt` is preserved to the view of the T
   // subobject.
-  const T& value() const & {
+  const T& value() const& {
     CHECK(*this) << "Bad optional access";
     return reference();
   }
@@ -633,7 +633,7 @@ class optional : private internal_optional::optional_data<T>,
     CHECK(*this) << "Bad optional access";
     return std::move(reference());
   }
-  const T&& value() const && {  // NOLINT(build/c++11)
+  const T&& value() const&& {  // NOLINT(build/c++11)
     CHECK(*this) << "Bad optional access";
     return std::move(reference());
   }
@@ -641,7 +641,7 @@ class optional : private internal_optional::optional_data<T>,
   // Use `opt.value_or(val)` to get either the value of T or the given default
   // `val` in the empty case.
   template <class U>
-  constexpr T value_or(U&& v) const & {
+  constexpr T value_or(U&& v) const& {
     return static_cast<bool>(*this) ? **this
                                     : static_cast<T>(std::forward<U>(v));
   }
@@ -656,8 +656,8 @@ class optional : private internal_optional::optional_data<T>,
   constexpr const T& reference() const { return *this->pointer(); }
   T& reference() { return *(this->pointer()); }
 
-  // T constraint checks.  You can't have an optional of nullopt_t, in_place_t or
-  // a reference.
+  // T constraint checks.  You can't have an optional of nullopt_t, in_place_t
+  // or a reference.
   static_assert(
       !std::is_same<nullopt_t, typename std::remove_cv<T>::type>::value,
       "optional<nullopt_t> is not allowed.");
diff --git a/tensorflow/core/lib/gtl/optional_test.cc b/tensorflow/core/lib/gtl/optional_test.cc
index 547bee7b75f3d05e290ec7d53d889ff7e82794a9..12b5bbc60be9961a5f852210c42479b2cd48ea92 100644
--- a/tensorflow/core/lib/gtl/optional_test.cc
+++ b/tensorflow/core/lib/gtl/optional_test.cc
@@ -24,17 +24,29 @@ limitations under the License.
 namespace tensorflow {
 namespace {
 
-using tensorflow::gtl::optional;
-using tensorflow::gtl::nullopt;
-using tensorflow::gtl::nullopt_t;
 using tensorflow::gtl::in_place;
 using tensorflow::gtl::in_place_t;
 using tensorflow::gtl::make_optional;
+using tensorflow::gtl::nullopt;
+using tensorflow::gtl::nullopt_t;
+using tensorflow::gtl::optional;
 
-template <typename T> string TypeQuals(T&) { return "&"; }
-template <typename T> string TypeQuals(T&&) { return "&&"; }
-template <typename T> string TypeQuals(const T&) { return "c&"; }
-template <typename T> string TypeQuals(const T&&) { return "c&&"; }
+template <typename T>
+string TypeQuals(T&) {
+  return "&";
+}
+template <typename T>
+string TypeQuals(T&&) {
+  return "&&";
+}
+template <typename T>
+string TypeQuals(const T&) {
+  return "c&";
+}
+template <typename T>
+string TypeQuals(const T&&) {
+  return "c&&";
+}
 
 struct StructorListener {
   int construct0 = 0;
diff --git a/tensorflow/core/lib/gtl/top_n_test.cc b/tensorflow/core/lib/gtl/top_n_test.cc
index fae85570dc071568a53abcb72fea6ffc22a465ea..ba30c072a9033073a7439f60dbfa3402dbfc5923 100644
--- a/tensorflow/core/lib/gtl/top_n_test.cc
+++ b/tensorflow/core/lib/gtl/top_n_test.cc
@@ -28,10 +28,10 @@ limitations under the License.
 
 namespace {
 
+using tensorflow::string;
 using tensorflow::gtl::TopN;
 using tensorflow::random::PhiloxRandom;
 using tensorflow::random::SimplePhilox;
-using tensorflow::string;
 
 // Move the contents from an owned raw pointer, returning by value.
 // Objects are easier to manage by value.
diff --git a/tensorflow/core/lib/io/compression.cc b/tensorflow/core/lib/io/compression.cc
index c12de98e40105907460f74f967e20aa41bdb0ceb..0d25bca9eccf2b28800a288858ffbc0caeb2dbd3 100644
--- a/tensorflow/core/lib/io/compression.cc
+++ b/tensorflow/core/lib/io/compression.cc
@@ -22,6 +22,6 @@ namespace compression {
 const char kNone[] = "";
 const char kGzip[] = "GZIP";
 
-}
-}
-}
+}  // namespace compression
+}  // namespace io
+}  // namespace tensorflow
diff --git a/tensorflow/core/lib/io/compression.h b/tensorflow/core/lib/io/compression.h
index ef90c60a3a411cdc94a9f92522116db340e04f1b..4d8e7788cad823e0e79a4e9567c6f17a3d9259cf 100644
--- a/tensorflow/core/lib/io/compression.h
+++ b/tensorflow/core/lib/io/compression.h
@@ -23,8 +23,8 @@ namespace compression {
 extern const char kNone[];
 extern const char kGzip[];
 
-}
-}
-}
+}  // namespace compression
+}  // namespace io
+}  // namespace tensorflow
 
 #endif  // TENSORFLOW_CORE_LIB_IO_COMPRESSION_H_
diff --git a/tensorflow/core/lib/io/random_inputstream.cc b/tensorflow/core/lib/io/random_inputstream.cc
index 8b8c1392a1dce339a56b718af036248f22ba0b59..09336e79cda67b324299d78c65217e6a7b40dc21 100644
--- a/tensorflow/core/lib/io/random_inputstream.cc
+++ b/tensorflow/core/lib/io/random_inputstream.cc
@@ -57,6 +57,43 @@ Status RandomAccessInputStream::ReadNBytes(int64 bytes_to_read,
   return Status::OK();
 }
 
+// To limit memory usage, the default implementation of SkipNBytes() only reads
+// 8MB at a time.
+static constexpr int64 kMaxSkipSize = 8 * 1024 * 1024;
+
+Status RandomAccessInputStream::SkipNBytes(int64 bytes_to_skip) {
+  if (bytes_to_skip < 0) {
+    return errors::InvalidArgument("Can't skip a negative number of bytes");
+  }
+  std::unique_ptr<char[]> scratch(new char[kMaxSkipSize]);
+  // Try to read 1 bytes first, if we could complete the read then EOF is
+  // not reached yet and we could return.
+  if (bytes_to_skip > 0) {
+    StringPiece data;
+    Status s = file_->Read(pos_ + bytes_to_skip - 1, 1, &data, scratch.get());
+    if ((s.ok() || errors::IsOutOfRange(s)) && data.size() == 1) {
+      pos_ += bytes_to_skip;
+      return Status::OK();
+    }
+  }
+  // Read kDefaultSkipSize at a time till bytes_to_skip.
+  while (bytes_to_skip > 0) {
+    int64 bytes_to_read = std::min<int64>(kMaxSkipSize, bytes_to_skip);
+    StringPiece data;
+    Status s = file_->Read(pos_, bytes_to_read, &data, scratch.get());
+    if (s.ok() || errors::IsOutOfRange(s)) {
+      pos_ += data.size();
+    } else {
+      return s;
+    }
+    if (data.size() < bytes_to_read) {
+      return errors::OutOfRange("reached end of file");
+    }
+    bytes_to_skip -= bytes_to_read;
+  }
+  return Status::OK();
+}
+
 int64 RandomAccessInputStream::Tell() const { return pos_; }
 
 }  // namespace io
diff --git a/tensorflow/core/lib/io/random_inputstream.h b/tensorflow/core/lib/io/random_inputstream.h
index 09ebe9ba49e741945457c82cf0c64b3c1268a694..bdbdbd71ff914cfaf1690b2813ddbab070a9f99a 100644
--- a/tensorflow/core/lib/io/random_inputstream.h
+++ b/tensorflow/core/lib/io/random_inputstream.h
@@ -34,6 +34,8 @@ class RandomAccessInputStream : public InputStreamInterface {
 
   Status ReadNBytes(int64 bytes_to_read, string* result) override;
 
+  Status SkipNBytes(int64 bytes_to_skip) override;
+
   int64 Tell() const override;
 
   Status Seek(int64 position) {
diff --git a/tensorflow/core/lib/io/record_reader.cc b/tensorflow/core/lib/io/record_reader.cc
index 403c82818ef3293a1dc027d362eb766906d0e94a..9cc6c4034f485c497747d102d7d731e5cd68a4d0 100644
--- a/tensorflow/core/lib/io/record_reader.cc
+++ b/tensorflow/core/lib/io/record_reader.cc
@@ -207,7 +207,7 @@ Status RecordReader::SkipNBytes(uint64 offset) {
     }
   }
   return Status::OK();
-}
+}  // namespace io
 
 SequentialRecordReader::SequentialRecordReader(
     RandomAccessFile* file, const RecordReaderOptions& options)
diff --git a/tensorflow/core/lib/io/recordio_test.cc b/tensorflow/core/lib/io/recordio_test.cc
index 507c26a63ff587809e80739f8d015d1adcc3b21d..b7e51256a22b0d84e734e2a036a184b3adc3e547 100644
--- a/tensorflow/core/lib/io/recordio_test.cc
+++ b/tensorflow/core/lib/io/recordio_test.cc
@@ -218,8 +218,8 @@ TEST_F(RecordioTest, RandomRead) {
 
 // Tests of all the error paths in log_reader.cc follow:
 static void AssertHasSubstr(StringPiece s, StringPiece expected) {
-  EXPECT_TRUE(StringPiece(s).contains(expected)) << s << " does not contain "
-                                                 << expected;
+  EXPECT_TRUE(StringPiece(s).contains(expected))
+      << s << " does not contain " << expected;
 }
 
 TEST_F(RecordioTest, ReadError) {
diff --git a/tensorflow/core/lib/png/png_io.cc b/tensorflow/core/lib/png/png_io.cc
index 354c819b090ce5e04047f13d2ff19441a499d770..77a3414442caa523ab7a92e3e63babf581030287 100644
--- a/tensorflow/core/lib/png/png_io.cc
+++ b/tensorflow/core/lib/png/png_io.cc
@@ -197,8 +197,8 @@ bool CommonInitDecode(StringPiece png_string, int desired_channels,
                       int desired_channel_bits, DecodeContext* context) {
   CHECK(desired_channel_bits == 8 || desired_channel_bits == 16)
       << "desired_channel_bits = " << desired_channel_bits;
-  CHECK(0 <= desired_channels && desired_channels <= 4) << "desired_channels = "
-                                                        << desired_channels;
+  CHECK(0 <= desired_channels && desired_channels <= 4)
+      << "desired_channels = " << desired_channels;
   context->error_condition = false;
   context->channels = desired_channels;
   context->png_ptr = png_create_read_struct(PNG_LIBPNG_VER_STRING, context,
diff --git a/tensorflow/core/lib/random/philox_random_test_utils.h b/tensorflow/core/lib/random/philox_random_test_utils.h
index f4bb087e107e10f90196a807c03ed2407d9d1ad6..6c29ae6b6a224d9c0369172bbf21af465ad53a19 100644
--- a/tensorflow/core/lib/random/philox_random_test_utils.h
+++ b/tensorflow/core/lib/random/philox_random_test_utils.h
@@ -35,8 +35,8 @@ void FillRandoms(PhiloxRandom gen, typename Distribution::ResultElementType* p,
                  int64 size) {
   const int granularity = Distribution::kResultElementCount;
 
-  CHECK(size % granularity == 0) << " size: " << size
-                                 << " granularity: " << granularity;
+  CHECK(size % granularity == 0)
+      << " size: " << size << " granularity: " << granularity;
 
   Distribution dist;
   for (int i = 0; i < size; i += granularity) {
diff --git a/tensorflow/core/lib/random/random_distributions.h b/tensorflow/core/lib/random/random_distributions.h
index 0e281403f8748ffbb7dbfac888cd2303c0a7253f..3fe1f9bc6cf06158df4811eaa177988b60890006 100644
--- a/tensorflow/core/lib/random/random_distributions.h
+++ b/tensorflow/core/lib/random/random_distributions.h
@@ -17,8 +17,8 @@ limitations under the License.
 #define TENSORFLOW_LIB_RANDOM_RANDOM_DISTRIBUTIONS_H_
 
 #define _USE_MATH_DEFINES
-#include <cmath>
 #include <math.h>
+#include <cmath>
 #undef _USE_MATH_DEFINES
 
 #include <string.h>
@@ -27,7 +27,6 @@ limitations under the License.
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/lib/random/philox_random.h"
 
-
 namespace tensorflow {
 namespace random {
 
diff --git a/tensorflow/core/lib/random/random_distributions_test.cc b/tensorflow/core/lib/random/random_distributions_test.cc
index 90d0dba4a7793f51472b2e5434489448eb40a498..85d68f456e1e27b7a62315f2b0a962843da87d52 100644
--- a/tensorflow/core/lib/random/random_distributions_test.cc
+++ b/tensorflow/core/lib/random/random_distributions_test.cc
@@ -45,8 +45,8 @@ void FillRandomsWithSingles(PhiloxRandom gen,
                             int64 size) {
   int granularity = Distribution::kResultElementCount;
 
-  CHECK(size % granularity == 0) << " size: " << size
-                                 << " granularity: " << granularity;
+  CHECK(size % granularity == 0)
+      << " size: " << size << " granularity: " << granularity;
 
   SingleSampleAdapter<PhiloxRandom> single_samples(&gen);
 
diff --git a/tensorflow/core/lib/strings/ordered_code.cc b/tensorflow/core/lib/strings/ordered_code.cc
index af9a15125948d8ed390e5873f3677527ebddea8e..ef90050b4f628ab65c1dd939ba358fec714c95b5 100644
--- a/tensorflow/core/lib/strings/ordered_code.cc
+++ b/tensorflow/core/lib/strings/ordered_code.cc
@@ -472,7 +472,8 @@ void OrderedCode::WriteSignedNumIncreasing(string* dest, int64 val) {
   // buf = val in network byte order, sign extended to 10 bytes
   const char sign_byte = val < 0 ? '\xff' : '\0';
   char buf[10] = {
-      sign_byte, sign_byte,
+      sign_byte,
+      sign_byte,
   };
   StoreBigEndian64(buf + 2, val);
   static_assert(sizeof(buf) == kMaxSigned64Length, "max length size mismatch");
diff --git a/tensorflow/core/lib/strings/strcat.h b/tensorflow/core/lib/strings/strcat.h
index 5835b0101d9ede219a71acf554c5928e4b624ce7..2bc14945cd0413751003c03c7f5255c300790321 100644
--- a/tensorflow/core/lib/strings/strcat.h
+++ b/tensorflow/core/lib/strings/strcat.h
@@ -126,7 +126,7 @@ class AlphaNum {
       : piece_(digits_, strlen(DoubleToBuffer(f, digits_))) {}
 
   AlphaNum(const Eigen::half &f);  // NOLINT(runtime/explicit)
-  AlphaNum(Hex hex);  // NOLINT(runtime/explicit)
+  AlphaNum(Hex hex);               // NOLINT(runtime/explicit)
 
   AlphaNum(const char *c_str) : piece_(c_str) {}   // NOLINT(runtime/explicit)
   AlphaNum(const StringPiece &pc) : piece_(pc) {}  // NOLINT(runtime/explicit)
diff --git a/tensorflow/core/ops/array_ops.cc b/tensorflow/core/ops/array_ops.cc
index 279a5876f962bb32b09a4b832794dfdcfffc6d46..267ce88440080399aae783903503f0bbd025d8b4 100644
--- a/tensorflow/core/ops/array_ops.cc
+++ b/tensorflow/core/ops/array_ops.cc
@@ -335,6 +335,13 @@ REGISTER_OP("Unpack")
       return Status::OK();
     });
 
+REGISTER_OP("UnravelIndex")
+    .Input("indices: Tidx")
+    .Input("dims: Tidx")
+    .Output("output: Tidx")
+    .Attr("Tidx: {int32, int64} = DT_INT32")
+    .SetShapeFn([](InferenceContext* c) { return Status::OK(); });
+
 // --------------------------------------------------------------------------
 // TODO(josh11b): Remove the >= 2 constraint, once we can rewrite the graph
 // in the N == 1 case to remove the node.
@@ -701,10 +708,11 @@ REGISTER_OP("MatrixDiagPart")
 // --------------------------------------------------------------------------
 REGISTER_OP("MatrixBandPart")
     .Input("input: T")
-    .Input("num_lower: int64")
-    .Input("num_upper: int64")
+    .Input("num_lower: Tindex")
+    .Input("num_upper: Tindex")
     .Output("band: T")
     .Attr("T: type")
+    .Attr("Tindex: {int32, int64} = DT_INT64")
     .SetShapeFn(shape_inference::UnchangedShape);
 
 // --------------------------------------------------------------------------
@@ -977,8 +985,8 @@ REGISTER_OP("GatherNd")
       if (c->Value(r_dim) > c->Rank(params)) {
         return errors::InvalidArgument(
             "indices.shape[-1] must be <= params.rank, but saw indices shape: ",
-            c->DebugString(indices), " and params shape: ",
-            c->DebugString(params));
+            c->DebugString(indices),
+            " and params shape: ", c->DebugString(params));
       }
 
       // Remove r_dim from indices to get output.
@@ -1252,12 +1260,12 @@ REGISTER_OP("ReverseSequence")
       // Validate batch_dim and seq_dim against input.
       const int32 input_rank = c->Rank(input);
       if (batch_dim >= input_rank) {
-        return errors::InvalidArgument("batch_dim must be < input rank: ",
-                                       batch_dim, " vs. ", input_rank);
+        return errors::InvalidArgument(
+            "batch_dim must be < input rank: ", batch_dim, " vs. ", input_rank);
       }
       if (seq_dim >= input_rank) {
-        return errors::InvalidArgument("seq_dim must be < input rank: ",
-                                       seq_dim, " vs. ", input_rank);
+        return errors::InvalidArgument(
+            "seq_dim must be < input rank: ", seq_dim, " vs. ", input_rank);
       }
 
       DimensionHandle batch_dim_dim = c->Dim(input, batch_dim);
@@ -2638,8 +2646,9 @@ Status ScatterNdShape(InferenceContext* c) {
       Status s = c->Merge(prefix_indices, prefix_updates, &unused);
       if (!s.ok()) {
         return errors::InvalidArgument(
-            "The outer ", outer_dims, " dimensions of indices.shape=",
-            c->DebugString(indices_shape), " must match the outer ", outer_dims,
+            "The outer ", outer_dims,
+            " dimensions of indices.shape=", c->DebugString(indices_shape),
+            " must match the outer ", outer_dims,
             " dimensions of updates.shape=", c->DebugString(updates_shape),
             ": ", s.error_message());
       }
diff --git a/tensorflow/core/ops/array_ops_test.cc b/tensorflow/core/ops/array_ops_test.cc
index a182fd1c475ad44dcd0f05d42a9cbd6eeab16469..86d64635f4c1bc1c34407a517267758ce5cf60fc 100644
--- a/tensorflow/core/ops/array_ops_test.cc
+++ b/tensorflow/core/ops/array_ops_test.cc
@@ -142,8 +142,13 @@ TEST(ArrayOpsTest, Const_ShapeFn) {
 
 TEST(ArrayOpsTest, UnchangedShapes_ShapeFn) {
   for (const char* op_name : {
-           "CheckNumerics", "Identity", "RefIdentity", "QuantizeAndDequantize",
-           "StopGradient", "ZerosLike", "OnesLike",
+           "CheckNumerics",
+           "Identity",
+           "RefIdentity",
+           "QuantizeAndDequantize",
+           "StopGradient",
+           "ZerosLike",
+           "OnesLike",
        }) {
     ShapeInferenceTestOp op(op_name);
     INFER_OK(op, "?", "in0");
diff --git a/tensorflow/core/ops/candidate_sampling_ops_test.cc b/tensorflow/core/ops/candidate_sampling_ops_test.cc
index c79b4439148e5795e313c71bbce35c82242cd335..f367371604097b7a500d746a3b8a8a5906082cbb 100644
--- a/tensorflow/core/ops/candidate_sampling_ops_test.cc
+++ b/tensorflow/core/ops/candidate_sampling_ops_test.cc
@@ -23,9 +23,12 @@ namespace tensorflow {
 
 TEST(CandidateSamplerOpsTest, CandidateSampler_ShapeFn) {
   for (const char* op_name : {
-           "AllCandidateSampler", "FixedUnigramCandidateSampler",
-           "LearnedUnigramCandidateSampler", "LogUniformCandidateSampler",
-           "ThreadUnsafeUnigramCandidateSampler", "UniformCandidateSampler",
+           "AllCandidateSampler",
+           "FixedUnigramCandidateSampler",
+           "LearnedUnigramCandidateSampler",
+           "LogUniformCandidateSampler",
+           "ThreadUnsafeUnigramCandidateSampler",
+           "UniformCandidateSampler",
        }) {
     ShapeInferenceTestOp op(op_name);
     TF_ASSERT_OK(NodeDefBuilder("test", op.name)
diff --git a/tensorflow/core/ops/compat/backwards_compatibility_test.cc b/tensorflow/core/ops/compat/backwards_compatibility_test.cc
index add05d6610ae62158b653d27699f61bc511ee3b6..6e05ae4be4fb967ac8dcc5a03fa548c7cb6c0f9b 100644
--- a/tensorflow/core/ops/compat/backwards_compatibility_test.cc
+++ b/tensorflow/core/ops/compat/backwards_compatibility_test.cc
@@ -25,8 +25,9 @@ namespace tensorflow {
 namespace {
 
 TEST(BackwardsCompatibilityTest, IsCompatible) {
-  OpCompatibilityLib compatibility(
-      "tensorflow/core/ops", strings::StrCat("v", TF_MAJOR_VERSION), nullptr);
+  OpCompatibilityLib compatibility("tensorflow/core/ops",
+                                   strings::StrCat("v", TF_MAJOR_VERSION),
+                                   nullptr);
 
   Env* env = Env::Default();
   int changed_ops = 0;
diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index 65ab81931ad4261f432034f73269d1e8c8005384..177561161ea2fc98ac7814ee147f0cf03225298e 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -17136,6 +17136,24 @@ op {
     type: DT_STRING
   }
 }
+op {
+  name: "EnqueueInQueueDataset"
+  input_arg {
+    name: "queue"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "components"
+    type_list_attr: "Tcomponents"
+  }
+  attr {
+    name: "Tcomponents"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
 op {
   name: "Enter"
   input_arg {
@@ -24840,6 +24858,42 @@ op {
     type: "type"
   }
 }
+op {
+  name: "MatrixBandPart"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "num_lower"
+    type_attr: "Tindex"
+  }
+  input_arg {
+    name: "num_upper"
+    type_attr: "Tindex"
+  }
+  output_arg {
+    name: "band"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tindex"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
 op {
   name: "MatrixDeterminant"
   input_arg {
@@ -32096,6 +32150,48 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "PrependFromQueueAndPaddedBatchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "batch_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "padded_shapes"
+    type: DT_INT64
+    number_attr: "N"
+  }
+  input_arg {
+    name: "padding_values"
+    type_list_attr: "Toutput_types"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "Toutput_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "PreventGradient"
   input_arg {
diff --git a/tensorflow/core/ops/data_flow_ops.cc b/tensorflow/core/ops/data_flow_ops.cc
index 12c27c79840de6981629984732147671b8a1e28e..4f946fb3ca7608816180351b7753d01f13d469f2 100644
--- a/tensorflow/core/ops/data_flow_ops.cc
+++ b/tensorflow/core/ops/data_flow_ops.cc
@@ -171,29 +171,10 @@ Status TwoElementVectorInputsAndScalarOutputs(InferenceContext* c) {
   return Status::OK();
 }
 
-Status ScalarAndTwoElementVectorInputsAndScalarOutputs(InferenceContext* c) {
-  ShapeHandle handle;
-  DimensionHandle unused_handle;
-  TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &handle));
-  for (int i = 1; i < c->num_inputs(); ++i) {
-    TF_RETURN_IF_ERROR(c->WithRank(c->input(i), 1, &handle));
-    TF_RETURN_IF_ERROR(c->WithValue(c->Dim(handle, 0), 2, &unused_handle));
-  }
-  for (int i = 0; i < c->num_outputs(); ++i) {
-    c->set_output(i, c->Scalar());
-  }
-  return Status::OK();
-}
-
 Status TwoElementOutput(InferenceContext* c) {
   c->set_output(0, c->Vector(2));
   return Status::OK();
 }
-
-Status ScalarOutput(InferenceContext* c) {
-  c->set_output(0, c->Scalar());
-  return Status::OK();
-}
 }  // namespace
 
 REGISTER_OP("RandomShuffleQueue")
diff --git a/tensorflow/core/ops/dataset_ops.cc b/tensorflow/core/ops/dataset_ops.cc
index 2cae814eab1602e72ffcfd100f9813f8f41c6ac9..3c8e9a8a5f2e1e0d1b26da7580fe2e5e0d1771dd 100644
--- a/tensorflow/core/ops/dataset_ops.cc
+++ b/tensorflow/core/ops/dataset_ops.cc
@@ -491,4 +491,29 @@ REGISTER_OP("StatsAggregatorSummary")
     .Output("summary: string")
     .SetShapeFn(shape_inference::ScalarShape);
 
+REGISTER_OP("PrependFromQueueAndPaddedBatchDataset")
+    .Input("input_dataset: variant")
+    .Input("batch_size: int64")
+    .Input("padded_shapes: N * int64")
+    .Input("padding_values: Toutput_types")
+    .Output("handle: variant")
+    .Attr("Toutput_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .Attr("N: int >= 1")
+    // TODO(ebrevdo): Validate that `padded_shapes` are all vectors, the lengths
+    // of `Toutput_types` and `output_shapes` are `N`, that the
+    // length of `output_types` is `N`, the `output_shapes` are
+    // (as far as possible to tell statically) compatible with `padded_shapes`,
+    // and that `padding_values` are all scalars.
+    .SetShapeFn(shape_inference::ScalarShape);
+
+REGISTER_OP("EnqueueInQueueDataset")
+    .Input("queue: variant")
+    .Input("components: Tcomponents")
+    .Attr("Tcomponents: list(type) >= 1")
+    .SetIsStateful()  // To avoid CSE on multiple calls to Enqueue.
+    // TODO(ebrevdo): SetShapeFn to test input dtypes and shapes by
+    // reading from queue handle (is that even possible?).
+    .SetShapeFn(shape_inference::NoOutputs);
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/ops/functional_grad.cc b/tensorflow/core/ops/functional_grad.cc
index 6df3536795ce7772faef72d63e0cb276719d7b44..eeccb72da65d7cef1073f54bf7f639436f69e930 100644
--- a/tensorflow/core/ops/functional_grad.cc
+++ b/tensorflow/core/ops/functional_grad.cc
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/framework/function.h"
 #include <vector>
+#include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/lib/core/errors.h"
 
 namespace tensorflow {
diff --git a/tensorflow/core/ops/image_ops.cc b/tensorflow/core/ops/image_ops.cc
index 7484ebb07808a7670d80a4bfdb590e85b94de04f..a62e2d782b8d542b98494cc42ccf6f86d295efd0 100644
--- a/tensorflow/core/ops/image_ops.cc
+++ b/tensorflow/core/ops/image_ops.cc
@@ -25,42 +25,6 @@ using shape_inference::ShapeHandle;
 
 namespace {
 
-const char kDecodeJpegCommonDocStr[] = R"doc(
-The attr `channels` indicates the desired number of color channels for the
-decoded image.
-
-Accepted values are:
-
-*   0: Use the number of channels in the JPEG-encoded image.
-*   1: output a grayscale image.
-*   3: output an RGB image.
-
-If needed, the JPEG-encoded image is transformed to match the requested number
-of color channels.
-
-The attr `ratio` allows downscaling the image by an integer factor during
-decoding.  Allowed values are: 1, 2, 4, and 8.  This is much faster than
-downscaling the image later.
-
-)doc";
-
-const char kDecodeJpegCommonParamsDocStr[] = R"doc(
-channels: Number of color channels for the decoded image.
-ratio: Downscaling ratio.
-fancy_upscaling: If true use a slower but nicer upscaling of the
-  chroma planes (yuv420/422 only).
-try_recover_truncated:  If true try to recover an image from truncated input.
-acceptable_fraction: The minimum required fraction of lines before a truncated
-  input is accepted.
-dct_method: string specifying a hint about the algorithm used for
-  decompression.  Defaults to "" which maps to a system-specific
-  default.  Currently valid values are ["INTEGER_FAST",
-  "INTEGER_ACCURATE"].  The hint may be ignored (e.g., the internal
-  jpeg library changes to a version that does not have that specific
-  option.)
-image: 3-D with shape `[height, width, channels]`..
-)doc";
-
 // Sets output[0] to shape [batch_dim,height,width,channel_dim], where
 // height and width come from the size_tensor.
 Status SetOutputToSizedImage(InferenceContext* c, DimensionHandle batch_dim,
@@ -622,6 +586,17 @@ REGISTER_OP("NonMaxSuppression")
     .Output("selected_indices: int32")
     .Attr("iou_threshold: float = 0.5")
     .SetShapeFn([](InferenceContext* c) {
+      // Get inputs and validate ranks.
+      ShapeHandle boxes;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 2, &boxes));
+      ShapeHandle scores;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &scores));
+      ShapeHandle max_output_size;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &max_output_size));
+      // The boxes is a 2-D float Tensor of shape [num_boxes, 4].
+      DimensionHandle unused;
+      TF_RETURN_IF_ERROR(c->WithValue(c->Dim(boxes, 1), 4, &unused));
+
       c->set_output(0, c->Vector(c->UnknownDim()));
       return Status::OK();
     });
@@ -633,6 +608,19 @@ REGISTER_OP("NonMaxSuppressionV2")
     .Input("iou_threshold: float")
     .Output("selected_indices: int32")
     .SetShapeFn([](InferenceContext* c) {
+      // Get inputs and validate ranks.
+      ShapeHandle boxes;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 2, &boxes));
+      ShapeHandle scores;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &scores));
+      ShapeHandle max_output_size;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &max_output_size));
+      ShapeHandle iou_threshold;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &iou_threshold));
+      // The boxes is a 2-D float Tensor of shape [num_boxes, 4].
+      DimensionHandle unused;
+      TF_RETURN_IF_ERROR(c->WithValue(c->Dim(boxes, 1), 4, &unused));
+
       c->set_output(0, c->Vector(c->UnknownDim()));
       return Status::OK();
     });
diff --git a/tensorflow/core/ops/lookup_ops.cc b/tensorflow/core/ops/lookup_ops.cc
index a67267418d608e7c824030225f906b010794a160..50ea8ad01a7189a246f4bd08403a529f3f5bd9a0 100644
--- a/tensorflow/core/ops/lookup_ops.cc
+++ b/tensorflow/core/ops/lookup_ops.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_def_builder.h"
 #include "tensorflow/core/framework/shape_inference.h"
@@ -102,6 +103,8 @@ REGISTER_OP("LookupTableFindV2")
       c->set_output(0, c->UnknownShape());
       return Status::OK();
     });
+WHITELIST_STATEFUL_OP_FOR_DATASET_FUNCTIONS("LookupTableFindV2");
+// TODO(b/72710477): Update this.
 
 REGISTER_OP("LookupTableInsert")
     .Input("table_handle: Ref(string)")
diff --git a/tensorflow/core/ops/manip_ops.cc b/tensorflow/core/ops/manip_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..95b4774fe6e230800e71d237c2cd027acf6e054b
--- /dev/null
+++ b/tensorflow/core/ops/manip_ops.cc
@@ -0,0 +1,33 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+namespace tensorflow {
+
+// --------------------------------------------------------------------------
+REGISTER_OP("Roll")
+    .Input("input: T")
+    .Input("shift: Tshift")
+    .Input("axis: Taxis")
+    .Output("output: T")
+    .Attr("T: type")
+    .Attr("Tshift: {int32,int64}")
+    .Attr("Taxis: {int32,int64}")
+    .SetShapeFn(shape_inference::UnchangedShape);
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc
index dd484c3ee752b47f4a196cd45c6e26984b5ef0bd..872ebe98c1f331ca882480282d3f8eecf4ce5f2d 100644
--- a/tensorflow/core/ops/math_ops.cc
+++ b/tensorflow/core/ops/math_ops.cc
@@ -1172,12 +1172,12 @@ Status RangeSize(const Tensor* start_t, const Tensor* limit_t,
   T limit = limit_t->scalar<T>()();
   T delta = delta_t->scalar<T>()();
   if (start > limit && delta > 0) {
-    return errors::InvalidArgument("Requires start <= limit when delta > 0: ",
-                                   start, "/", limit);
+    return errors::InvalidArgument(
+        "Requires start <= limit when delta > 0: ", start, "/", limit);
   }
   if (start < limit && delta < 0) {
-    return errors::InvalidArgument("Requires start >= limit when delta < 0: ",
-                                   start, "/", limit);
+    return errors::InvalidArgument(
+        "Requires start >= limit when delta < 0: ", start, "/", limit);
   }
   if (delta == 0) {
     return errors::InvalidArgument("Requires delta != 0");
diff --git a/tensorflow/core/ops/nn_ops.cc b/tensorflow/core/ops/nn_ops.cc
index 3f72b415699562a0d79fc1f41ff1b4a360bfc7db..67481fd202b3c3b35033b72e4c1c5fd294d98696 100644
--- a/tensorflow/core/ops/nn_ops.cc
+++ b/tensorflow/core/ops/nn_ops.cc
@@ -1155,9 +1155,9 @@ Status TopKShapeFn(InferenceContext* c) {
   DimensionHandle last_dim = c->Dim(input, -1);
   if (c->ValueKnown(last_dim) && c->ValueKnown(k_dim) &&
       c->Value(last_dim) < c->Value(k_dim)) {
-    return errors::InvalidArgument("input must have last dimension >= k = ",
-                                   c->Value(k_dim), " but is ",
-                                   c->Value(last_dim));
+    return errors::InvalidArgument(
+        "input must have last dimension >= k = ", c->Value(k_dim), " but is ",
+        c->Value(last_dim));
   }
 
   // Replace last_dim with k_dim.
@@ -1211,9 +1211,9 @@ REGISTER_OP("NthElement")
       DimensionHandle last_dim = c->Dim(input, -1);
       if (c->ValueKnown(last_dim) && c->ValueKnown(n_dim) &&
           c->Value(last_dim) <= c->Value(n_dim)) {
-        return errors::InvalidArgument("Input must have last dimension > n = ",
-                                       c->Value(n_dim), " but is ",
-                                       c->Value(last_dim));
+        return errors::InvalidArgument(
+            "Input must have last dimension > n = ", c->Value(n_dim),
+            " but is ", c->Value(last_dim));
       }
 
       // Reduce last_dim for output tensor
@@ -1818,7 +1818,7 @@ REGISTER_OP("_MklMaxPool")
     .Input("input: T")
     .Input("mkl_input: uint8")
     .Output("output: T")
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
     .Output("workspace: T")
 #else
     .Output("workspace: uint8")
@@ -1844,7 +1844,7 @@ REGISTER_OP("_MklMaxPoolGrad")
     .Input("orig_input: T")
     .Input("orig_output: T")
     .Input("grad: T")
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
     .Input("workspace: T")
 #else
     .Input("workspace: uint8")
@@ -1916,7 +1916,7 @@ REGISTER_OP("_MklLRN")
     .Input("input: T")
     .Input("mkl_input: uint8")
     .Output("output: T")
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
     .Output("workspace: T")
 #else
     .Output("workspace: uint8")
@@ -1944,7 +1944,7 @@ REGISTER_OP("_MklLRNGrad")
     .Input("input_grads: T")
     .Input("input_image: T")
     .Input("output_image: T")
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
     .Input("workspace: T")
 #else
     .Input("workspace: uint8")
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index b57206c9c4f53fbf73537f466206f5c1b0caefcb..2cd8d8a03bfb16dd98288d5a19dafc5bb1932e3e 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -7644,6 +7644,24 @@ op {
     type: DT_STRING
   }
 }
+op {
+  name: "EnqueueInQueueDataset"
+  input_arg {
+    name: "queue"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "components"
+    type_list_attr: "Tcomponents"
+  }
+  attr {
+    name: "Tcomponents"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
 op {
   name: "Enter"
   input_arg {
@@ -12330,11 +12348,11 @@ op {
   }
   input_arg {
     name: "num_lower"
-    type: DT_INT64
+    type_attr: "Tindex"
   }
   input_arg {
     name: "num_upper"
-    type: DT_INT64
+    type_attr: "Tindex"
   }
   output_arg {
     name: "band"
@@ -12344,6 +12362,19 @@ op {
     name: "T"
     type: "type"
   }
+  attr {
+    name: "Tindex"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
 }
 op {
   name: "MatrixDeterminant"
@@ -15926,6 +15957,48 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "PrependFromQueueAndPaddedBatchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "batch_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "padded_shapes"
+    type: DT_INT64
+    number_attr: "N"
+  }
+  input_arg {
+    name: "padding_values"
+    type_list_attr: "Toutput_types"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "Toutput_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "PreventGradient"
   input_arg {
diff --git a/tensorflow/core/ops/sdca_ops.cc b/tensorflow/core/ops/sdca_ops.cc
index e67d95fa8cb8466365bf12a46a123de174103d0f..4025070adb2b193edacdaf728f240961bf9d2530 100644
--- a/tensorflow/core/ops/sdca_ops.cc
+++ b/tensorflow/core/ops/sdca_ops.cc
@@ -19,8 +19,8 @@ limitations under the License.
 
 namespace tensorflow {
 
-using shape_inference::ShapeHandle;
 using shape_inference::InferenceContext;
+using shape_inference::ShapeHandle;
 
 // --------------------------------------------------------------------------
 static Status ApplySdcaOptimizerShapeFn(InferenceContext* c) {
diff --git a/tensorflow/core/ops/string_ops.cc b/tensorflow/core/ops/string_ops.cc
index 8beb28de0a2e32832b2db60eeb8272a88536e91f..e4c5bcfb540660a609aca013b795d566e69f54a8 100644
--- a/tensorflow/core/ops/string_ops.cc
+++ b/tensorflow/core/ops/string_ops.cc
@@ -137,9 +137,9 @@ REGISTER_OP("Substr")
         DimensionHandle pos_dim = c->Dim(pos_shape, i);
         DimensionHandle len_dim = c->Dim(len_shape, i);
         if (c->Value(pos_dim) != c->Value(len_dim)) {
-          return errors::InvalidArgument("pos and len shapes must match: ",
-                                         c->DebugString(pos_shape), " vs. ",
-                                         c->DebugString(len_shape));
+          return errors::InvalidArgument(
+              "pos and len shapes must match: ", c->DebugString(pos_shape),
+              " vs. ", c->DebugString(len_shape));
         }
       }
       // c->input(0) is the ShapeHandle to input strings
diff --git a/tensorflow/core/ops/training_ops.cc b/tensorflow/core/ops/training_ops.cc
index e8d03877c91402394567b05df8b738de1c15c8c6..6ce9595fb60b78525bde19515077f7245a219d39 100644
--- a/tensorflow/core/ops/training_ops.cc
+++ b/tensorflow/core/ops/training_ops.cc
@@ -22,48 +22,6 @@ using shape_inference::DimensionHandle;
 using shape_inference::InferenceContext;
 using shape_inference::ShapeHandle;
 
-const char kAddSignCommonDocStr[] = R"doc(
-Update '*var' according to the AddSign update.
-
-m_t <- beta1 * m_{t-1} + (1 - beta1) * g
-update <- (alpha + sign_decay * sign(g) *sign(m)) * g
-variable <- variable - lr_t * update
-
-var: Should be from a Variable().
-m: Should be from a Variable().
-lr: Scaling factor. Must be a scalar.
-sign_decay: Must be a scalar.
-alpha: Must be a scalar.
-beta: Must be a scalar.
-grad: The gradient.
-)doc";
-
-const char kPowerSignCommonDocStr[] = R"doc(
-Update '*var' according to the AddSign update.
-
-m_t <- beta1 * m_{t-1} + (1 - beta1) * g
-update <- exp(logbase * sign_decay * sign(g) * sign(m_t)) * g
-variable <- variable - lr_t * update
-
-var: Should be from a Variable().
-m: Should be from a Variable().
-lr: Scaling factor. Must be a scalar.
-logbase: Must be a scalar.
-sign_decay: Must be a scalar.
-beta: Must be a scalar.
-grad: The gradient.
-)doc";
-
-const char kOutDocStr[] = R"doc(
-out: Same as "var".
-)doc";
-
-const char kLockDocStr[] = R"doc(
-use_locking: If `True`, updating of the var and m tensors is
-  protected by a lock; otherwise the behavior is undefined, but may exhibit less
-  contention.
-)doc";
-
 static ShapeHandle ShapeOrHandleShape(InferenceContext* c, int input) {
   auto* handle_data = c->input_handle_shapes_and_types(input);
   if (handle_data != nullptr && !handle_data->empty() &&
diff --git a/tensorflow/core/ops/training_ops_test.cc b/tensorflow/core/ops/training_ops_test.cc
index de4e3cd9e70014ea9b29d4d473d94c0abb52eabc..0f309c1f4e956c98b6f20fa3b6c810116a2b339c 100644
--- a/tensorflow/core/ops/training_ops_test.cc
+++ b/tensorflow/core/ops/training_ops_test.cc
@@ -24,7 +24,7 @@ static void TestGradAndIndicesErrorHandling(const ShapeInferenceTestOp& op,
                                             string shape_spec_middle,
                                             const string& shape_spec_end = "") {
   auto shape_spec = [&shape_spec_middle, shape_spec_end](
-      const char* var_spec, const char* grad_indices_spec) {
+                        const char* var_spec, const char* grad_indices_spec) {
     return strings::StrCat(var_spec, ";", shape_spec_middle, ";",
                            grad_indices_spec, shape_spec_end);
   };
diff --git a/tensorflow/core/platform/cloud/BUILD b/tensorflow/core/platform/cloud/BUILD
index 07aecf848326b23b18b58ae60e896150ab7b4ef9..9ba25dea4fb278cbfaf4080e21beef8a3e9de769 100644
--- a/tensorflow/core/platform/cloud/BUILD
+++ b/tensorflow/core/platform/cloud/BUILD
@@ -57,6 +57,17 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "gcs_throttle",
+    srcs = ["gcs_throttle.cc"],
+    hdrs = ["gcs_throttle.h"],
+    copts = tf_copts(),
+    visibility = ["//tensorflow:__subpackages__"],
+    deps = [
+        "//tensorflow/core:lib",
+    ],
+)
+
 cc_library(
     name = "gcs_file_system",
     srcs = ["gcs_file_system.cc"],
@@ -69,6 +80,7 @@ cc_library(
         ":expiring_lru_cache",
         ":file_block_cache",
         ":gcs_dns_cache",
+        ":gcs_throttle",
         ":google_auth_provider",
         ":http_request",
         ":retrying_file_system",
@@ -271,6 +283,19 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "gcs_throttle_test",
+    size = "small",
+    srcs = ["gcs_throttle_test.cc"],
+    linkopts = if_windows(["-DEFAULTLIB:ws2_32.lib"]),
+    deps = [
+        ":gcs_throttle",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
 tf_cc_test(
     name = "curl_http_request_test",
     size = "small",
diff --git a/tensorflow/core/platform/cloud/file_block_cache.cc b/tensorflow/core/platform/cloud/file_block_cache.cc
index 0375af516b0504e8b527409ba22da0caa149ad9d..6add1142a15fb69044828bd82a6d6e838959de08 100644
--- a/tensorflow/core/platform/cloud/file_block_cache.cc
+++ b/tensorflow/core/platform/cloud/file_block_cache.cc
@@ -131,6 +131,7 @@ Status FileBlockCache::MaybeFetch(const Key& key,
         block->mu.lock();  // Reacquire the lock immediately afterwards
         if (status.ok()) {
           block->data.resize(bytes_transferred, 0);
+          block->data.shrink_to_fit();
           downloaded_block = true;
           block->state = FetchState::FINISHED;
         } else {
diff --git a/tensorflow/core/platform/cloud/gcs_dns_cache.cc b/tensorflow/core/platform/cloud/gcs_dns_cache.cc
index 2b0e55bf371da9660f1422cef97e3ec1a25a9b61..4d9aff4d24f06c7bd1269ad590c9687092a5b132 100644
--- a/tensorflow/core/platform/cloud/gcs_dns_cache.cc
+++ b/tensorflow/core/platform/cloud/gcs_dns_cache.cc
@@ -18,9 +18,9 @@ limitations under the License.
 #include <arpa/inet.h>
 #include <netdb.h>
 #else
+#include <Windows.h>
 #include <winsock2.h>
 #include <ws2tcpip.h>
-#include <Windows.h>
 #endif
 #include <sys/types.h>
 
diff --git a/tensorflow/core/platform/cloud/gcs_file_system.cc b/tensorflow/core/platform/cloud/gcs_file_system.cc
index 520720372d9ff12556110967d2c47703ec4b5132..01ca0d76bab2720513775ef33ff8670bd148c241 100644
--- a/tensorflow/core/platform/cloud/gcs_file_system.cc
+++ b/tensorflow/core/platform/cloud/gcs_file_system.cc
@@ -50,7 +50,6 @@ limitations under the License.
 #endif
 
 namespace tensorflow {
-
 namespace {
 
 constexpr char kGcsUriBase[] = "https://www.googleapis.com/storage/v1/";
@@ -59,9 +58,6 @@ constexpr char kGcsUploadUriBase[] =
 constexpr char kStorageHost[] = "storage.googleapis.com";
 constexpr size_t kReadAppendableFileBufferSize = 1024 * 1024;  // In bytes.
 constexpr int kGetChildrenDefaultPageSize = 1000;
-// Initial delay before retrying a GCS upload.
-// Subsequent delays can be larger due to exponential back-off.
-constexpr uint64 kUploadRetryDelayMicros = 1000000L;
 // The HTTP response code "308 Resume Incomplete".
 constexpr uint64 HTTP_CODE_RESUME_INCOMPLETE = 308;
 // The environment variable that overrides the size of the readahead buffer.
@@ -120,6 +116,15 @@ constexpr char kWriteRequestTimeout[] = "GCS_WRITE_REQUEST_TIMEOUT_SECS";
 // The environment variable to configure an additional header to send with
 // all requests to GCS (format HEADERNAME:HEADERCONTENT)
 constexpr char kAdditionalRequestHeader[] = "GCS_ADDITIONAL_REQUEST_HEADER";
+// The environment variable to configure the throttle (format: <int64>)
+constexpr char kThrottleRate[] = "GCS_THROTTLE_TOKEN_RATE";
+// The environment variable to configure the token bucket size (format: <int64>)
+constexpr char kThrottleBucket[] = "GCS_THROTTLE_BUCKET_SIZE";
+// The environment variable that controls the number of tokens per request.
+// (format: <int64>)
+constexpr char kTokensPerRequest[] = "GCS_TOKENS_PER_REQUEST";
+// The environment variable to configure the initial tokens (format: <int64>)
+constexpr char kInitialTokens[] = "GCS_INITIAL_TOKENS";
 
 // TODO: DO NOT use a hardcoded path
 Status GetTmpFilename(string* filename) {
@@ -725,6 +730,26 @@ GcsFileSystem::GcsFileSystem()
   if (GetEnvVar(kWriteRequestTimeout, strings::safe_strtou32, &timeout_value)) {
     timeouts_.write = timeout_value;
   }
+
+  int64 token_value;
+  if (GetEnvVar(kThrottleRate, strings::safe_strto64, &token_value)) {
+    GcsThrottleConfig config;
+    config.enabled = true;
+    config.token_rate = token_value;
+
+    if (GetEnvVar(kThrottleBucket, strings::safe_strto64, &token_value)) {
+      config.bucket_size = token_value;
+    }
+
+    if (GetEnvVar(kTokensPerRequest, strings::safe_strto64, &token_value)) {
+      config.tokens_per_request = token_value;
+    }
+
+    if (GetEnvVar(kInitialTokens, strings::safe_strto64, &token_value)) {
+      config.initial_tokens = token_value;
+    }
+    throttle_.SetConfig(config);
+  }
 }
 
 GcsFileSystem::GcsFileSystem(
@@ -778,7 +803,9 @@ Status GcsFileSystem::LoadBufferFromGCS(const string& filename, size_t offset,
   TF_RETURN_IF_ERROR(ParseGcsPath(filename, false, &bucket, &object));
 
   std::unique_ptr<HttpRequest> request;
-  TF_RETURN_IF_ERROR(CreateHttpRequest(&request));
+  TF_RETURN_WITH_CONTEXT_IF_ERROR(CreateHttpRequest(&request),
+                                  "when reading gs://", bucket, "/", object);
+
   request->SetUri(strings::StrCat("https://", kStorageHost, "/", bucket, "/",
                                   request->EscapeString(object)));
   request->SetRange(offset, offset + n - 1);
@@ -793,6 +820,8 @@ Status GcsFileSystem::LoadBufferFromGCS(const string& filename, size_t offset,
   VLOG(1) << "Successful read of gs://" << bucket << "/" << object << " @ "
           << offset << " of size: " << bytes_read;
 
+  throttle_.RecordResponse(bytes_read);
+
   if (bytes_read < block_size()) {
     // Check stat cache to see if we encountered an interrupted read.
     FileStatistics stat;
@@ -930,41 +959,43 @@ Status GcsFileSystem::StatForObject(const string& fname, const string& bucket,
         "'object' must be a non-empty string. (File: %s)", fname.c_str()));
   }
 
-  StatCache::ComputeFunc compute_func =
-      [this, &bucket, &object](const string& fname, FileStatistics* stat) {
-        std::vector<char> output_buffer;
-        std::unique_ptr<HttpRequest> request;
-        TF_RETURN_IF_ERROR(CreateHttpRequest(&request));
-        request->SetUri(strings::StrCat(kGcsUriBase, "b/", bucket, "/o/",
-                                        request->EscapeString(object),
-                                        "?fields=size%2Cupdated"));
-        request->SetResultBuffer(&output_buffer);
-        request->SetTimeouts(timeouts_.connect, timeouts_.idle,
-                             timeouts_.metadata);
+  StatCache::ComputeFunc compute_func = [this, &bucket, &object](
+                                            const string& fname,
+                                            FileStatistics* stat) {
+    std::vector<char> output_buffer;
+    std::unique_ptr<HttpRequest> request;
+    TF_RETURN_WITH_CONTEXT_IF_ERROR(CreateHttpRequest(&request),
+                                    " when reading metadata of gs://", bucket,
+                                    "/", object);
 
-        TF_RETURN_WITH_CONTEXT_IF_ERROR(request->Send(),
-                                        " when reading metadata of gs://",
-                                        bucket, "/", object);
+    request->SetUri(strings::StrCat(kGcsUriBase, "b/", bucket, "/o/",
+                                    request->EscapeString(object),
+                                    "?fields=size%2Cupdated"));
+    request->SetResultBuffer(&output_buffer);
+    request->SetTimeouts(timeouts_.connect, timeouts_.idle, timeouts_.metadata);
 
-        Json::Value root;
-        TF_RETURN_IF_ERROR(ParseJson(output_buffer, &root));
+    TF_RETURN_WITH_CONTEXT_IF_ERROR(request->Send(),
+                                    " when reading metadata of gs://", bucket,
+                                    "/", object);
 
-        // Parse file size.
-        TF_RETURN_IF_ERROR(GetInt64Value(root, "size", &stat->length));
+    Json::Value root;
+    TF_RETURN_IF_ERROR(ParseJson(output_buffer, &root));
 
-        // Parse file modification time.
-        string updated;
-        TF_RETURN_IF_ERROR(GetStringValue(root, "updated", &updated));
-        TF_RETURN_IF_ERROR(ParseRfc3339Time(updated, &(stat->mtime_nsec)));
+    // Parse file size.
+    TF_RETURN_IF_ERROR(GetInt64Value(root, "size", &stat->length));
 
-        VLOG(1) << "Stat of: gs://" << bucket << "/" << object << " -- "
-                << " length: " << stat->length
-                << "; mtime_nsec: " << stat->mtime_nsec
-                << "; updated: " << updated;
+    // Parse file modification time.
+    string updated;
+    TF_RETURN_IF_ERROR(GetStringValue(root, "updated", &updated));
+    TF_RETURN_IF_ERROR(ParseRfc3339Time(updated, &(stat->mtime_nsec)));
 
-        stat->is_directory = false;
-        return Status::OK();
-      };
+    VLOG(1) << "Stat of: gs://" << bucket << "/" << object << " -- "
+            << " length: " << stat->length
+            << "; mtime_nsec: " << stat->mtime_nsec << "; updated: " << updated;
+
+    stat->is_directory = false;
+    return Status::OK();
+  };
 
   TF_RETURN_IF_ERROR(stat_cache_->LookupOrCompute(fname, stat, compute_func));
   if (stat->is_directory) {
@@ -1442,6 +1473,10 @@ Status GcsFileSystem::CreateHttpRequest(std::unique_ptr<HttpRequest>* request) {
                            additional_header_->second);
   }
 
+  if (!throttle_.AdmitRequest()) {
+    return errors::Unavailable("Request throttled");
+  }
+
   *request = std::move(new_request);
   return Status::OK();
 }
diff --git a/tensorflow/core/platform/cloud/gcs_file_system.h b/tensorflow/core/platform/cloud/gcs_file_system.h
index 2eae39608e38184450290e86bc12d81494bb8302..e8edde8a445aad4c0310394d89480dc6ae445dfa 100644
--- a/tensorflow/core/platform/cloud/gcs_file_system.h
+++ b/tensorflow/core/platform/cloud/gcs_file_system.h
@@ -25,6 +25,7 @@ limitations under the License.
 #include "tensorflow/core/platform/cloud/expiring_lru_cache.h"
 #include "tensorflow/core/platform/cloud/file_block_cache.h"
 #include "tensorflow/core/platform/cloud/gcs_dns_cache.h"
+#include "tensorflow/core/platform/cloud/gcs_throttle.h"
 #include "tensorflow/core/platform/cloud/http_request.h"
 #include "tensorflow/core/platform/cloud/retrying_file_system.h"
 #include "tensorflow/core/platform/file_system.h"
@@ -194,6 +195,7 @@ class GcsFileSystem : public FileSystem {
   std::unique_ptr<HttpRequest::Factory> http_request_factory_;
   std::unique_ptr<FileBlockCache> file_block_cache_;
   std::unique_ptr<GcsDnsCache> dns_cache_;
+  GcsThrottle throttle_;
 
   using StatCache = ExpiringLRUCache<FileStatistics>;
   std::unique_ptr<StatCache> stat_cache_;
diff --git a/tensorflow/core/platform/cloud/gcs_throttle.cc b/tensorflow/core/platform/cloud/gcs_throttle.cc
new file mode 100644
index 0000000000000000000000000000000000000000..eb5f8958a37f45aeac1a836ca037f91931bb34a6
--- /dev/null
+++ b/tensorflow/core/platform/cloud/gcs_throttle.cc
@@ -0,0 +1,62 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/platform/cloud/gcs_throttle.h"
+
+#include <algorithm>
+
+namespace tensorflow {
+
+GcsThrottle::GcsThrottle(EnvTime* env_time)
+    : last_updated_secs_(env_time->NowSeconds()),
+      available_tokens_(0),
+      env_time_(env_time) {}
+
+bool GcsThrottle::AdmitRequest() {
+  mutex_lock l(mu_);
+  if (!config_.enabled) return true;
+  UpdateState();
+  if (available_tokens_ < config_.tokens_per_request) {
+    return false;
+  }
+  available_tokens_ -= config_.tokens_per_request;
+  return true;
+}
+
+void GcsThrottle::RecordResponse(size_t num_bytes) {
+  mutex_lock l(mu_);
+  if (!config_.enabled) return;
+  UpdateState();
+  available_tokens_ -= request_bytes_to_tokens(num_bytes);
+}
+
+void GcsThrottle::SetConfig(GcsThrottleConfig config) {
+  mutex_lock l(mu_);
+  config_ = config;
+  available_tokens_ = config.initial_tokens;
+  last_updated_secs_ = env_time_->NowSeconds();
+}
+
+void GcsThrottle::UpdateState() {
+  // TODO(b/72643279): Switch to a monotonic clock.
+  int64 now = env_time_->NowSeconds();
+  uint64 delta_secs =
+      std::max(0LL, now - static_cast<int64>(last_updated_secs_));
+  available_tokens_ += delta_secs * config_.token_rate;
+  available_tokens_ = std::min(available_tokens_, config_.bucket_size);
+  last_updated_secs_ = now;
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/platform/cloud/gcs_throttle.h b/tensorflow/core/platform/cloud/gcs_throttle.h
new file mode 100644
index 0000000000000000000000000000000000000000..1a89daef084e921f1ad8bd856cefcc62d0d7aa1c
--- /dev/null
+++ b/tensorflow/core/platform/cloud/gcs_throttle.h
@@ -0,0 +1,156 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_PLATFORM_CLOUD_GCS_THROTTLE_H_
+#define TENSORFLOW_CORE_PLATFORM_CLOUD_GCS_THROTTLE_H_
+
+#include "tensorflow/core/platform/env.h"
+
+namespace tensorflow {
+
+/**
+ * GcsThrottleConfig is used to configure the GcsThrottle.
+ */
+struct GcsThrottleConfig {
+  /**
+   * enabled is true if GcsThrottle should throttle requests, false otherwise.
+   */
+  bool enabled = false;
+
+  /**
+   * token_rate is the number of tokens accrued every second that can be used
+   * for making requests to the GCS service.
+   */
+  int64 token_rate = 100000;  // Approximately 800 MBits/second bandwidth-only.
+
+  /**
+   * bucket_size is the maximum number of available tokens the GcsThrottle can
+   * accrue.
+   */
+  int64 bucket_size = 10000000;  // 10 million tokens total
+
+  /**
+   * tokens_per_request determines the number of tokens consumed for every
+   * request.
+   *
+   * Note: tokens are also consumed in proportion to the response size.
+   */
+  int64 tokens_per_request = 100;
+
+  /**
+   * initial_tokens determines how many tokens should be available immediately
+   * after the GcsThrottle is constructed.
+   */
+  int64 initial_tokens = 0;
+};
+
+/**
+ * GcsThrottle is used to ensure fair use of the available GCS capacity.
+ *
+ * GcsThrottle operates around a concept of tokens. Tokens are consumed when
+ * making requests to the GCS service. Tokens are consumed both based on the
+ * number of requests made, as well as the bandwidth consumed (response sizes).
+ *
+ * GcsThrottle is thread safe and can be used from multiple threads.
+ */
+class GcsThrottle {
+ public:
+  /**
+   * Constructs a GcsThrottle.
+   */
+  explicit GcsThrottle(EnvTime* env_time = EnvTime::Default());
+
+  /**
+   * AdmitRequest updates the GcsThrottle to record a request will be made.
+   *
+   * AdmitRequest should be called before any request is made. AdmitRequest
+   * returns false if the request should be denied. If AdmitRequest
+   * returns false, no tokens are consumed. If true is returned, the configured
+   * number of tokens are consumed.
+   */
+  bool AdmitRequest();
+
+  /**
+   * RecordResponse updates the GcsThrottle to record a request has been made.
+   *
+   * RecordResponse should be called after the response has been received.
+   * RecordResponse will update the internal state based on the number of bytes
+   * in the response.
+   *
+   * Note: we split up the request and the response in this fashion in order to
+   * avoid penalizing consumers who are using large readahead buffers at higher
+   * layers of the I/O stack.
+   */
+  void RecordResponse(size_t num_bytes);
+
+  /**
+   * SetConfig sets the configuration for GcsThrottle and re-initializes state.
+   *
+   * After calling this, the token pool will be config.initial_tokens.
+   */
+  void SetConfig(GcsThrottleConfig config);
+
+  /**
+   * available_tokens gives a snapshot of how many tokens are available.
+   *
+   * The returned value should not be used to make admission decisions. The
+   * purpose of this function is to make available to monitoring or other
+   * instrumentation the number of available tokens in the pool.
+   */
+  inline int64 available_tokens() {
+    mutex_lock l(mu_);
+    if (!config_.enabled) return 0;
+    UpdateState();
+    return available_tokens_;
+  }
+
+ private:
+  /**
+   * UpdateState updates the available_tokens_ and last_updated_secs_ variables.
+   *
+   * UpdateState should be called in order to mark the passage of time, and
+   * therefore add tokens to the availble_tokens_ pool.
+   */
+  void UpdateState() EXCLUSIVE_LOCKS_REQUIRED(mu_);
+
+  inline uint64 request_bytes_to_tokens(size_t num_bytes) {
+    return num_bytes >> 10;
+  }
+
+  mutex mu_;
+
+  /**
+   * last_updated_secs_ records the number of seconds since the Unix epoch that
+   * the internal state of the GcsThrottle was updated. This is important when
+   * determining the number of tokens to add to the available_tokens_ pool.
+   */
+  uint64 last_updated_secs_ GUARDED_BY(mu_) = 0;
+
+  /**
+   * available_tokens_ records how many tokens are available to be consumed.
+   *
+   * Note: it is possible for available_tokens_ to become negative. If a
+   * response comes back that consumes more than the available tokens, the count
+   * will go negative, and block future requests until we have available tokens.
+   */
+  int64 available_tokens_ GUARDED_BY(mu_) = 0;
+
+  EnvTime* const env_time_;
+  GcsThrottleConfig config_ GUARDED_BY(mu_);
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_PLATFORM_CLOUD_GCS_THROTTLE_H_
diff --git a/tensorflow/core/platform/cloud/gcs_throttle_test.cc b/tensorflow/core/platform/cloud/gcs_throttle_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..694756022e37263a07f8215bf7496c9ca130fd58
--- /dev/null
+++ b/tensorflow/core/platform/cloud/gcs_throttle_test.cc
@@ -0,0 +1,101 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/platform/cloud/gcs_throttle.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+
+namespace {
+
+class TestTime : public EnvTime {
+ public:
+  uint64 NowMicros() override { return now_; }
+
+  void SetTime(uint64 now_micros) { now_ = now_micros; }
+
+  void AdvanceSeconds(int64 secs) { now_ += secs * 1000000L; }
+
+ private:
+  uint64 now_ = 1234567890000000ULL;
+};
+
+class GcsThrottleTest : public ::testing::Test {
+ protected:
+  GcsThrottleTest() : throttle_(&time_) {
+    config_.enabled = true;
+    throttle_.SetConfig(config_);
+  }
+
+  GcsThrottleConfig config_;
+  TestTime time_;
+  GcsThrottle throttle_;
+};
+
+TEST_F(GcsThrottleTest, ReplenishTokens) {
+  EXPECT_EQ(0, throttle_.available_tokens());
+  time_.AdvanceSeconds(1);
+  EXPECT_EQ(100000, throttle_.available_tokens());
+  time_.AdvanceSeconds(2);
+  EXPECT_EQ(300000, throttle_.available_tokens());
+}
+
+TEST_F(GcsThrottleTest, RejectRequest) {
+  EXPECT_EQ(0, throttle_.available_tokens());
+  time_.AdvanceSeconds(1);
+  EXPECT_TRUE(throttle_.AdmitRequest());
+  EXPECT_EQ(99900, throttle_.available_tokens());
+  for (int i = 1; i < 1000; i++) {
+    EXPECT_TRUE(throttle_.AdmitRequest());
+  }
+  EXPECT_FALSE(throttle_.AdmitRequest());
+}
+
+TEST_F(GcsThrottleTest, MarkResponses) {
+  time_.AdvanceSeconds(1);
+  EXPECT_TRUE(throttle_.AdmitRequest());
+  throttle_.RecordResponse(128000000);  // 128 MB response
+  EXPECT_EQ(-25100, throttle_.available_tokens());
+  EXPECT_FALSE(throttle_.AdmitRequest());
+  time_.AdvanceSeconds(1);
+  EXPECT_TRUE(throttle_.AdmitRequest())
+      << "Available tokens: " << throttle_.available_tokens();
+}
+
+TEST_F(GcsThrottleTest, Skippingtime_) {
+  EXPECT_EQ(0, throttle_.available_tokens());
+  time_.AdvanceSeconds(90);
+  EXPECT_EQ(9000000, throttle_.available_tokens());
+}
+
+TEST_F(GcsThrottleTest, BucketLimit) {
+  time_.AdvanceSeconds(120);
+  EXPECT_EQ(10000000, throttle_.available_tokens());
+}
+
+TEST_F(GcsThrottleTest, ReverseTime) {
+  time_.AdvanceSeconds(1);
+  EXPECT_EQ(100000, throttle_.available_tokens());
+  time_.AdvanceSeconds(-3600);
+  EXPECT_EQ(100000, throttle_.available_tokens());
+  time_.AdvanceSeconds(1);
+  EXPECT_EQ(200000, throttle_.available_tokens());
+}
+
+}  // namespace
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/platform/cloud/http_request_fake.h b/tensorflow/core/platform/cloud/http_request_fake.h
index 682b97f6ec6d697bef2ef6301a39be35c95c5861..7711eaceb290fb21c54c9656c473d912ebbd84cf 100644
--- a/tensorflow/core/platform/cloud/http_request_fake.h
+++ b/tensorflow/core/platform/cloud/http_request_fake.h
@@ -38,8 +38,7 @@ class FakeHttpRequest : public CurlHttpRequest {
  public:
   /// Return the response for the given request.
   FakeHttpRequest(const string& request, const string& response)
-      : FakeHttpRequest(request, response, Status::OK(), nullptr, {}, 200) {
-  }
+      : FakeHttpRequest(request, response, Status::OK(), nullptr, {}, 200) {}
 
   /// Return the response with headers for the given request.
   FakeHttpRequest(const string& request, const string& response,
diff --git a/tensorflow/core/platform/cloud/oauth_client_test.cc b/tensorflow/core/platform/cloud/oauth_client_test.cc
index 236259dbc16ffc806779bd100e1ec6ace2b7bb39..ad569758cc6ec11555a81a3bc7fbefbc580d6529 100644
--- a/tensorflow/core/platform/cloud/oauth_client_test.cc
+++ b/tensorflow/core/platform/cloud/oauth_client_test.cc
@@ -160,12 +160,12 @@ TEST(OAuthClientTest, GetTokenFromServiceAccountJson) {
   ASSERT_EQ(1, EVP_DigestVerifyInit(md_ctx, nullptr, md, nullptr, key));
   ASSERT_EQ(1, EVP_DigestVerifyUpdate(md_ctx, header_dot_claim.c_str(),
                                       header_dot_claim.size()));
-  ASSERT_EQ(
-      1,
-      EVP_DigestVerifyFinal(
-          md_ctx, const_cast<unsigned char*>(
-                      reinterpret_cast<const unsigned char*>(signature.data())),
-          signature.size()));
+  ASSERT_EQ(1,
+            EVP_DigestVerifyFinal(
+                md_ctx,
+                const_cast<unsigned char*>(
+                    reinterpret_cast<const unsigned char*>(signature.data())),
+                signature.size()));
   EVP_MD_CTX_cleanup(md_ctx);
 
   // Free all the crypto-related resources.
diff --git a/tensorflow/core/platform/cloud/retrying_file_system.cc b/tensorflow/core/platform/cloud/retrying_file_system.cc
index c3b6831361305f69e8a9882dbff90ce139ca13c0..be9ebe67b18e7be76e95149258cb1fcce6047d85 100644
--- a/tensorflow/core/platform/cloud/retrying_file_system.cc
+++ b/tensorflow/core/platform/cloud/retrying_file_system.cc
@@ -25,7 +25,6 @@ namespace tensorflow {
 
 namespace {
 
-
 class RetryingRandomAccessFile : public RandomAccessFile {
  public:
   RetryingRandomAccessFile(std::unique_ptr<RandomAccessFile> base_file,
@@ -203,4 +202,6 @@ Status RetryingFileSystem::DeleteRecursively(const string& dirname,
       initial_delay_microseconds_);
 }
 
+void RetryingFileSystem::FlushCaches() { base_file_system_->FlushCaches(); }
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/cloud/retrying_file_system.h b/tensorflow/core/platform/cloud/retrying_file_system.h
index d9d8ea6b004c3cf1d0d77ff65fa415e746310afd..a262a5fd940f9b269721790c80caaef38d79d690 100644
--- a/tensorflow/core/platform/cloud/retrying_file_system.h
+++ b/tensorflow/core/platform/cloud/retrying_file_system.h
@@ -69,6 +69,8 @@ class RetryingFileSystem : public FileSystem {
   Status DeleteRecursively(const string& dirname, int64* undeleted_files,
                            int64* undeleted_dirs) override;
 
+  void FlushCaches() override;
+
  private:
   std::unique_ptr<FileSystem> base_file_system_;
   const int64 initial_delay_microseconds_;
diff --git a/tensorflow/core/platform/cloud/retrying_file_system_test.cc b/tensorflow/core/platform/cloud/retrying_file_system_test.cc
index 232dcb3e71aa7c5b05b45e37332fe58970fc3fe8..d3f763bb3c845436e8458135a0a754d8cb002957 100644
--- a/tensorflow/core/platform/cloud/retrying_file_system_test.cc
+++ b/tensorflow/core/platform/cloud/retrying_file_system_test.cc
@@ -84,7 +84,8 @@ class MockWritableFile : public WritableFile {
 
 class MockFileSystem : public FileSystem {
  public:
-  explicit MockFileSystem(const ExpectedCalls& calls) : calls_(calls) {}
+  explicit MockFileSystem(const ExpectedCalls& calls, bool* flushed = nullptr)
+      : calls_(calls), flushed_(flushed) {}
 
   Status NewRandomAccessFile(
       const string& fname, std::unique_ptr<RandomAccessFile>* result) override {
@@ -156,11 +157,18 @@ class MockFileSystem : public FileSystem {
     return calls_.ConsumeNextCall("DeleteRecursively");
   }
 
+  void FlushCaches() override {
+    if (flushed_) {
+      *flushed_ = true;
+    }
+  }
+
   std::unique_ptr<WritableFile> writable_file_to_return;
   std::unique_ptr<RandomAccessFile> random_access_file_to_return;
 
  private:
   MockCallSequence calls_;
+  bool* flushed_ = nullptr;
 };
 
 TEST(RetryingFileSystemTest, NewRandomAccessFile_ImmediateSuccess) {
@@ -702,5 +710,14 @@ TEST(RetryingFileSystemTest, DeleteRecursively_AllRetriesFailed) {
       << status;
 }
 
+TEST(RetryingFileSystemTest, FlushCaches) {
+  ExpectedCalls none;
+  bool flushed = false;
+  std::unique_ptr<MockFileSystem> base_fs(new MockFileSystem(none, &flushed));
+  RetryingFileSystem fs(std::move(base_fs), 0);
+  fs.FlushCaches();
+  EXPECT_TRUE(flushed);
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/cpu_feature_guard.cc b/tensorflow/core/platform/cpu_feature_guard.cc
index b0d7b3a67ae9f92d8e321978a3b899c243c22d1d..7caf9d4db647b4bc8c5a19e1074aed6f98ca304e 100644
--- a/tensorflow/core/platform/cpu_feature_guard.cc
+++ b/tensorflow/core/platform/cpu_feature_guard.cc
@@ -97,14 +97,17 @@ std::once_flag g_cpu_feature_guard_warn_once_flag;
 void InfoAboutUnusedCPUFeatures() {
   std::call_once(g_cpu_feature_guard_warn_once_flag, [] {
     string missing_instructions;
-#ifdef PLATFORM_WINDOWS
+#if defined(_MSC_VER) && !defined(__clang__)
+
 #ifndef __AVX__
     CheckIfFeatureUnused(CPUFeature::AVX, "AVX", missing_instructions);
 #endif  // __AVX__
 #ifndef __AVX2__
     CheckIfFeatureUnused(CPUFeature::AVX2, "AVX2", missing_instructions);
 #endif  // __AVX2__
-#else   // ifdef platform windows
+
+#else   // if defined(_MSC_VER) && !defined(__clang__)
+
 #ifndef __SSE__
     CheckIfFeatureUnused(CPUFeature::SSE, "SSE", missing_instructions);
 #endif  // __SSE__
@@ -132,7 +135,7 @@ void InfoAboutUnusedCPUFeatures() {
 #ifndef __FMA__
     CheckIfFeatureUnused(CPUFeature::FMA, "FMA", missing_instructions);
 #endif  // __FMA__
-#endif  // else of ifdef platform windows
+#endif  // else of if defined(_MSC_VER) && !defined(__clang__)
     if (!missing_instructions.empty()) {
       LOG(INFO) << "Your CPU supports instructions that this TensorFlow "
                 << "binary was not compiled to use:" << missing_instructions;
diff --git a/tensorflow/core/platform/cuda_libdevice_path_test.cc b/tensorflow/core/platform/cuda_libdevice_path_test.cc
index 639f6804ea236b86f458263091f371c1374e50ae..2d34239a9958d722a1cb84213657ca8229ebaf2c 100644
--- a/tensorflow/core/platform/cuda_libdevice_path_test.cc
+++ b/tensorflow/core/platform/cuda_libdevice_path_test.cc
@@ -27,8 +27,7 @@ TEST(CudaLibdevicePathTest, LibdevicePath) {
   VLOG(2) << "Libdevice root = " << LibdeviceRoot();
   std::vector<string> libdevice_files;
   TF_EXPECT_OK(Env::Default()->GetMatchingPaths(
-      io::JoinPath(LibdeviceRoot(), "libdevice.*.bc"),
-      &libdevice_files));
+      io::JoinPath(LibdeviceRoot(), "libdevice.*.bc"), &libdevice_files));
   EXPECT_LT(0, libdevice_files.size());
 }
 #endif
diff --git a/tensorflow/core/platform/default/device_tracer.cc b/tensorflow/core/platform/default/device_tracer.cc
index f4b0f16393d70521386ad49fbf010591e5afb08c..8e60a7f0910ff9cf77a33f9d72d680ec42847777 100644
--- a/tensorflow/core/platform/default/device_tracer.cc
+++ b/tensorflow/core/platform/default/device_tracer.cc
@@ -579,8 +579,10 @@ Status DeviceTracerImpl::Collect(StepStatsCollector *collector) {
   // TODO(pbar) Handle device IDs and prefix properly.
   const string prefix = "";
   const int id = 0;
-  const string stream_device = strings::StrCat(prefix, "/device:GPU:", id, "/stream:");
-  const string memcpy_device = strings::StrCat(prefix, "/device:GPU:", id, "/memcpy");
+  const string stream_device =
+      strings::StrCat(prefix, "/device:GPU:", id, "/stream:");
+  const string memcpy_device =
+      strings::StrCat(prefix, "/device:GPU:", id, "/memcpy");
 
   mutex_lock l2(trace_mu_);
   for (const auto &rec : kernel_records_) {
diff --git a/tensorflow/core/platform/default/logging.cc b/tensorflow/core/platform/default/logging.cc
index 82bd69f9ca46eb1b8dd586d18ed852a2e8c5084e..2b874da1981bed396330ca3c526d82779046bdf2 100644
--- a/tensorflow/core/platform/default/logging.cc
+++ b/tensorflow/core/platform/default/logging.cc
@@ -83,15 +83,14 @@ void LogMessage::GenerateLogMessage() {
   const size_t time_buffer_size = 30;
   char time_buffer[time_buffer_size];
   strftime(time_buffer, time_buffer_size, "%Y-%m-%d %H:%M:%S",
-	   localtime(&now_seconds));
+           localtime(&now_seconds));
 
   // TODO(jeff,sanjay): Replace this with something that logs through the env.
   fprintf(stderr, "%s.%06d: %c %s:%d] %s\n", time_buffer, micros_remainder,
-	  "IWEF"[severity_], fname_, line_, str().c_str());
+          "IWEF"[severity_], fname_, line_, str().c_str());
 }
 #endif
 
-
 namespace {
 
 // Parse log level (int64) from environment variable (char*)
diff --git a/tensorflow/core/platform/default/logging.h b/tensorflow/core/platform/default/logging.h
index 40c260f236613e533e30dc006e77b02f393bdd48..f0efa31d5576393e9d9bba6e39a454b2a33cddc3 100644
--- a/tensorflow/core/platform/default/logging.h
+++ b/tensorflow/core/platform/default/logging.h
@@ -19,8 +19,8 @@ limitations under the License.
 // IWYU pragma: private, include "third_party/tensorflow/core/platform/logging.h"
 // IWYU pragma: friend third_party/tensorflow/core/platform/logging.h
 
-#include <sstream>
 #include <limits>
+#include <sstream>
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -205,16 +205,18 @@ string* MakeCheckOpString(const T1& v1, const T2& v2, const char* exprtext) {
   inline string* name##Impl(int v1, int v2, const char* exprtext) {       \
     return name##Impl<int, int>(v1, v2, exprtext);                        \
   }                                                                       \
-  inline string* name##Impl(const size_t v1, const int v2, const char* exprtext) {       \
+  inline string* name##Impl(const size_t v1, const int v2,                \
+                            const char* exprtext) {                       \
     if (TF_PREDICT_FALSE(v2 < 0)) {                                       \
-       return ::tensorflow::internal::MakeCheckOpString(v1, v2, exprtext);\
+      return ::tensorflow::internal::MakeCheckOpString(v1, v2, exprtext); \
     }                                                                     \
     const size_t uval = (size_t)((unsigned)v1);                           \
     return name##Impl<size_t, size_t>(uval, v2, exprtext);                \
   }                                                                       \
-  inline string* name##Impl(const int v1, const size_t v2, const char* exprtext) {       \
-    if (TF_PREDICT_FALSE(v2 >= std::numeric_limits<int>::max())) {      \
-       return ::tensorflow::internal::MakeCheckOpString(v1, v2, exprtext);\
+  inline string* name##Impl(const int v1, const size_t v2,                \
+                            const char* exprtext) {                       \
+    if (TF_PREDICT_FALSE(v2 >= std::numeric_limits<int>::max())) {        \
+      return ::tensorflow::internal::MakeCheckOpString(v1, v2, exprtext); \
     }                                                                     \
     const size_t uval = (size_t)((unsigned)v2);                           \
     return name##Impl<size_t, size_t>(v1, uval, exprtext);                \
@@ -225,12 +227,12 @@ string* MakeCheckOpString(const T1& v1, const T2& v2, const char* exprtext) {
 // This happens if, for example, those are used as token names in a
 // yacc grammar.
 TF_DEFINE_CHECK_OP_IMPL(Check_EQ,
-                        == )  // Compilation error with CHECK_EQ(NULL, x)?
-TF_DEFINE_CHECK_OP_IMPL(Check_NE, != )  // Use CHECK(x == NULL) instead.
-TF_DEFINE_CHECK_OP_IMPL(Check_LE, <= )
-TF_DEFINE_CHECK_OP_IMPL(Check_LT, < )
-TF_DEFINE_CHECK_OP_IMPL(Check_GE, >= )
-TF_DEFINE_CHECK_OP_IMPL(Check_GT, > )
+                        ==)  // Compilation error with CHECK_EQ(NULL, x)?
+TF_DEFINE_CHECK_OP_IMPL(Check_NE, !=)  // Use CHECK(x == NULL) instead.
+TF_DEFINE_CHECK_OP_IMPL(Check_LE, <=)
+TF_DEFINE_CHECK_OP_IMPL(Check_LT, <)
+TF_DEFINE_CHECK_OP_IMPL(Check_GE, >=)
+TF_DEFINE_CHECK_OP_IMPL(Check_GT, >)
 #undef TF_DEFINE_CHECK_OP_IMPL
 
 // In optimized mode, use CheckOpString to hint to compiler that
diff --git a/tensorflow/core/platform/denormal.cc b/tensorflow/core/platform/denormal.cc
index f13b0af2a79bec4538c64cbc475681f6eb0ce127..e00dbdb4ae5ef682369b345353e236a6084460ef 100644
--- a/tensorflow/core/platform/denormal.cc
+++ b/tensorflow/core/platform/denormal.cc
@@ -41,8 +41,8 @@ namespace tensorflow {
 namespace port {
 
 ScopedFlushDenormal::ScopedFlushDenormal() {
-// For now, we flush denormals only on SSE 3.  Other architectures such as ARM
-// can be added as needed.
+  // For now, we flush denormals only on SSE 3.  Other architectures such as ARM
+  // can be added as needed.
 
 #ifdef DENORM_USE_INTRINSICS
   if (TestCPUFeature(SSE3)) {
diff --git a/tensorflow/core/platform/device_tracer_test.cc b/tensorflow/core/platform/device_tracer_test.cc
index c0c08dabacbcb9fdbbfd9bdbe16bcfaea7328507..89f14e905afa4e2c10055f59721fe4cabf082781 100644
--- a/tensorflow/core/platform/device_tracer_test.cc
+++ b/tensorflow/core/platform/device_tracer_test.cc
@@ -77,7 +77,8 @@ class DeviceTracerTest : public ::testing::Test {
 
     Node* y_neg = test::graph::Unary(&graph, "Neg", i);
     y_neg_ = y_neg->name();
-    y_neg->set_assigned_device_name("/job:localhost/replica:0/task:0/device:GPU:0");
+    y_neg->set_assigned_device_name(
+        "/job:localhost/replica:0/task:0/device:GPU:0");
 
     test::graph::ToGraphDef(&graph, &def_);
   }
diff --git a/tensorflow/core/platform/env.h b/tensorflow/core/platform/env.h
index 557bfa87e50a85a6f9de86548931ea215d8ac7ff..34aaf3f78ba983de2ca84cd5281219a244cdbd72 100644
--- a/tensorflow/core/platform/env.h
+++ b/tensorflow/core/platform/env.h
@@ -286,7 +286,7 @@ class Env {
   // "version" should be the version of the library or NULL
   // returns the name that LoadLibrary() can use
   virtual string FormatLibraryFileName(const string& name,
-      const string& version) = 0;
+                                       const string& version) = 0;
 
  private:
   // Returns a possible list of local temporary directories.
@@ -353,6 +353,7 @@ class EnvWrapper : public Env {
                                const string& version) override {
     return target_->FormatLibraryFileName(name, version);
   }
+
  private:
   Env* target_;
 };
diff --git a/tensorflow/core/platform/file_system.cc b/tensorflow/core/platform/file_system.cc
index 14755891fa2d3b916396c75c9647acafe66ec524..b9866cf641ac9126a3a7a3e9ecb2d3bc8f49ebc3 100644
--- a/tensorflow/core/platform/file_system.cc
+++ b/tensorflow/core/platform/file_system.cc
@@ -131,18 +131,19 @@ Status FileSystem::GetMatchingPaths(const string& pattern,
     if (children.empty()) continue;
     // This IsDirectory call can be expensive for some FS. Parallelizing it.
     children_dir_status.resize(children.size());
-    ForEach(0, children.size(), [this, &current_dir, &children, &fixed_prefix,
-                                 &children_dir_status](int i) {
-      const string child_path = io::JoinPath(current_dir, children[i]);
-      // In case the child_path doesn't start with the fixed_prefix then
-      // we don't need to explore this path.
-      if (!StringPiece(child_path).starts_with(fixed_prefix)) {
-        children_dir_status[i] =
-            Status(tensorflow::error::CANCELLED, "Operation not needed");
-      } else {
-        children_dir_status[i] = IsDirectory(child_path);
-      }
-    });
+    ForEach(0, children.size(),
+            [this, &current_dir, &children, &fixed_prefix,
+             &children_dir_status](int i) {
+              const string child_path = io::JoinPath(current_dir, children[i]);
+              // In case the child_path doesn't start with the fixed_prefix then
+              // we don't need to explore this path.
+              if (!StringPiece(child_path).starts_with(fixed_prefix)) {
+                children_dir_status[i] = Status(tensorflow::error::CANCELLED,
+                                                "Operation not needed");
+              } else {
+                children_dir_status[i] = IsDirectory(child_path);
+              }
+            });
     for (int i = 0; i < children.size(); ++i) {
       const string child_path = io::JoinPath(current_dir, children[i]);
       // If the IsDirectory call was cancelled we bail.
diff --git a/tensorflow/core/platform/gif.h b/tensorflow/core/platform/gif.h
index 9c72d34ff518abcabf773af607589fe8114beebf..ab095a35c93517c6527b55bd922dbeb46d695ca4 100644
--- a/tensorflow/core/platform/gif.h
+++ b/tensorflow/core/platform/gif.h
@@ -20,7 +20,8 @@ limitations under the License.
 
 #if defined(PLATFORM_GOOGLE)
 #include "tensorflow/core/platform/google/build_config/gif.h"
-#elif defined(PLATFORM_POSIX)|| defined(PLATFORM_WINDOWS) ||defined(PLATFORM_POSIX_ANDROID)
+#elif defined(PLATFORM_POSIX) || defined(PLATFORM_WINDOWS) || \
+    defined(PLATFORM_POSIX_ANDROID)
 #include <gif_lib.h>
 #else
 #error Define the appropriate PLATFORM_<foo> macro for this platform
diff --git a/tensorflow/core/platform/hadoop/hadoop_file_system.cc b/tensorflow/core/platform/hadoop/hadoop_file_system.cc
index 0baeac09841073ad6013a4700646e82d5d97182f..74863293a32451e8881c93de468539b913169aaa 100644
--- a/tensorflow/core/platform/hadoop/hadoop_file_system.cc
+++ b/tensorflow/core/platform/hadoop/hadoop_file_system.cc
@@ -164,8 +164,9 @@ Status HadoopFileSystem::Connect(StringPiece fname, hdfsFS* fs) {
   } else {
     hdfs_->hdfsBuilderSetNameNode(builder, nn.c_str());
   }
-  // KERB_TICKET_CACHE_PATH will be deleted in the future, Because KRB5CCNAME is the build in
-  // environment variable of Kerberos, so KERB_TICKET_CACHE_PATH and related code are unnecessary.
+  // KERB_TICKET_CACHE_PATH will be deleted in the future, Because KRB5CCNAME is
+  // the build in environment variable of Kerberos, so KERB_TICKET_CACHE_PATH
+  // and related code are unnecessary.
   char* ticket_cache_path = getenv("KERB_TICKET_CACHE_PATH");
   if (ticket_cache_path != nullptr) {
     hdfs_->hdfsBuilderSetKerbTicketCachePath(builder, ticket_cache_path);
diff --git a/tensorflow/core/platform/jpeg.h b/tensorflow/core/platform/jpeg.h
index edbcbd960a7d61970119bfb385f075e1d3ffb96f..1b5e633f0aad09850afa82bee59d45c7943bbd8a 100644
--- a/tensorflow/core/platform/jpeg.h
+++ b/tensorflow/core/platform/jpeg.h
@@ -20,7 +20,8 @@ limitations under the License.
 
 #if defined(PLATFORM_GOOGLE)
 #include "tensorflow/core/platform/google/build_config/jpeg.h"
-#elif defined(PLATFORM_POSIX)|| defined(PLATFORM_WINDOWS) ||defined(PLATFORM_POSIX_ANDROID)
+#elif defined(PLATFORM_POSIX) || defined(PLATFORM_WINDOWS) || \
+    defined(PLATFORM_POSIX_ANDROID)
 #include <stddef.h>
 #include <stdio.h>
 #include <stdlib.h>
diff --git a/tensorflow/core/platform/png.h b/tensorflow/core/platform/png.h
index 5b0203c343e6b1764a9cc8a7908919422d826bcb..dad18d72195953e78c6a169a19b9182ae6571485 100644
--- a/tensorflow/core/platform/png.h
+++ b/tensorflow/core/platform/png.h
@@ -20,7 +20,8 @@ limitations under the License.
 
 #if defined(PLATFORM_GOOGLE)
 #include "tensorflow/core/platform/google/build_config/png.h"
-#elif defined(PLATFORM_POSIX)|| defined(PLATFORM_WINDOWS) ||defined(PLATFORM_POSIX_ANDROID)
+#elif defined(PLATFORM_POSIX) || defined(PLATFORM_WINDOWS) || \
+    defined(PLATFORM_POSIX_ANDROID)
 #include <png.h>
 #else
 #error Define the appropriate PLATFORM_<foo> macro for this platform
diff --git a/tensorflow/core/platform/posix/error.cc b/tensorflow/core/platform/posix/error.cc
index cda6d7d8f9d6ad3e7f2c8fa56cc99a8dbe07fa00..2bb9443fb3c45e0cd4bb31a48539355747684b5f 100644
--- a/tensorflow/core/platform/posix/error.cc
+++ b/tensorflow/core/platform/posix/error.cc
@@ -73,19 +73,19 @@ error::Code ErrnoToCode(int err_number) {
     case ECHILD:      // No child processes
     case EISCONN:     // Socket is connected
 #if !defined(_WIN32) && !defined(__HAIKU__)
-    case ENOTBLK:     // Block device required
+    case ENOTBLK:  // Block device required
 #endif
-    case ENOTCONN:    // The socket is not connected
-    case EPIPE:       // Broken pipe
+    case ENOTCONN:  // The socket is not connected
+    case EPIPE:     // Broken pipe
 #if !defined(_WIN32)
-    case ESHUTDOWN:   // Cannot send after transport endpoint shutdown
+    case ESHUTDOWN:  // Cannot send after transport endpoint shutdown
 #endif
-    case ETXTBSY:     // Text file busy
+    case ETXTBSY:  // Text file busy
       code = error::FAILED_PRECONDITION;
       break;
-    case ENOSPC:   // No space left on device
+    case ENOSPC:  // No space left on device
 #if !defined(_WIN32)
-    case EDQUOT:   // Disk quota exceeded
+    case EDQUOT:  // Disk quota exceeded
 #endif
     case EMFILE:   // Too many open files
     case EMLINK:   // Too many links
@@ -95,7 +95,7 @@ error::Code ErrnoToCode(int err_number) {
     case ENOMEM:   // Not enough space
     case ENOSR:    // No STREAM resources
 #if !defined(_WIN32) && !defined(__HAIKU__)
-    case EUSERS:   // Too many users
+    case EUSERS:  // Too many users
 #endif
       code = error::RESOURCE_EXHAUSTED;
       break;
@@ -104,17 +104,17 @@ error::Code ErrnoToCode(int err_number) {
     case ERANGE:     // Result too large
       code = error::OUT_OF_RANGE;
       break;
-    case ENOSYS:           // Function not implemented
-    case ENOTSUP:          // Operation not supported
-    case EAFNOSUPPORT:     // Address family not supported
+    case ENOSYS:        // Function not implemented
+    case ENOTSUP:       // Operation not supported
+    case EAFNOSUPPORT:  // Address family not supported
 #if !defined(_WIN32)
-    case EPFNOSUPPORT:     // Protocol family not supported
+    case EPFNOSUPPORT:  // Protocol family not supported
 #endif
     case EPROTONOSUPPORT:  // Protocol not supported
 #if !defined(_WIN32) && !defined(__HAIKU__)
     case ESOCKTNOSUPPORT:  // Socket type not supported
 #endif
-    case EXDEV:            // Improper link
+    case EXDEV:  // Improper link
       code = error::UNIMPLEMENTED;
       break;
     case EAGAIN:        // Resource temporarily unavailable
@@ -123,7 +123,7 @@ error::Code ErrnoToCode(int err_number) {
     case ECONNRESET:    // Connection reset
     case EINTR:         // Interrupted function call
 #if !defined(_WIN32)
-    case EHOSTDOWN:     // Host is down
+    case EHOSTDOWN:  // Host is down
 #endif
     case EHOSTUNREACH:  // Host is unreachable
     case ENETDOWN:      // Network is down
@@ -139,7 +139,7 @@ error::Code ErrnoToCode(int err_number) {
       break;
     case EDEADLK:  // Resource deadlock avoided
 #if !defined(_WIN32)
-    case ESTALE:   // Stale file handle
+    case ESTALE:  // Stale file handle
 #endif
       code = error::ABORTED;
       break;
@@ -158,7 +158,7 @@ error::Code ErrnoToCode(int err_number) {
     case ENOMSG:       // No message of the desired type
     case EPROTO:       // Protocol error
 #if !defined(_WIN32) && !defined(__HAIKU__)
-    case EREMOTE:      // Object is remote
+    case EREMOTE:  // Object is remote
 #endif
       code = error::UNKNOWN;
       break;
diff --git a/tensorflow/core/platform/profile_utils/android_armv7a_cpu_utils_helper.h b/tensorflow/core/platform/profile_utils/android_armv7a_cpu_utils_helper.h
index 8604b01c53ef69040a919dadda73df897e98b0e1..ce2069b004473a684a1882068d3479ed049c58d6 100644
--- a/tensorflow/core/platform/profile_utils/android_armv7a_cpu_utils_helper.h
+++ b/tensorflow/core/platform/profile_utils/android_armv7a_cpu_utils_helper.h
@@ -58,8 +58,8 @@ class AndroidArmV7ACpuUtilsHelper : public ICpuUtilsHelper {
   TF_DISALLOW_COPY_AND_ASSIGN(AndroidArmV7ACpuUtilsHelper);
 };
 
-}  // profile_utils
-}  // tensorflow
+}  // namespace profile_utils
+}  // namespace tensorflow
 
 #endif  // defined(__ANDROID__) && (__ANDROID_API__ >= 21) &&
         // (defined(__ARM_ARCH_7A__) || defined(__aarch64__))
diff --git a/tensorflow/core/platform/profile_utils/cpu_utils.cc b/tensorflow/core/platform/profile_utils/cpu_utils.cc
index d3362690d7e08c8e88e8168b62c8134b6af5a319..02de7d1362bbfca645d07ee72165283351944b9b 100644
--- a/tensorflow/core/platform/profile_utils/cpu_utils.cc
+++ b/tensorflow/core/platform/profile_utils/cpu_utils.cc
@@ -28,15 +28,17 @@ namespace profile_utils {
 
 static ICpuUtilsHelper* cpu_utils_helper_instance_ = nullptr;
 
-#if (defined(__powerpc__) || defined(__ppc__) && ( __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) || (defined(__s390x__))
-   /* static */ uint64 CpuUtils::GetCycleCounterFrequency() {
-     static const uint64 cpu_frequency = GetCycleCounterFrequencyImpl();
-     return cpu_frequency;
+#if (defined(__powerpc__) ||                                             \
+     defined(__ppc__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) || \
+    (defined(__s390x__))
+/* static */ uint64 CpuUtils::GetCycleCounterFrequency() {
+  static const uint64 cpu_frequency = GetCycleCounterFrequencyImpl();
+  return cpu_frequency;
 }
 #else
-   /* static */ int64 CpuUtils::GetCycleCounterFrequency() {
-     static const int64 cpu_frequency = GetCycleCounterFrequencyImpl();
-     return cpu_frequency;
+/* static */ int64 CpuUtils::GetCycleCounterFrequency() {
+  static const int64 cpu_frequency = GetCycleCounterFrequencyImpl();
+  return cpu_frequency;
 }
 #endif
 
diff --git a/tensorflow/core/platform/profile_utils/cpu_utils.h b/tensorflow/core/platform/profile_utils/cpu_utils.h
index 5d215b4804dbee8cb785c99b09ec725101bacb4e..7b580c8bf606cdd9acf998fa21cb1d946e5e6ada 100644
--- a/tensorflow/core/platform/profile_utils/cpu_utils.h
+++ b/tensorflow/core/platform/profile_utils/cpu_utils.h
@@ -42,7 +42,7 @@ namespace profile_utils {
 class CpuUtils {
  public:
   // Constant for invalid frequency.
-  // This value is returned when the furequency is not obtained somehow.
+  // This value is returned when the frequency is not obtained somehow.
   static constexpr int64 INVALID_FREQUENCY = -1;
   static constexpr uint64 DUMMY_CYCLE_CLOCK = 1;
 
@@ -94,16 +94,18 @@ class CpuUtils {
 #endif
   }
 
-  // Return cycle counter frequency.
-  // As this method caches the cpu frequency internally,
-  // the first call will incur overhead, but not subsequent calls.
-  #if (defined(__powerpc__) || defined(__ppc__) && ( __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) || (defined(__s390x__))
-     static uint64 GetCycleCounterFrequency();
-  #else
-     static int64 GetCycleCounterFrequency();
-  #endif
+// Return cycle counter frequency.
+// As this method caches the cpu frequency internally,
+// the first call will incur overhead, but not subsequent calls.
+#if (defined(__powerpc__) ||                                             \
+     defined(__ppc__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) || \
+    (defined(__s390x__))
+  static uint64 GetCycleCounterFrequency();
+#else
+  static int64 GetCycleCounterFrequency();
+#endif
 
-  // Return micro secound per each clock
+  // Return micro second per each clock
   // As this method caches the cpu frequency internally,
   // the first call will incur overhead, but not subsequent calls.
   static double GetMicroSecPerClock();
diff --git a/tensorflow/core/platform/profile_utils/cpu_utils_test.cc b/tensorflow/core/platform/profile_utils/cpu_utils_test.cc
index 5b11b684dd9833bf742faaeaa3e79d2b49a78c6d..eb8161fbfd5ddfc796edd66a9119ad70c3c1de8e 100644
--- a/tensorflow/core/platform/profile_utils/cpu_utils_test.cc
+++ b/tensorflow/core/platform/profile_utils/cpu_utils_test.cc
@@ -53,15 +53,17 @@ TEST_F(CpuUtilsTest, CheckGetCurrentClockCycle) {
 }
 
 TEST_F(CpuUtilsTest, CheckCycleCounterFrequency) {
-  #if (defined(__powerpc__) || defined(__ppc__) && ( __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) || (defined(__s390x__))
-     const uint64 cpu_frequency = CpuUtils::GetCycleCounterFrequency();
-     CHECK_GT(cpu_frequency, 0);
-     CHECK_NE(cpu_frequency, unsigned(CpuUtils::INVALID_FREQUENCY));
-  #else
-     const int64 cpu_frequency = CpuUtils::GetCycleCounterFrequency();
-     CHECK_GT(cpu_frequency, 0);
-     CHECK_NE(cpu_frequency, CpuUtils::INVALID_FREQUENCY);
-  #endif
+#if (defined(__powerpc__) ||                                             \
+     defined(__ppc__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) || \
+    (defined(__s390x__))
+  const uint64 cpu_frequency = CpuUtils::GetCycleCounterFrequency();
+  CHECK_GT(cpu_frequency, 0);
+  CHECK_NE(cpu_frequency, unsigned(CpuUtils::INVALID_FREQUENCY));
+#else
+  const int64 cpu_frequency = CpuUtils::GetCycleCounterFrequency();
+  CHECK_GT(cpu_frequency, 0);
+  CHECK_NE(cpu_frequency, CpuUtils::INVALID_FREQUENCY);
+#endif
   if (DBG) {
     LOG(INFO) << "Cpu frequency = " << cpu_frequency;
   }
diff --git a/tensorflow/core/platform/profile_utils/i_cpu_utils_helper.h b/tensorflow/core/platform/profile_utils/i_cpu_utils_helper.h
index 51c54d50d1dadcf78e8263ce44b07c998b68c05c..11b739c0096b5b5fd498bb5c753a54c8b1628208 100644
--- a/tensorflow/core/platform/profile_utils/i_cpu_utils_helper.h
+++ b/tensorflow/core/platform/profile_utils/i_cpu_utils_helper.h
@@ -47,7 +47,7 @@ class ICpuUtilsHelper {
   TF_DISALLOW_COPY_AND_ASSIGN(ICpuUtilsHelper);
 };
 
-}  // profile_utils
-}  // tensorflow
+}  // namespace profile_utils
+}  // namespace tensorflow
 
 #endif  // TENSORFLOW_PLATFORM_PROFILEUTILS_I_CPU_UTILS_HELPER_H__
diff --git a/tensorflow/core/platform/protobuf_internal.h b/tensorflow/core/platform/protobuf_internal.h
index 7d6e8f57a62e08a7897bdccdeb7033363b282bd4..2f151a5aee6af067e4536bb569b4c0799c831b98 100644
--- a/tensorflow/core/platform/protobuf_internal.h
+++ b/tensorflow/core/platform/protobuf_internal.h
@@ -45,8 +45,8 @@ Status ParseAny(const google::protobuf::Any& any, T* message,
 #ifdef TENSORFLOW_LITE_PROTOS
   if (any.type_url() != strings::StrCat("type.googleapis.com/", type_name)) {
     return errors::FailedPrecondition(
-        "Expected Any type_url for: ", type_name, ". Got: ",
-        string(any.type_url().data(), any.type_url().size()), ".");
+        "Expected Any type_url for: ", type_name,
+        ". Got: ", string(any.type_url().data(), any.type_url().size()), ".");
   }
   if (!message->ParseFromString(any.value())) {
     return errors::FailedPrecondition("Failed to unpack: ",
diff --git a/tensorflow/core/platform/s3/aws_logging.cc b/tensorflow/core/platform/s3/aws_logging.cc
index fbca0acc36b01fa91dece4bdd0d19b7059dc114e..44317f1a3e41831b903bd0044d53d1eba80168df 100644
--- a/tensorflow/core/platform/s3/aws_logging.cc
+++ b/tensorflow/core/platform/s3/aws_logging.cc
@@ -96,7 +96,7 @@ Aws::Utils::Logging::LogLevel ParseLogLevelFromEnv() {
 
   return log_level;
 }
-}
+}  // namespace
 
 static bool initialized = false;
 static mutex s3_logging_mutex(LINKER_INITIALIZED);
diff --git a/tensorflow/core/platform/s3/s3_file_system.cc b/tensorflow/core/platform/s3/s3_file_system.cc
index 2c0babe098f2e7a066338e5cb2a25aedf16db8d9..4862fd85be0b67ca6abea486885dadea2e96efeb 100644
--- a/tensorflow/core/platform/s3/s3_file_system.cc
+++ b/tensorflow/core/platform/s3/s3_file_system.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include <aws/core/utils/FileSystemUtils.h>
 #include <aws/core/utils/logging/AWSLogging.h>
 #include <aws/core/utils/logging/LogSystemInterface.h>
+#include <aws/core/utils/StringUtils.h>
 #include <aws/s3/S3Client.h>
 #include <aws/s3/S3Errors.h>
 #include <aws/s3/model/CopyObjectRequest.h>
@@ -305,8 +306,15 @@ std::shared_ptr<Aws::S3::S3Client> S3FileSystem::GetS3Client() {
     };
     Aws::InitAPI(options);
 
-    this->s3_client_ = std::shared_ptr<Aws::S3::S3Client>(
-        new Aws::S3::S3Client(GetDefaultClientConfig()));
+    // The creation of S3Client disables virtual addressing:
+    //   S3Client(clientConfiguration, signPayloads, useVirtualAdressing = true)
+    // The purpose is to address the issue encountered when there is an `.`
+    // in the bucket name. Due to TLS hostname validation or DNS rules,
+    // the bucket may not be resolved. Disabling of virtual addressing
+    // should address the issue. See GitHub issue 16397 for details.
+    this->s3_client_ = std::shared_ptr<Aws::S3::S3Client>(new Aws::S3::S3Client(
+        GetDefaultClientConfig(),
+        Aws::Client::AWSAuthV4Signer::PayloadSigningPolicy::Never, false));
   }
 
   return this->s3_client_;
@@ -607,7 +615,8 @@ Status S3FileSystem::RenameFile(const string& src, const string& target) {
       Aws::String src_key = object.GetKey();
       Aws::String target_key = src_key;
       target_key.replace(0, src_object.length(), target_object.c_str());
-      Aws::String source = Aws::String(src_bucket.c_str()) + "/" + src_key;
+      Aws::String source = Aws::String(src_bucket.c_str()) + "/"
+          + Aws::Utils::StringUtils::URLEncode(src_key.c_str());
 
       copyObjectRequest.SetBucket(target_bucket.c_str());
       copyObjectRequest.SetKey(target_key);
diff --git a/tensorflow/core/platform/s3/s3_file_system.h b/tensorflow/core/platform/s3/s3_file_system.h
index 168b8007f3b60c60724682dd7fc4e95f8d15a413..8177e48dba52f11458faeb3092a12e6801f6b7ef 100644
--- a/tensorflow/core/platform/s3/s3_file_system.h
+++ b/tensorflow/core/platform/s3/s3_file_system.h
@@ -57,6 +57,18 @@ class S3FileSystem : public FileSystem {
   Status RenameFile(const string& src, const string& target) override;
  private:
   // Returns the member S3 client, initializing as-needed.
+  // When the client tries to access the object in S3, e.g.,
+  //   s3://bucket-name/path/to/object
+  // the behavior could be controlled by various environmental
+  // variables.
+  // By default S3 access regional endpoint, with region
+  // controlled by `AWS_REGION`. The endpoint could be overridden
+  // explicitly with `S3_ENDPOINT`. S3 uses HTTPS by default.
+  // If S3_USE_HTTPS=0 is specified, HTTP is used. Also,
+  // S3_VERIFY_SSL=0 could disable SSL verification in case
+  // HTTPS is used.
+  // This S3 Client does not support Virtual Hosted–Style Method
+  // for a bucket.
   std::shared_ptr<Aws::S3::S3Client> GetS3Client();
 
   std::shared_ptr<Aws::S3::S3Client> s3_client_;
diff --git a/tensorflow/core/platform/setround.cc b/tensorflow/core/platform/setround.cc
index 0c66da09bb9aa1c892063be11c66aedaf75d7eb6..592626bfa17e691d1b10ddce5c7f0f31ed825861 100644
--- a/tensorflow/core/platform/setround.cc
+++ b/tensorflow/core/platform/setround.cc
@@ -15,7 +15,6 @@ limitations under the License.
 
 #include "tensorflow/core/platform/setround.h"
 
-
 namespace tensorflow {
 namespace port {
 
diff --git a/tensorflow/core/platform/test_benchmark.h b/tensorflow/core/platform/test_benchmark.h
index a6636225ccbbc8154e290cd7f1aa6cafe3d2027a..327237dba933230cb313dd06091d2ff2ca3cc4b2 100644
--- a/tensorflow/core/platform/test_benchmark.h
+++ b/tensorflow/core/platform/test_benchmark.h
@@ -60,7 +60,7 @@ class Benchmark {
  private:
   string name_;
   int num_args_;
-  std::vector<std::pair<int, int>> args_;
+  std::vector<std::pair<int, int> > args_;
   void (*fn0_)(int) = nullptr;
   void (*fn1_)(int, int) = nullptr;
   void (*fn2_)(int, int, int) = nullptr;
diff --git a/tensorflow/core/platform/windows/cpu_info.h b/tensorflow/core/platform/windows/cpu_info.h
index d6e78dbc8f9f25070d94141e46d35dcb8d727ef7..f20939d3c0ff02be30f19be170644fab44b6f45e 100644
--- a/tensorflow/core/platform/windows/cpu_info.h
+++ b/tensorflow/core/platform/windows/cpu_info.h
@@ -22,8 +22,10 @@ limitations under the License.
 // Byte order defines provided by gcc. MSVC doesn't define those so
 // we define them here.
 // We assume that all windows platform out there are little endian.
+#if defined(_MSC_VER) && !defined(__clang__)
 #define __ORDER_LITTLE_ENDIAN__ 0x4d2
 #define __ORDER_BIG_ENDIAN__ 0x10e1
 #define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__
+#endif
 
 #endif  // TENSORFLOW_PLATFORM_WINDOWS_CPU_INFO_H_
diff --git a/tensorflow/core/platform/windows/env.cc b/tensorflow/core/platform/windows/env.cc
index 788a4bf4b1af74393099d1b590a1e589d9a07f25..41b264417071cadb5f70806b458ee2b46ebb2feb 100644
--- a/tensorflow/core/platform/windows/env.cc
+++ b/tensorflow/core/platform/windows/env.cc
@@ -24,9 +24,9 @@ limitations under the License.
 #undef LoadLibrary
 #undef ERROR
 
+#include <string>
 #include <thread>
 #include <vector>
-#include <string>
 
 #include "tensorflow/core/lib/core/error_codes.pb.h"
 #include "tensorflow/core/platform/load_library.h"
@@ -53,8 +53,7 @@ class StdThread : public Thread {
 
 class WindowsEnv : public Env {
  public:
-  WindowsEnv()
-      : GetSystemTimePreciseAsFileTime_(NULL) {
+  WindowsEnv() : GetSystemTimePreciseAsFileTime_(NULL) {
     // GetSystemTimePreciseAsFileTime function is only available in the latest
     // versions of Windows. For that reason, we try to look it up in
     // kernel32.dll at runtime and use an alternative option if the function
@@ -72,8 +71,8 @@ class WindowsEnv : public Env {
   }
 
   bool MatchPath(const string& path, const string& pattern) override {
-      std::wstring ws_path(WindowsFileSystem::Utf8ToWideChar(path));
-      std::wstring ws_pattern(WindowsFileSystem::Utf8ToWideChar(pattern));
+    std::wstring ws_path(WindowsFileSystem::Utf8ToWideChar(path));
+    std::wstring ws_pattern(WindowsFileSystem::Utf8ToWideChar(pattern));
     return PathMatchSpecW(ws_path.c_str(), ws_pattern.c_str()) == TRUE;
   }
 
@@ -122,14 +121,14 @@ class WindowsEnv : public Env {
     SetThreadpoolTimer(timer, &FileDueTime, 0, 0);
   }
 
-  Status LoadLibrary(const char *library_filename, void** handle) override {
+  Status LoadLibrary(const char* library_filename, void** handle) override {
     std::string file_name = library_filename;
     std::replace(file_name.begin(), file_name.end(), '/', '\\');
 
     std::wstring ws_file_name(WindowsFileSystem::Utf8ToWideChar(file_name));
 
     HMODULE hModule = LoadLibraryExW(ws_file_name.c_str(), NULL,
-      LOAD_WITH_ALTERED_SEARCH_PATH);
+                                     LOAD_WITH_ALTERED_SEARCH_PATH);
     if (!hModule) {
       return errors::NotFound(file_name + " not found");
     }
@@ -138,31 +137,30 @@ class WindowsEnv : public Env {
   }
 
   Status GetSymbolFromLibrary(void* handle, const char* symbol_name,
-    void** symbol) override {
+                              void** symbol) override {
     FARPROC found_symbol;
 
     found_symbol = GetProcAddress((HMODULE)handle, symbol_name);
     if (found_symbol == NULL) {
       return errors::NotFound(std::string(symbol_name) + " not found");
     }
-    *symbol = (void **)found_symbol;
+    *symbol = (void**)found_symbol;
     return Status::OK();
   }
 
-  string FormatLibraryFileName(const string& name, const string& version)
-    override {
+  string FormatLibraryFileName(const string& name,
+                               const string& version) override {
     string filename;
     if (version.size() == 0) {
       filename = name + ".dll";
-    }
-    else {
+    } else {
       filename = name + version + ".dll";
     }
     return filename;
   }
 
  private:
-  typedef VOID(WINAPI * FnGetSystemTimePreciseAsFileTime)(LPFILETIME);
+  typedef VOID(WINAPI* FnGetSystemTimePreciseAsFileTime)(LPFILETIME);
   FnGetSystemTimePreciseAsFileTime GetSystemTimePreciseAsFileTime_;
 };
 
diff --git a/tensorflow/core/platform/windows/error.cc b/tensorflow/core/platform/windows/error.cc
index 39e941a3834f7f7cd03e7791d43d56f190dc1fd6..291fc5003fb6bbc07274cdea72d73e92a453f363 100644
--- a/tensorflow/core/platform/windows/error.cc
+++ b/tensorflow/core/platform/windows/error.cc
@@ -21,7 +21,7 @@ namespace internal {
 std::string GetWindowsErrorMessage(DWORD err) {
   LPSTR buffer = NULL;
   DWORD flags = FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM |
-      FORMAT_MESSAGE_IGNORE_INSERTS;
+                FORMAT_MESSAGE_IGNORE_INSERTS;
   FormatMessageA(flags, NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),
                  reinterpret_cast<LPSTR>(&buffer), 0, NULL);
   std::string message = buffer;
diff --git a/tensorflow/core/platform/windows/error.h b/tensorflow/core/platform/windows/error.h
index 026e0d5aa946f7c851dacc05a3306631e06886aa..ba643a0fa8f92f58fbd88ac00fba3f663bb7e0f2 100644
--- a/tensorflow/core/platform/windows/error.h
+++ b/tensorflow/core/platform/windows/error.h
@@ -24,9 +24,7 @@ namespace tensorflow {
 namespace internal {
 
 std::string GetWindowsErrorMessage(DWORD err);
-
-}
 }
+}  // namespace tensorflow
 
 #endif  // TENSORFLOW_CORE_PLATFORM_WINDOWS_ERROR_H_
-
diff --git a/tensorflow/core/platform/windows/integral_types.h b/tensorflow/core/platform/windows/integral_types.h
index 4970b8ca6a1673dd24d2d445348fe5b337ae13be..46338a536dbc3541763e62954fee74b2a5a0700b 100644
--- a/tensorflow/core/platform/windows/integral_types.h
+++ b/tensorflow/core/platform/windows/integral_types.h
@@ -1,18 +1,18 @@
- /* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- ==============================================================================*/
- 
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
 #ifndef TENSORFLOW_PLATFORM_WINDOWS_INTEGRAL_TYPES_H_
 #define TENSORFLOW_PLATFORM_WINDOWS_INTEGRAL_TYPES_H_
 
diff --git a/tensorflow/core/platform/windows/net.cc b/tensorflow/core/platform/windows/net.cc
index 46eb072d42592028859122a4cad3d9478a96476e..2ab558ab95cafd15b10f7b887c846b32ab7e4c47 100644
--- a/tensorflow/core/platform/windows/net.cc
+++ b/tensorflow/core/platform/windows/net.cc
@@ -26,7 +26,7 @@ limitations under the License.
 
 #undef ERROR
 
-#pragma comment(lib,"Ws2_32.lib")
+#pragma comment(lib, "Ws2_32.lib")
 
 namespace tensorflow {
 namespace internal {
@@ -44,8 +44,8 @@ bool IsPortAvailable(int* port, bool is_tcp) {
   CHECK_GE(*port, 0);
   CHECK_LE(*port, 65535);
   if (sock == INVALID_SOCKET) {
-    LOG(ERROR) << "socket() failed: " <<
-        GetWindowsErrorMessage(WSAGetLastError());
+    LOG(ERROR) << "socket() failed: "
+               << GetWindowsErrorMessage(WSAGetLastError());
     return false;
   }
 
@@ -54,8 +54,8 @@ bool IsPortAvailable(int* port, bool is_tcp) {
   int result = setsockopt(sock, SOL_SOCKET, SO_REUSEADDR,
                           reinterpret_cast<const char*>(&one), sizeof(one));
   if (result == SOCKET_ERROR) {
-    LOG(ERROR) << "setsockopt() failed: " <<
-        GetWindowsErrorMessage(WSAGetLastError());
+    LOG(ERROR) << "setsockopt() failed: "
+               << GetWindowsErrorMessage(WSAGetLastError());
     closesocket(sock);
     return false;
   }
@@ -66,8 +66,8 @@ bool IsPortAvailable(int* port, bool is_tcp) {
   addr.sin_port = htons((uint16_t)*port);
   result = bind(sock, (struct sockaddr*)&addr, sizeof(addr));
   if (result == SOCKET_ERROR) {
-    LOG(WARNING) << "bind(port=" << *port << ") failed: " <<
-        GetWindowsErrorMessage(WSAGetLastError());
+    LOG(WARNING) << "bind(port=" << *port
+                 << ") failed: " << GetWindowsErrorMessage(WSAGetLastError());
     closesocket(sock);
     return false;
   }
@@ -75,8 +75,8 @@ bool IsPortAvailable(int* port, bool is_tcp) {
   // Get the bound port number.
   result = getsockname(sock, (struct sockaddr*)&addr, &addr_len);
   if (result == SOCKET_ERROR) {
-    LOG(WARNING) << "getsockname() failed: " <<
-        GetWindowsErrorMessage(WSAGetLastError());
+    LOG(WARNING) << "getsockname() failed: "
+                 << GetWindowsErrorMessage(WSAGetLastError());
     closesocket(sock);
     return false;
   }
diff --git a/tensorflow/core/platform/windows/subprocess.h b/tensorflow/core/platform/windows/subprocess.h
index b65313363ed79ab327414179a9923ba2d436dd0b..66ec44885d52195b807f4957aec6d590324b2975 100644
--- a/tensorflow/core/platform/windows/subprocess.h
+++ b/tensorflow/core/platform/windows/subprocess.h
@@ -19,8 +19,7 @@ limitations under the License.
 namespace tensorflow {
 
 // SubProcess is not yet implemented for Windows.
-class SubProcess {
-};
+class SubProcess {};
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/platform/windows/test.cc b/tensorflow/core/platform/windows/test.cc
index 0ffd02ff14849d77761e85c30388dc49a53c84db..584acad91b24fc6be9b93f71b7d44b0fba3cb2e8 100644
--- a/tensorflow/core/platform/windows/test.cc
+++ b/tensorflow/core/platform/windows/test.cc
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/platform/net.h"
 #include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/net.h"
 
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
diff --git a/tensorflow/core/platform/windows/windows_file_system.cc b/tensorflow/core/platform/windows/windows_file_system.cc
index 604348fe03a01d44195ba8a8ff427ae3ef3a4137..b6b3722caae4dc0cdc0ddff91be479ab91a744b2 100644
--- a/tensorflow/core/platform/windows/windows_file_system.cc
+++ b/tensorflow/core/platform/windows/windows_file_system.cc
@@ -13,12 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include <Shlwapi.h>
 #include <Windows.h>
 #include <direct.h>
 #include <errno.h>
 #include <fcntl.h>
 #include <io.h>
-#include <Shlwapi.h>
 #undef StrCat
 #include <stdio.h>
 #include <sys/stat.h>
@@ -75,16 +75,16 @@ SSIZE_T pread(HANDLE hfile, char* src, size_t num_bytes, uint64_t offset) {
   if (TRUE == read_result) {
     result = bytes_read;
   } else if ((FALSE == read_result) &&
-      ((last_error = GetLastError()) != ERROR_IO_PENDING)) {
+             ((last_error = GetLastError()) != ERROR_IO_PENDING)) {
     result = (last_error == ERROR_HANDLE_EOF) ? 0 : -1;
   } else {
-    if (ERROR_IO_PENDING == last_error) { // Otherwise bytes_read already has the result.
-      BOOL overlapped_result = ::GetOverlappedResult(hfile, &overlapped,
-                                                     &bytes_read, TRUE);
+    if (ERROR_IO_PENDING ==
+        last_error) {  // Otherwise bytes_read already has the result.
+      BOOL overlapped_result =
+          ::GetOverlappedResult(hfile, &overlapped, &bytes_read, TRUE);
       if (FALSE == overlapped_result) {
         result = (::GetLastError() == ERROR_HANDLE_EOF) ? 0 : -1;
-      }
-      else {
+      } else {
         result = bytes_read;
       }
     }
@@ -151,11 +151,11 @@ class WindowsWritableFile : public WritableFile {
   Status Append(const StringPiece& data) override {
     DWORD bytes_written = 0;
     DWORD data_size = static_cast<DWORD>(data.size());
-    BOOL write_result = ::WriteFile(hfile_, data.data(), data_size,
-                                    &bytes_written, NULL);
+    BOOL write_result =
+        ::WriteFile(hfile_, data.data(), data_size, &bytes_written, NULL);
     if (FALSE == write_result) {
-      return IOErrorFromWindowsError(
-          "Failed to WriteFile: " + filename_, ::GetLastError());
+      return IOErrorFromWindowsError("Failed to WriteFile: " + filename_,
+                                     ::GetLastError());
     }
 
     assert(size_t(bytes_written) == data.size());
@@ -171,8 +171,8 @@ class WindowsWritableFile : public WritableFile {
     }
 
     if (FALSE == ::CloseHandle(hfile_)) {
-      return IOErrorFromWindowsError(
-          "CloseHandle failed for: " + filename_, ::GetLastError());
+      return IOErrorFromWindowsError("CloseHandle failed for: " + filename_,
+                                     ::GetLastError());
     }
 
     hfile_ = INVALID_HANDLE_VALUE;
@@ -187,9 +187,7 @@ class WindowsWritableFile : public WritableFile {
     return Status::OK();
   }
 
-  Status Sync() override {
-    return Flush();
-  }
+  Status Sync() override { return Flush(); }
 };
 
 class WinReadOnlyMemoryRegion : public ReadOnlyMemoryRegion {
@@ -204,7 +202,10 @@ class WinReadOnlyMemoryRegion : public ReadOnlyMemoryRegion {
  public:
   WinReadOnlyMemoryRegion(const std::string& filename, HANDLE hfile,
                           HANDLE hmap, const void* address, uint64 length)
-      : filename_(filename), hfile_(hfile), hmap_(hmap), address_(address),
+      : filename_(filename),
+        hfile_(hfile),
+        hmap_(hmap),
+        address_(address),
         length_(length) {}
 
   ~WinReadOnlyMemoryRegion() {
@@ -238,9 +239,9 @@ Status WindowsFileSystem::NewRandomAccessFile(
   // almost all tests would work with a possible exception of fault_injection.
   DWORD share_mode = FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE;
 
-  HANDLE hfile = ::CreateFileW(ws_translated_fname.c_str(), GENERIC_READ,
-                               share_mode, NULL, OPEN_EXISTING, file_flags,
-                               NULL);
+  HANDLE hfile =
+      ::CreateFileW(ws_translated_fname.c_str(), GENERIC_READ, share_mode, NULL,
+                    OPEN_EXISTING, file_flags, NULL);
 
   if (INVALID_HANDLE_VALUE == hfile) {
     string context = "NewRandomAccessFile failed to Create/Open: " + fname;
@@ -258,9 +259,9 @@ Status WindowsFileSystem::NewWritableFile(
   result->reset();
 
   DWORD share_mode = FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE;
-  HANDLE hfile = ::CreateFileW(ws_translated_fname.c_str(), GENERIC_WRITE,
-                               share_mode, NULL, CREATE_ALWAYS,
-                               FILE_ATTRIBUTE_NORMAL, NULL);
+  HANDLE hfile =
+      ::CreateFileW(ws_translated_fname.c_str(), GENERIC_WRITE, share_mode,
+                    NULL, CREATE_ALWAYS, FILE_ATTRIBUTE_NORMAL, NULL);
 
   if (INVALID_HANDLE_VALUE == hfile) {
     string context = "Failed to create a NewWriteableFile: " + fname;
@@ -278,9 +279,9 @@ Status WindowsFileSystem::NewAppendableFile(
   result->reset();
 
   DWORD share_mode = FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE;
-  HANDLE hfile = ::CreateFileW(ws_translated_fname.c_str(), GENERIC_WRITE,
-                               share_mode, NULL, OPEN_ALWAYS,
-                               FILE_ATTRIBUTE_NORMAL, NULL);
+  HANDLE hfile =
+      ::CreateFileW(ws_translated_fname.c_str(), GENERIC_WRITE, share_mode,
+                    NULL, OPEN_ALWAYS, FILE_ATTRIBUTE_NORMAL, NULL);
 
   if (INVALID_HANDLE_VALUE == hfile) {
     string context = "Failed to create a NewAppendableFile: " + fname;
@@ -316,9 +317,9 @@ Status WindowsFileSystem::NewReadOnlyMemoryRegionFromFile(
   file_flags |= FILE_FLAG_OVERLAPPED;
 
   DWORD share_mode = FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE;
-  HANDLE hfile = ::CreateFileW(ws_translated_fname.c_str(), GENERIC_READ,
-                               share_mode, NULL, OPEN_EXISTING, file_flags,
-                               NULL);
+  HANDLE hfile =
+      ::CreateFileW(ws_translated_fname.c_str(), GENERIC_READ, share_mode, NULL,
+                    OPEN_EXISTING, file_flags, NULL);
 
   if (INVALID_HANDLE_VALUE == hfile) {
     return IOErrorFromWindowsError(
@@ -345,28 +346,32 @@ Status WindowsFileSystem::NewReadOnlyMemoryRegionFromFile(
                                        NULL);  // Mapping name
 
     if (!hmap) {
-      string context = "Failed to create file mapping for "
-                       "NewReadOnlyMemoryRegionFromFile: " + fname;
+      string context =
+          "Failed to create file mapping for "
+          "NewReadOnlyMemoryRegionFromFile: " +
+          fname;
       return IOErrorFromWindowsError(context, ::GetLastError());
     }
 
     UniqueCloseHandlePtr map_guard(hmap, CloseHandleFunc);
 
-    const void* mapped_region = ::MapViewOfFileEx(
-        hmap, FILE_MAP_READ,
-        0,  // High DWORD of access start
-        0,  // Low DWORD
-        file_size,
-        NULL);  // Let the OS choose the mapping
+    const void* mapped_region =
+        ::MapViewOfFileEx(hmap, FILE_MAP_READ,
+                          0,  // High DWORD of access start
+                          0,  // Low DWORD
+                          file_size,
+                          NULL);  // Let the OS choose the mapping
 
     if (!mapped_region) {
-      string context = "Failed to MapViewOfFile for "
-                       "NewReadOnlyMemoryRegionFromFile: " + fname;
+      string context =
+          "Failed to MapViewOfFile for "
+          "NewReadOnlyMemoryRegionFromFile: " +
+          fname;
       return IOErrorFromWindowsError(context, ::GetLastError());
     }
 
-    result->reset(new WinReadOnlyMemoryRegion(fname, hfile, hmap,
-                                              mapped_region, file_size));
+    result->reset(new WinReadOnlyMemoryRegion(fname, hfile, hmap, mapped_region,
+                                              file_size));
 
     map_guard.release();
     file_guard.release();
@@ -404,8 +409,8 @@ Status WindowsFileSystem::GetChildren(const string& dir,
   }
 
   do {
-	string file_name = WideCharToUtf8(find_data.cFileName);
-	const StringPiece basename = file_name;
+    string file_name = WideCharToUtf8(find_data.cFileName);
+    const StringPiece basename = file_name;
     if (basename != "." && basename != "..") {
       result->push_back(file_name);
     }
@@ -457,8 +462,7 @@ Status WindowsFileSystem::GetFileSize(const string& fname, uint64* size) {
     file_size.HighPart = attrs.nFileSizeHigh;
     file_size.LowPart = attrs.nFileSizeLow;
     *size = file_size.QuadPart;
-  }
-  else {
+  } else {
     string context = "Can not get size for: " + fname;
     result = IOErrorFromWindowsError(context, ::GetLastError());
   }
@@ -472,7 +476,7 @@ Status WindowsFileSystem::RenameFile(const string& src, const string& target) {
   std::wstring ws_translated_src = Utf8ToWideChar(TranslateName(src));
   std::wstring ws_translated_target = Utf8ToWideChar(TranslateName(target));
   if (!::MoveFileExW(ws_translated_src.c_str(), ws_translated_target.c_str(),
-      MOVEFILE_REPLACE_EXISTING)) {
+                     MOVEFILE_REPLACE_EXISTING)) {
     string context(strings::StrCat("Failed to rename: ", src, " to: ", target));
     result = IOErrorFromWindowsError(context, ::GetLastError());
   }
diff --git a/tensorflow/core/platform/windows/windows_file_system.h b/tensorflow/core/platform/windows/windows_file_system.h
index 8dcc1530370f0615ec45785a1f3d10ce828d11a3..ba0302f0fd8b56dabaf9271a725bebdac4716102 100644
--- a/tensorflow/core/platform/windows/windows_file_system.h
+++ b/tensorflow/core/platform/windows/windows_file_system.h
@@ -63,33 +63,35 @@ class WindowsFileSystem : public FileSystem {
 
   Status RenameFile(const string& src, const string& target) override;
 
-  string TranslateName(const string& name) const override {
-    return name;
-  }
+  string TranslateName(const string& name) const override { return name; }
 
   static std::wstring Utf8ToWideChar(const string& utf8str) {
-      int size_required = MultiByteToWideChar(CP_UTF8, 0, utf8str.c_str(), (int)utf8str.size(), NULL, 0);
-      std::wstring ws_translated_str(size_required, 0);
-      MultiByteToWideChar(CP_UTF8, 0, utf8str.c_str(), (int)utf8str.size(), &ws_translated_str[0], size_required);
-      return ws_translated_str;
+    int size_required = MultiByteToWideChar(CP_UTF8, 0, utf8str.c_str(),
+                                            (int)utf8str.size(), NULL, 0);
+    std::wstring ws_translated_str(size_required, 0);
+    MultiByteToWideChar(CP_UTF8, 0, utf8str.c_str(), (int)utf8str.size(),
+                        &ws_translated_str[0], size_required);
+    return ws_translated_str;
   }
 
-  static string WideCharToUtf8(const std::wstring &wstr) {
-      if (wstr.empty()) return std::string();
-      int size_required = WideCharToMultiByte(CP_UTF8, 0, wstr.c_str(), (int)wstr.size(), NULL, 0, NULL, NULL);
-      string utf8_translated_str(size_required, 0);
-      WideCharToMultiByte(CP_UTF8, 0, wstr.c_str(), (int)wstr.size(), &utf8_translated_str[0], size_required, NULL, NULL);
-      return utf8_translated_str;
+  static string WideCharToUtf8(const std::wstring& wstr) {
+    if (wstr.empty()) return std::string();
+    int size_required = WideCharToMultiByte(
+        CP_UTF8, 0, wstr.c_str(), (int)wstr.size(), NULL, 0, NULL, NULL);
+    string utf8_translated_str(size_required, 0);
+    WideCharToMultiByte(CP_UTF8, 0, wstr.c_str(), (int)wstr.size(),
+                        &utf8_translated_str[0], size_required, NULL, NULL);
+    return utf8_translated_str;
   }
 };
 
 class LocalWinFileSystem : public WindowsFileSystem {
-public:
-    string TranslateName(const string& name) const override {
-      StringPiece scheme, host, path;
-      io::ParseURI(name, &scheme, &host, &path);
-      return path.ToString();
-    }
+ public:
+  string TranslateName(const string& name) const override {
+    StringPiece scheme, host, path;
+    io::ParseURI(name, &scheme, &host, &path);
+    return path.ToString();
+  }
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/profiler/README.md b/tensorflow/core/profiler/README.md
index 9e628b10651423a7ce05392e675453c87f8b6c8c..57d76eb4cb9382790c80a0d55ee94b64e7b9dcdc 100644
--- a/tensorflow/core/profiler/README.md
+++ b/tensorflow/core/profiler/README.md
@@ -240,8 +240,9 @@ Open a Chrome browser, enter URL chrome://tracing and load the timeline file.
 # can also generate memory profile using `-select bytes`
 tfprof> code -select accelerator_micros -max_depth 100000 -output pprof:outfile=<filename>  -trim_name_regexes .*apply_op.*
 
-# Use pprof to visualize the generated file.
-pprof -png --nodecount=100 --sample_index=1 <filename>
+# Use google-pprof, from the google-perftools package to visualize the generated file.
+# On Ubuntu you can install it with `apt-get install it google-perftools`.
+google-pprof --pdf --nodecount=100 <filename>
 ```
 
 ![PprofGraph](g3doc/pprof.jpg)
@@ -256,7 +257,7 @@ bug fix. `OpLogProto` is a good plus if it is used.
 
 #### Teams
 
-* Xin Pan (xpan@google.com, github: panyx0718)
+* Xin Pan
 * Chris Antaki
 * Yao Zhang
 * Jon Shlens
diff --git a/tensorflow/core/profiler/internal/advisor/tfprof_advisor_test.cc b/tensorflow/core/profiler/internal/advisor/tfprof_advisor_test.cc
index d05143aff9b8cc0b9a0e9af9445ba79345e4bf62..e968b9c97e28eeae22954102d5f0e07e09d75f7f 100644
--- a/tensorflow/core/profiler/internal/advisor/tfprof_advisor_test.cc
+++ b/tensorflow/core/profiler/internal/advisor/tfprof_advisor_test.cc
@@ -53,10 +53,13 @@ class TFProfAdvisorTest : public ::testing::Test {
     NodeExecStats node_stat;
     node_stat.set_all_start_micros(start_miros);
     node_stat.set_op_end_rel_micros(end_rel_micros);
-    node->AddStepStat(step, "/job:localhost/replica:0/task:0/device:GPU:0", node_stat);
-    node->AddStepStat(step, "/job:localhost/replica:0/task:0/device:GPU:0:stream:all",
+    node->AddStepStat(step, "/job:localhost/replica:0/task:0/device:GPU:0",
                       node_stat);
-    node->AddStepStat(step, "/job:localhost/replica:0/task:0/device:GPU:0:stream:0",
+    node->AddStepStat(step,
+                      "/job:localhost/replica:0/task:0/device:GPU:0:stream:all",
+                      node_stat);
+    node->AddStepStat(step,
+                      "/job:localhost/replica:0/task:0/device:GPU:0:stream:0",
                       node_stat);
     return node;
   }
diff --git a/tensorflow/core/profiler/internal/tfprof_op.cc b/tensorflow/core/profiler/internal/tfprof_op.cc
index 5a8429d4893effc8bbfa0bf69e18b4a182e9a5df..3dce1d85db35436d162e73bf0946b320b899d5eb 100644
--- a/tensorflow/core/profiler/internal/tfprof_op.cc
+++ b/tensorflow/core/profiler/internal/tfprof_op.cc
@@ -113,8 +113,9 @@ const ShowMultiNode* TFOp::ShowInternal(const Options& opts,
     root_->formatted_str = FormatNode(root_.get(), root_.get(), opts);
   }
   if (timeline) {
-    fprintf(stderr, "op view doesn't support timeline yet. "
-                    "Consider graph/scope/code view.\n");
+    fprintf(stderr,
+            "op view doesn't support timeline yet. "
+            "Consider graph/scope/code view.\n");
     return root_.get();
   }
   if (cnodes_map_.empty()) {
@@ -265,9 +266,9 @@ string TFOp::FormatNode(OpNode* node, OpNode* root, const Options& opts) const {
     double pct = 0.0;
     if (node->proto().total_parameters() > 0) {
       accu_pct = 100.0 * node->proto().total_parameters() /
-          root->proto().total_parameters();
-      pct = 100.0 * node->proto().parameters() /
-          root->proto().total_parameters();
+                 root->proto().total_parameters();
+      pct =
+          100.0 * node->proto().parameters() / root->proto().total_parameters();
     }
     attrs.push_back(strings::Printf(
         "%30s",
@@ -282,9 +283,8 @@ string TFOp::FormatNode(OpNode* node, OpNode* root, const Options& opts) const {
     double pct = 0.0;
     if (node->proto().total_float_ops() > 0) {
       accu_pct = 100.0 * node->proto().total_float_ops() /
-          root->proto().total_float_ops();
-      pct = 100.0 * node->proto().float_ops() /
-          root->proto().total_float_ops();
+                 root->proto().total_float_ops();
+      pct = 100.0 * node->proto().float_ops() / root->proto().total_float_ops();
     }
 
     attrs.push_back(strings::Printf(
diff --git a/tensorflow/core/profiler/internal/tfprof_op.h b/tensorflow/core/profiler/internal/tfprof_op.h
index fe1c3b2ae826783c1405b6151b82f153c05d2901..aa22182d36cac8d7e1f9fb3143beadfdfe0efce6 100644
--- a/tensorflow/core/profiler/internal/tfprof_op.h
+++ b/tensorflow/core/profiler/internal/tfprof_op.h
@@ -41,8 +41,7 @@ namespace tfprof {
 // to input ops.
 class TFOp : public TFMultiShow {
  public:
-  explicit TFOp()
-      : TFMultiShow() {}
+  explicit TFOp() : TFMultiShow() {}
   ~TFOp() override {}
 
   void AddNode(TFGraphNode* node) override;
@@ -51,7 +50,7 @@ class TFOp : public TFMultiShow {
 
  private:
   const ShowMultiNode* ShowInternal(const Options& opts,
-                                   Timeline* timeline) override;
+                                    Timeline* timeline) override;
 
   int64 SearchRoot(const std::vector<OpNode*> nodes,
                    const std::vector<string>& regexes);
diff --git a/tensorflow/core/profiler/internal/tfprof_show.h b/tensorflow/core/profiler/internal/tfprof_show.h
index 4d6de060705435c5346f6f49810b7dfc05d4530e..81b021549a49625cd5ba4a6ba8130f12cc7cf5f7 100644
--- a/tensorflow/core/profiler/internal/tfprof_show.h
+++ b/tensorflow/core/profiler/internal/tfprof_show.h
@@ -78,40 +78,43 @@ class TFShow {
       return nodes;
     }
     std::vector<T*> sorted_nodes = nodes;
-    std::sort(sorted_nodes.begin(), sorted_nodes.end(), [&opts](const T* n1,
-                                                                const T* n2) {
-      if (n1->name() == kTFProfRoot) return true;
-      if (n2->name() == kTFProfRoot) return false;
-      bool name_cmp = n1->name() < n2->name();
-      if (opts.order_by == kOrderBy[0]) {
-        return name_cmp;
-      } else if (opts.order_by == kOrderBy[1]) {
-        return n1->proto().total_requested_bytes() >
-               n2->proto().total_requested_bytes();
-      } else if (opts.order_by == kOrderBy[2]) {
-        return n1->proto().total_peak_bytes() > n2->proto().total_peak_bytes();
-      } else if (opts.order_by == kOrderBy[3]) {
-        return n1->proto().total_residual_bytes() >
-               n2->proto().total_residual_bytes();
-      } else if (opts.order_by == kOrderBy[4]) {
-        return n1->proto().total_output_bytes() >
-               n2->proto().total_output_bytes();
-      } else if (opts.order_by == kOrderBy[5]) {
-        return n1->proto().total_exec_micros() >
-               n2->proto().total_exec_micros();
-      } else if (opts.order_by == kOrderBy[6]) {
-        return n1->proto().total_accelerator_exec_micros() >
-               n2->proto().total_accelerator_exec_micros();
-      } else if (opts.order_by == kOrderBy[7]) {
-        return n1->proto().total_cpu_exec_micros() >
-               n2->proto().total_cpu_exec_micros();
-      } else if (opts.order_by == kOrderBy[8]) {
-        return n1->proto().total_parameters() > n2->proto().total_parameters();
-      } else if (opts.order_by == kOrderBy[9]) {
-        return n1->proto().total_float_ops() > n2->proto().total_float_ops();
-      }
-      return name_cmp;
-    });
+    std::sort(sorted_nodes.begin(), sorted_nodes.end(),
+              [&opts](const T* n1, const T* n2) {
+                if (n1->name() == kTFProfRoot) return true;
+                if (n2->name() == kTFProfRoot) return false;
+                bool name_cmp = n1->name() < n2->name();
+                if (opts.order_by == kOrderBy[0]) {
+                  return name_cmp;
+                } else if (opts.order_by == kOrderBy[1]) {
+                  return n1->proto().total_requested_bytes() >
+                         n2->proto().total_requested_bytes();
+                } else if (opts.order_by == kOrderBy[2]) {
+                  return n1->proto().total_peak_bytes() >
+                         n2->proto().total_peak_bytes();
+                } else if (opts.order_by == kOrderBy[3]) {
+                  return n1->proto().total_residual_bytes() >
+                         n2->proto().total_residual_bytes();
+                } else if (opts.order_by == kOrderBy[4]) {
+                  return n1->proto().total_output_bytes() >
+                         n2->proto().total_output_bytes();
+                } else if (opts.order_by == kOrderBy[5]) {
+                  return n1->proto().total_exec_micros() >
+                         n2->proto().total_exec_micros();
+                } else if (opts.order_by == kOrderBy[6]) {
+                  return n1->proto().total_accelerator_exec_micros() >
+                         n2->proto().total_accelerator_exec_micros();
+                } else if (opts.order_by == kOrderBy[7]) {
+                  return n1->proto().total_cpu_exec_micros() >
+                         n2->proto().total_cpu_exec_micros();
+                } else if (opts.order_by == kOrderBy[8]) {
+                  return n1->proto().total_parameters() >
+                         n2->proto().total_parameters();
+                } else if (opts.order_by == kOrderBy[9]) {
+                  return n1->proto().total_float_ops() >
+                         n2->proto().total_float_ops();
+                }
+                return name_cmp;
+              });
     return sorted_nodes;
   }
 
diff --git a/tensorflow/core/profiler/internal/tfprof_show_multi.h b/tensorflow/core/profiler/internal/tfprof_show_multi.h
index 2a2208d8e78efd5bc20d0db23e5fdaabbb3e8d5a..711d35f9753cf85f7f318a9ac3de40d6d2bf786e 100644
--- a/tensorflow/core/profiler/internal/tfprof_show_multi.h
+++ b/tensorflow/core/profiler/internal/tfprof_show_multi.h
@@ -50,7 +50,7 @@ class TFMultiShow {
 
  protected:
   virtual const ShowMultiNode* ShowInternal(const Options& opts,
-                                           Timeline* timeline) = 0;
+                                            Timeline* timeline) = 0;
 
   bool LookUpCheckPoint(const string& name,
                         std::unique_ptr<TFProfTensor>* tensor);
diff --git a/tensorflow/core/profiler/internal/tfprof_timeline.h b/tensorflow/core/profiler/internal/tfprof_timeline.h
index 4428ab571f84ff75499f24d78af2547d512a8c1c..baf3fb2bedb13e13b21940485ec439c19a97dd02 100644
--- a/tensorflow/core/profiler/internal/tfprof_timeline.h
+++ b/tensorflow/core/profiler/internal/tfprof_timeline.h
@@ -20,8 +20,8 @@ limitations under the License.
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/step_stats.pb.h"
 #include "tensorflow/core/lib/strings/strcat.h"
-#include "tensorflow/core/protobuf/config.pb.h"
 #include "tensorflow/core/profiler/internal/tfprof_node_show.h"
+#include "tensorflow/core/protobuf/config.pb.h"
 
 namespace tensorflow {
 namespace tfprof {
@@ -178,7 +178,6 @@ class Timeline {
   int64 step_;
   const string outfile_;
   int64 next_pid_ = 0;
-  int64 allocator_pid_ = -1;
   MemoryTracker mem_tracker_;
   ChromeTraceFormatter chrome_formatter_;
   std::map<string, int64> device_pids_;
diff --git a/tensorflow/core/profiler/internal/tfprof_utils.cc b/tensorflow/core/profiler/internal/tfprof_utils.cc
index 2813bb46fa44bc1ed04e7e8f5cd02737a81abad4..7712ebd926f1df2d65b7f7d732b55846654ed218 100644
--- a/tensorflow/core/profiler/internal/tfprof_utils.cc
+++ b/tensorflow/core/profiler/internal/tfprof_utils.cc
@@ -355,9 +355,6 @@ static const char* const kOpTypes =
 static const char* const kScope =
     "scope: The nodes in the model graph are organized by their names, which "
     "is hierarchical like filesystem.";
-static const char* const kGraph =
-    "graph: The nodes in the model graph are organized by their operation "
-    "input and output.";
 static const char* const kCode =
     "code: When python trace is available, the nodes are python lines and "
     "their are organized by the python call stack.";
diff --git a/tensorflow/core/profiler/profiler.cc b/tensorflow/core/profiler/profiler.cc
index 2cc212d5898c15c0d066a477068f7c68fa244b54..808e3c853bec0efb9523ee413f3d5272a833358d 100644
--- a/tensorflow/core/profiler/profiler.cc
+++ b/tensorflow/core/profiler/profiler.cc
@@ -206,8 +206,12 @@ int Run(int argc, char** argv) {
         "graph_path,op_log_path,run_meta_path\n");
     std::unique_ptr<GraphDef> graph(new GraphDef());
     if (!FLAGS_graph_path.empty()) {
-      TF_CHECK_OK(
-          ReadProtoFile(Env::Default(), FLAGS_graph_path, graph.get(), false));
+      s = ReadProtoFile(Env::Default(), FLAGS_graph_path, graph.get(), false);
+      if (!s.ok()) {
+        fprintf(stderr, "Failed to read graph_path: %s\n",
+                s.ToString().c_str());
+        return 1;
+      }
     }
 
     std::unique_ptr<OpLogProto> op_log(new OpLogProto());
diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index 67da7bf4526235ae51eb172f8da9fc267cc12b98..50bfa9126789033c617e22f25dbb76273fccfc60 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -19,12 +19,12 @@ limitations under the License.
 // TensorFlow uses semantic versioning, see http://semver.org/.
 
 #define TF_MAJOR_VERSION 1
-#define TF_MINOR_VERSION 5
+#define TF_MINOR_VERSION 6
 #define TF_PATCH_VERSION 0
 
 // TF_VERSION_SUFFIX is non-empty for pre-releases (e.g. "-alpha", "-alpha.1",
 // "-beta", "-rc", "-rc.1")
-#define TF_VERSION_SUFFIX "-rc1"
+#define TF_VERSION_SUFFIX "-rc0"
 
 #define TF_STR_HELPER(x) #x
 #define TF_STR(x) TF_STR_HELPER(x)
diff --git a/tensorflow/core/util/bcast.cc b/tensorflow/core/util/bcast.cc
index 1eab7e3d024c181f260500686b9127dd76dbe206..3a5f1f83af8d2d2324f3139568aa69f204cf1248 100644
--- a/tensorflow/core/util/bcast.cc
+++ b/tensorflow/core/util/bcast.cc
@@ -69,9 +69,9 @@ BCast::BCast(const Vec& sx, const Vec& sy, const bool fewer_dims_optimization) {
       State curr = UNKNOWN;
       const int64 x_i = x[i];  // i-th dimension of x.
       const int64 y_i = y[i];  // i-th dimension of y.
-      int64 o_i;   // i-th dimension of the output.
-      int64 bx_i;  // i-th broadcast for x.
-      int64 by_i;  // i-th broadcast for y.
+      int64 o_i;               // i-th dimension of the output.
+      int64 bx_i;              // i-th broadcast for x.
+      int64 by_i;              // i-th broadcast for y.
       // Invariant:
       //   o_i = x_i * bx_i = y_i * by_i
       if (x_i == y_i) {
diff --git a/tensorflow/core/util/ctc/ctc_loss_calculator.h b/tensorflow/core/util/ctc/ctc_loss_calculator.h
index be00895b0d3517fe06a852685f79f32e5a0b5167..dd1163310bf406b66bdd450ac6bf840272f7c592 100644
--- a/tensorflow/core/util/ctc/ctc_loss_calculator.h
+++ b/tensorflow/core/util/ctc/ctc_loss_calculator.h
@@ -130,13 +130,13 @@ Status CTCLossCalculator::CalculateLoss(
   for (int t = 1; t < num_time_steps; ++t) {
     if (inputs[t].rows() != batch_size) {
       return errors::InvalidArgument("Expected batch size at t: ", t,
-                                     " to be: ", batch_size, " but got: ",
-                                     inputs[t].rows());
+                                     " to be: ", batch_size,
+                                     " but got: ", inputs[t].rows());
     }
     if (inputs[t].cols() != num_classes) {
       return errors::InvalidArgument("Expected class count at t: ", t,
-                                     " to be: ", num_classes, " but got: ",
-                                     inputs[t].cols());
+                                     " to be: ", num_classes,
+                                     " but got: ", inputs[t].cols());
     }
   }
 
@@ -282,8 +282,8 @@ Status CTCLossCalculator::PopulateLPrimes(
     LabelSequences* l_primes) const {
   // labels is a Label array of size batch_size
   if (labels.size() != batch_size) {
-    return errors::InvalidArgument("labels.size() != batch_size: ",
-                                   labels.size(), " vs. ", batch_size);
+    return errors::InvalidArgument(
+        "labels.size() != batch_size: ", labels.size(), " vs. ", batch_size);
   }
 
   *max_u_prime = 0;  // keep track of longest l' modified label sequence.
@@ -325,12 +325,13 @@ Status CTCLossCalculator::PopulateLPrimes(
     for (int l_i : l) {
       if (l_i < 0) {
         return errors::InvalidArgument(
-            "All labels must be nonnegative integers, batch: ", b, " labels: ",
-            str_util::Join(l, ","));
+            "All labels must be nonnegative integers, batch: ", b,
+            " labels: ", str_util::Join(l, ","));
       } else if (l_i >= num_classes) {
         return errors::InvalidArgument(
-            "No label may be greater than num_classes. ", "num_classes: ",
-            num_classes, ", batch: ", b, " labels: ", str_util::Join(l, ","));
+            "No label may be greater than num_classes. ",
+            "num_classes: ", num_classes, ", batch: ", b,
+            " labels: ", str_util::Join(l, ","));
       }
     }
     if (!ignore_longer_outputs_than_inputs) {
diff --git a/tensorflow/core/util/cuda_device_functions.h b/tensorflow/core/util/cuda_device_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..f787687f6628797ce9c7d21f65fb6fd983710bb6
--- /dev/null
+++ b/tensorflow/core/util/cuda_device_functions.h
@@ -0,0 +1,499 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_UTIL_CUDA_DEVICE_FUNCTIONS_H_
+#define TENSORFLOW_CORE_UTIL_CUDA_DEVICE_FUNCTIONS_H_
+
+/**
+ * Wrappers and helpers for CUDA device code.
+ *
+ * Wraps the warp-cooperative intrinsics introduced in CUDA 9 to provide
+ * backwards compatibility, see go/volta-porting for details.
+ * Provides atomic operations on types that aren't natively supported.
+ */
+
+#if GOOGLE_CUDA
+
+#include <algorithm>
+#include <complex>
+#include "cuda/include/cuda.h"
+#include "cuda/include/device_functions.h"
+#include "tensorflow/core/platform/types.h"
+
+#if CUDA_VERSION >= 7050
+#include "cuda/include/cuda_fp16.h"
+#endif  // CUDA_VERSION >= 7050
+
+namespace tensorflow {
+
+namespace detail {
+
+// Helper for range-based for loop using 'delta' increments.
+// Usage: see CudaGridRange?() functions below.
+template <typename T>
+class CudaGridRange {
+  struct Iterator {
+    __device__ Iterator(T index, T delta) : index_(index), delta_(delta) {}
+    __device__ T operator*() const { return index_; }
+    __device__ Iterator& operator++() {
+      index_ += delta_;
+      return *this;
+    }
+    __device__ bool operator!=(const Iterator& other) const {
+      bool greater = index_ > other.index_;
+      bool less = index_ < other.index_;
+      // Anything past an end iterator (delta_ == 0) is equal.
+      // In range-based for loops, this optimizes to 'return less'.
+      if (!other.delta_) {
+        return less;
+      }
+      if (!delta_) {
+        return greater;
+      }
+      return less || greater;
+    }
+
+   private:
+    T index_;
+    const T delta_;
+  };
+
+ public:
+  __device__ CudaGridRange(T begin, T delta, T end)
+      : begin_(begin), delta_(delta), end_(end) {}
+
+  __device__ Iterator begin() const { return Iterator{begin_, delta_}; }
+  __device__ Iterator end() const { return Iterator{end_, 0}; }
+
+ private:
+  T begin_;
+  T delta_;
+  T end_;
+};
+
+}  // namespace detail
+
+// Helper to visit indices in the range 0 <= i < count, using the x-coordinate
+// of the global thread index. That is, each index i is visited by all threads
+// with the same x-coordinate.
+// Usage: for(int i : CudaGridRangeX(count)) { visit(i); }
+template <typename T>
+__device__ detail::CudaGridRange<T> CudaGridRangeX(T count) {
+  return detail::CudaGridRange<T>(blockIdx.x * blockDim.x + threadIdx.x,
+                                  gridDim.x * blockDim.x, count);
+}
+
+// Helper to visit indices in the range 0 <= i < count using the y-coordinate.
+// Usage: for(int i : CudaGridRangeY(count)) { visit(i); }
+template <typename T>
+__device__ detail::CudaGridRange<T> CudaGridRangeY(T count) {
+  return detail::CudaGridRange<T>(blockIdx.y * blockDim.y + threadIdx.y,
+                                  gridDim.y * blockDim.y, count);
+}
+
+// Helper to visit indices in the range 0 <= i < count using the z-coordinate.
+// Usage: for(int i : CudaGridRangeZ(count)) { visit(i); }
+template <typename T>
+__device__ detail::CudaGridRange<T> CudaGridRangeZ(T count) {
+  return detail::CudaGridRange<T>(blockIdx.z * blockDim.z + threadIdx.z,
+                                  gridDim.z * blockDim.z, count);
+}
+
+// Mask for all 32 threads in a warp.
+const unsigned kCudaWarpAll = 0xffffffff;
+
+// Returns the warp lane ID of the calling thread
+__device__ inline unsigned CudaLaneId() {
+  unsigned int lane_id;
+  asm("mov.u32 %0, %%laneid;" : "=r"(lane_id));
+  return lane_id;
+}
+
+namespace detail {
+// Returns true if mask is a valid parameter for __shfl*sync to return a well
+// defined value, assuming the calling lane will read from src_lane as part of
+// the shuffle operation.
+//
+// Specifically, returns true iff mask has the calling lane bit and the src_lane
+// bit set, and the src_lane calls this function with the same mask value
+// (required for the two threads to wait for each other).
+//
+// On Volta, for some invalid masks, this function hangs or returns false
+// positives, because the implementation shuffles with the same mask that
+// we are validating. Run on Pascal if you suspect that the mask is incorrect.
+__device__ inline bool CudaValidateShuffleSyncMask(unsigned mask,
+                                                   unsigned src_lane) {
+  unsigned src_dst_mask = 1u << CudaLaneId() | 1u << src_lane;
+#if CUDA_VERSION >= 9000
+  unsigned src_lane_mask = __shfl_sync(mask, mask, src_lane);
+#else
+  unsigned src_lane_mask = __shfl(mask, src_lane);
+#endif
+  return (src_dst_mask & ~mask) == 0 && src_lane_mask == mask;
+}
+
+// Returns the actual source lane for shuffle.
+__device__ inline unsigned CudaShuffleGetSrcLane(int src_lane, int width) {
+  int lane_id = CudaLaneId();
+  int lane_base = lane_id & ~width + 1;
+  int lane_offset = src_lane & width - 1;
+  return lane_base + lane_offset;
+}
+
+// Returns the source lane for shuffle up.
+__device__ inline unsigned CudaShuffleUpGetSrcLane(unsigned delta, int width) {
+  unsigned lane_id = CudaLaneId();
+  if ((lane_id & width - 1) < delta) {
+    return lane_id;
+  }
+  return lane_id - delta;
+}
+
+// Returns the source lane for shuffle down.
+__device__ inline unsigned CudaShuffleDownGetSrcLane(unsigned delta,
+                                                     int width) {
+  unsigned lane_id = CudaLaneId();
+  if ((lane_id & width - 1) + delta >= width) {
+    return lane_id;
+  }
+  return lane_id + delta;
+}
+
+// Returns the source lane for shuffle xor.
+__device__ inline unsigned CudaShuffleXorGetSrcLane(int lane_mask, int width) {
+  int lane_id = CudaLaneId();
+  int src_lane = lane_id ^ lane_mask;
+  if (src_lane > (lane_id | width - 1)) {
+    return lane_id;
+  }
+  return src_lane;
+}
+}  // namespace detail
+
+// For all *_sync wrappers below, it is illegal to synchronize threads from
+// different program locations, because that is not supported before sm_70.
+// In other words, all threads in 'mask' must call the functions in convergence.
+// Code that requires sm_70 (and CUDA 9) may use the intrinsic directly.
+//
+// It is also illegal to shuffle with a mask that produces an undefined result
+// for any of the threads. Specifically, all source threads of the shuffle
+// must have their corresponding bit in 'mask' set.
+
+// Wrapper for __syncwarp. No-op for CUDA 8 and earlier.
+__device__ inline void CudaSyncWarp(unsigned mask = kCudaWarpAll) {
+  assert(mask & 1u << CudaLaneId());
+#if CUDA_VERSION >= 9000
+  __syncwarp(mask);
+#endif
+}
+
+// Wrapper for __ballot_sync. All threads in 'mask' must call this function in
+// convergence, see comment above for details.
+__device__ inline unsigned CudaBallotSync(unsigned mask, int pred) {
+  assert(mask & 1u << CudaLaneId());
+#if CUDA_VERSION >= 9000
+  return __ballot_sync(mask, pred);
+#else
+  return __ballot(pred) & mask;  // Apply mask to match __ballot_sync's spec.
+#endif
+}
+
+// Wrapper for __any_sync. All threads in 'mask' must call this function in
+// convergence, see comment above for details.
+__device__ inline int CudaAnySync(unsigned mask, int pred) {
+  assert(mask & 1u << CudaLaneId());
+#if CUDA_VERSION >= 9000
+  return __any_sync(mask, pred);
+#else
+  return __any(pred);
+#endif
+}
+
+// Wrapper for __all_sync. All threads in 'mask' must call this function in
+// convergence, see comment above for details.
+__device__ inline int CudaAllSync(unsigned mask, int pred) {
+  assert(mask & 1u << CudaLaneId());
+#if CUDA_VERSION >= 9000
+  return __all_sync(mask, pred);
+#else
+  return __all(pred);
+#endif
+}
+
+// Wrapper for __shfl_sync. All threads in 'mask' must call this function in
+// convergence, see comment above for details.
+template <typename T>
+__device__ T CudaShuffleSync(unsigned mask, T value, int src_lane,
+                             int width = warpSize) {
+  assert(!(width & width - 1));
+  assert(detail::CudaValidateShuffleSyncMask(
+      mask, detail::CudaShuffleGetSrcLane(src_lane, width)));
+#if CUDA_VERSION >= 9000
+  return __shfl_sync(mask, value, src_lane, width);
+#else
+  return __shfl(value, src_lane, width);
+#endif
+}
+
+// Variant of the (undocumented) version from the CUDA SDK, but using unsigned
+// instead of float for lo and hi (which is incorrect with ftz, for example).
+// See b/69446944.
+__device__ inline double CudaShuffleSync(unsigned mask, double value,
+                                         int src_lane, int width = warpSize) {
+  unsigned lo, hi;
+  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(value));
+  hi = CudaShuffleSync(mask, hi, src_lane, width);
+  lo = CudaShuffleSync(mask, lo, src_lane, width);
+  asm volatile("mov.b64 %0, {%1,%2};" : "=d"(value) : "r"(lo), "r"(hi));
+  return value;
+}
+
+// Wrapper for __shfl_up_sync. All threads in 'mask' must call this function in
+// convergence, see comment above for details.
+template <typename T>
+__device__ inline T CudaShuffleUpSync(unsigned mask, T value, unsigned delta,
+                                      int width = warpSize) {
+  assert(!(width & width - 1));
+  assert(detail::CudaValidateShuffleSyncMask(
+      mask, detail::CudaShuffleUpGetSrcLane(delta, width)));
+#if CUDA_VERSION >= 9000
+  return __shfl_up_sync(mask, value, delta, width);
+#else
+  return __shfl_up(value, delta, width);
+#endif
+}
+
+// Variant of the (undocumented) version from the CUDA SDK, but using unsigned
+// instead of float for lo and hi (which is incorrect with ftz, for example).
+// See b/69446944.
+__device__ inline double CudaShuffleUpSync(unsigned mask, double value,
+                                           unsigned delta,
+                                           int width = warpSize) {
+  unsigned lo, hi;
+  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(value));
+  hi = CudaShuffleUpSync(mask, hi, delta, width);
+  lo = CudaShuffleUpSync(mask, lo, delta, width);
+  asm volatile("mov.b64 %0, {%1,%2};" : "=d"(value) : "r"(lo), "r"(hi));
+  return value;
+}
+
+// Wrapper for __shfl_down_sync. All threads in 'mask' must call this function
+// in convergence, see comment above for details.
+template <typename T>
+__device__ inline T CudaShuffleDownSync(unsigned mask, T value, unsigned delta,
+                                        int width = warpSize) {
+  assert(!(width & width - 1));
+  assert(detail::CudaValidateShuffleSyncMask(
+      mask, detail::CudaShuffleDownGetSrcLane(delta, width)));
+#if CUDA_VERSION >= 9000
+  return __shfl_down_sync(mask, value, delta, width);
+#else
+  return __shfl_down(value, delta, width);
+#endif
+}
+
+// Variant of the (undocumented) version from the CUDA SDK, but using unsigned
+// instead of float for lo and hi (which is incorrect with ftz, for example).
+// See b/69446944.
+__device__ inline double CudaShuffleDownSync(unsigned mask, double value,
+                                             unsigned delta,
+                                             int width = warpSize) {
+  unsigned lo, hi;
+  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(value));
+  hi = CudaShuffleDownSync(mask, hi, delta, width);
+  lo = CudaShuffleDownSync(mask, lo, delta, width);
+  asm volatile("mov.b64 %0, {%1,%2};" : "=d"(value) : "r"(lo), "r"(hi));
+  return value;
+}
+
+// Wrapper for __shfl_xor_sync. All threads in 'mask' must call this function in
+// convergence, see comment above for details.
+template <typename T>
+__device__ T CudaShuffleXorSync(unsigned mask, T value, int lane_mask,
+                                int width = warpSize) {
+  assert(!(width & width - 1));
+  assert(detail::CudaValidateShuffleSyncMask(
+      mask, detail::CudaShuffleXorGetSrcLane(lane_mask, width)));
+#if CUDA_VERSION >= 9000
+  return __shfl_xor_sync(mask, value, lane_mask, width);
+#else
+  return __shfl_xor(value, lane_mask, width);
+#endif
+}
+
+// Variant of the (undocumented) version from the CUDA SDK, but using unsigned
+// instead of float for lo and hi (which is incorrect with ftz, for example).
+// See b/69446944.
+__device__ inline double CudaShuffleXorSync(unsigned mask, double value,
+                                            int lane_mask,
+                                            int width = warpSize) {
+  unsigned lo, hi;
+  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(value));
+  hi = CudaShuffleXorSync(mask, hi, lane_mask, width);
+  lo = CudaShuffleXorSync(mask, lo, lane_mask, width);
+  asm volatile("mov.b64 %0, {%1,%2};" : "=d"(value) : "r"(lo), "r"(hi));
+  return value;
+}
+
+// Wrapper for __ldg.
+template <typename T>
+__host__ __device__ T CudaLdg(const T* address) {
+#if __CUDA_ARCH__ >= 350
+  return __ldg(address);
+#else
+  return *address;
+#endif
+}
+
+__host__ __device__ inline bool CudaLdg(const bool* address) {
+  return CudaLdg(reinterpret_cast<const char*>(address)) != 0;
+}
+
+__host__ __device__ inline std::complex<float> CudaLdg(
+    const std::complex<float>* address) {
+#if __CUDA_ARCH__ >= 350
+  float2 mem = __ldg(reinterpret_cast<const float2*>(address));
+  return std::complex<float>(mem.x, mem.y);
+#else
+  return *address;
+#endif
+}
+
+__host__ __device__ inline std::complex<double> CudaLdg(
+    const std::complex<double>* address) {
+#if __CUDA_ARCH__ >= 350
+  double2 mem = __ldg(reinterpret_cast<const double2*>(address));
+  return std::complex<double>(mem.x, mem.y);
+#else
+  return *address;
+#endif
+}
+
+// Zeroes count elements starting at ptr using all threads of a 1-D grid.
+// Note: this function does not synchronize, and therefore the memory range is
+// not guaranteed to be zero until the next kernel launch.
+template <typename T>
+__global__ void SetZero(const int count, T* ptr) {
+  // Check that the grid is one dimensional and index doesn't overflow.
+  assert(blockDim.y == 1 && blockDim.z == 1);
+  assert(blockDim.x * gridDim.x / blockDim.x == gridDim.x);
+  for (int i : CudaGridRangeX(count)) {
+    ptr[i] = T(0);
+  }
+}
+
+namespace detail {
+// Helper function for atomic accumulation implemented as CAS.
+template <typename T, typename F>
+__device__ T CudaAtomicCasHelper(T* ptr, F accumulate) {
+  T old = *ptr;
+  T assumed;
+  do {
+    assumed = old;
+    old = atomicCAS(ptr, assumed, accumulate(assumed));
+  } while (assumed != old);
+  return old;
+}
+
+// Overload for floating point (using integer comparison to handle NaN
+// correctly).
+template <typename F>
+__device__ float CudaAtomicCasHelper(float* ptr, F accumulate) {
+  return __float_as_int(
+      CudaAtomicCasHelper(reinterpret_cast<int32*>(ptr), [accumulate](int32 a) {
+        return __float_as_int(accumulate(__int_as_float(a)));
+      }));
+}
+template <typename F>
+__device__ double CudaAtomicCasHelper(double* ptr, F accumulate) {
+  return __longlong_as_double(CudaAtomicCasHelper(
+      reinterpret_cast<tensorflow::uint64*>(ptr),
+      [accumulate](tensorflow::uint64 a) {
+        return __double_as_longlong(accumulate(__longlong_as_double(a)));
+      }));
+}
+
+template <typename From, typename To>
+using ToTypeIfConvertible =
+    typename std::enable_if<std::is_convertible<From, To>::value, To>::type;
+
+}  // namespace detail
+
+// CUDA provides atomic ops, but not for all types.  We provide wrappers
+// for some ops and provide implementation for all reasonable types.
+
+template <typename T, typename U>
+__device__ detail::ToTypeIfConvertible<U, T> CudaAtomicAdd(T* ptr, U value) {
+  return atomicAdd(ptr, value);
+}
+#if __CUDA_ARCH__ < 600
+__device__ inline double CudaAtomicAdd(double* ptr, double value) {
+  return detail::CudaAtomicCasHelper(ptr,
+                                     [value](double a) { return a + value; });
+}
+#elif __clang__
+// Clang cannot compile __nvvm_atom_add_gen_d builtin yet, use inline PTX.
+// see https://reviews.llvm.org/D39638
+__device__ inline double CudaAtomicAdd(double* ptr, double value) {
+  double result;
+  asm volatile("atom.add.f64 %0, [%1], %2;"
+               : "=d"(result)
+               : "l"(ptr), "d"(value)
+               : "memory");
+  return result;
+}
+#endif
+
+template <typename T, typename U>
+__device__ detail::ToTypeIfConvertible<U, T> CudaAtomicSub(T* ptr, U value) {
+  return atomicSub(ptr, value);
+}
+// Specializations of substraction which add the negative value.
+__device__ inline float CudaAtomicSub(float* ptr, float value) {
+  return CudaAtomicAdd(ptr, -value);
+}
+__device__ inline double CudaAtomicSub(double* ptr, double value) {
+  return CudaAtomicAdd(ptr, -value);
+}
+__device__ inline tensorflow::uint64 CudaAtomicSub(tensorflow::uint64* ptr,
+                                                   tensorflow::uint64 value) {
+  return CudaAtomicAdd(ptr, -value);
+}
+
+template <typename T, typename U>
+__device__ detail::ToTypeIfConvertible<U, T> CudaAtomicMax(T* ptr, U value) {
+  return atomicMax(ptr, value);
+}
+#if __CUDA_ARCH__ < 320
+__device__ inline tensorflow::uint64 CudaAtomicMax(tensorflow::uint64* ptr,
+                                                   tensorflow::uint64 value) {
+  return detail::CudaAtomicCasHelper(
+      ptr, [value](tensorflow::uint64 a) { return max(a, value); });
+}
+#endif
+
+template <typename T, typename U>
+__device__ detail::ToTypeIfConvertible<U, T> CudaAtomicMul(T* ptr, U value) {
+  return detail::CudaAtomicCasHelper(ptr, [value](T a) { return a * value; });
+}
+template <typename T, typename U>
+__device__ detail::ToTypeIfConvertible<U, T> CudaAtomicDiv(T* ptr, U value) {
+  return detail::CudaAtomicCasHelper(ptr, [value](T a) { return a / value; });
+}
+
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
+#endif  // TENSORFLOW_CORE_UTIL_CUDA_KERNEL_HELPER_H_
diff --git a/tensorflow/core/util/cuda_kernel_helper.h b/tensorflow/core/util/cuda_kernel_helper.h
index 3e32ec79731e1529affb49cf6e1aff3f23b84262..18a4c008f138bb4ba3b1e4c381781e0c363863f7 100644
--- a/tensorflow/core/util/cuda_kernel_helper.h
+++ b/tensorflow/core/util/cuda_kernel_helper.h
@@ -18,299 +18,133 @@ limitations under the License.
 
 #if GOOGLE_CUDA
 
-#include <algorithm>
+#include "tensorflow/core/util/cuda_device_functions.h"
+#include "tensorflow/core/util/cuda_launch_config.h"
 
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
-#include "cuda/include/cuda.h"
-#include "tensorflow/core/framework/op_kernel.h"
-#include "tensorflow/core/platform/logging.h"
-#include "tensorflow/core/platform/stream_executor.h"
-#include "tensorflow/core/platform/types.h"
+// Deprecated, use 'for(int i : CudaGridRangeX(n))' instead.
+#define CUDA_1D_KERNEL_LOOP(i, n) \
+  for (int i : ::tensorflow::CudaGridRangeX<int>(n))
+// Deprecated, use 'for(int i : CudaGridRange?(n))' instead.
+#define CUDA_AXIS_KERNEL_LOOP(i, n, axis) \
+  for (int i : ::tensorflow::CudaGridRange##axis<int>(n))
 
-// Mask for all 32 threads in a warp.
-#define CUDA_WARP_ALL 0xFFFFFFFF
-
-#if defined(CUDA_VERSION) && CUDA_VERSION < 9000
-// CUDA 9.0 introduces a new, light-weight barrier synchronization primitive
-// that operates at the warp-scope. This is required to ensure visibility of
-// reads/writes among threads that can make indepenent progress on Volta.
-// For previous CUDA versions these synchronizations not necessary, and we
-// define an empty function as a convenience for backward compatibility.
-__device__ inline void __syncwarp(unsigned mask = CUDA_WARP_ALL) {}
-
-// CUDA 9.0 deprecates the warp-intrinsic functions (shfl, ballot, etc.) in
-// favor of synchronizing versions. These ensure that all warp lanes specified
-// in mask execute the intrinsic in convergence. Here we provide legacy mappings
-// to the less-verbose routines provided in previous versions of CUDA.
-#define __ballot_sync(mask, predicate) __ballot(predicate)
-#define __shfl_sync(mask, val, srcLane, width) __shfl(val, srcLane, width)
-#define __shfl_down_sync(mask, val, delta, width) __shfl_down(val, delta, width)
-#define __shfl_up_sync(mask, val, delta, width) __shfl_up(val, delta, width)
-#define __shfl_xor_sync(mask, val, laneMask, width) \
-  __shfl_xor(val, laneMask, width)
-#endif
-
-// Usage of GetCudaLaunchConfig, GetCuda2DLaunchConfig, and
-// GetCuda3DLaunchConfig:
-//
-// There are two versions of GetCudaLaunchConfig and GetCuda2DLaunchConfig, one
-// version uses heuristics without any knowledge of the device kernel, the other
-// version uses cudaOccupancyMaxPotentialBlockSize to determine the theoretical
-// launch parameters that maximize occupancy. Currently, only the maximum
-// occupancy version of GetCuda3DLaunchConfig is available.
-//
-// For large number of work elements, the convention is that each kernel would
-// iterate through its assigned range. The return value of GetCudaLaunchConfig
-// is struct CudaLaunchConfig, which contains all the information needed for the
-// kernel launch, including: virtual number of threads, the number of threads
-// per block and number of threads per block used inside <<< >>> of a kernel
-// launch. GetCuda2DLaunchConfig and GetCuda3DLaunchConfig does the same thing
-// as CudaLaunchConfig. The only difference is the dimension. The macros
-// CUDA_1D_KERNEL_LOOP and CUDA_AXIS_KERNEL_LOOP might be used to do inner loop.
-//
-/* Sample code:
-
-__global__ void MyKernel1D(CudaLaunchConfig config, other_args...) {
-  CUDA_1D_KERNEL_LOOP(x, config.virtual_thread_count) {
-    do_your_job_here;
-  }
+namespace tensorflow {
+__host__ __device__ inline tensorflow::bfloat16 CudaLdg(
+    const tensorflow::bfloat16* address) {
+  tensorflow::bfloat16 return_value;
+  return_value.value = CudaLdg(reinterpret_cast<const uint16_t*>(address));
+  return return_value;
 }
 
-__global__ void MyKernel2D(Cuda2DLaunchConfig config, other_args...) {
-  CUDA_AXIS_KERNEL_LOOP(x, config.virtual_thread_count, x) {
-    CUDA_AXIS_KERNEL_LOOP(y, config.virtual_thread_count, y) {
-      do_your_job_here;
-    }
-  }
+template <typename T>
+__host__ __device__ inline T ldg(const T* ptr) {
+  return CudaLdg(ptr);
 }
 
-__global__ void MyKernel3D(Cuda3DLaunchConfig config, other_args...) {
-  CUDA_AXIS_KERNEL_LOOP(x, config.virtual_thread_count, x) {
-    CUDA_AXIS_KERNEL_LOOP(y, config.virtual_thread_count, y) {
-      CUDA_AXIS_KERNEL_LOOP(z, config.virtual_thread_count, z) {
-        do_your_job_here;
-      }
-    }
-  }
+template <typename T>
+__host__ __device__ inline const T& tf_min(const T& x, const T& y) {
+  return x < y ? x : y;
 }
 
-void MyDriverFunc(const GPUDevice &d) {
-  // use heuristics
-  CudaLaunchConfig cfg1 = GetCudaLaunchConfig(10240, d);
-  MyKernel1D <<<config.block_count,
-                config.thread_per_block, 0, d.stream()>>> (cfg1, other_args...);
-  Cuda2DLaunchConfig cfg2 = GetCuda2DLaunchConfig(10240, 10240, d);
-  MyKernel2D <<<config.block_count,
-                config.thread_per_block, 0, d.stream()>>> (cfg2, other_args...);
-  Cuda3DLaunchConfig cfg3 = GetCuda3DLaunchConfig(4096, 4096, 100, d);
-  MyKernel3D <<<config.block_count,
-                config.thread_per_block, 0, d.stream()>>> (cfg3, other_args...);
-
-  // maximize occupancy
-  CudaLaunchConfig cfg4 = GetCudaLaunchConfig(10240, d, MyKernel1D, 0, 0 );
-  MyKernel1D <<<config.block_count,
-                config.thread_per_block, 0, d.stream()>>> (cfg4, other_args...);
-  Cuda2DLaunchConfig cfg5 = GetCuda2DLaunchConfig(10240, 10240, d,
-                                                  MyKernel1D, 0, 0);
-  MyKernel2D <<<config.block_count,
-                config.thread_per_block, 0, d.stream()>>> (cfg5, other_args...);
-  Cuda3DLaunchConfig cfg6 = GetCuda3DLaunchConfig(4096, 4096, 100, d,
-                                                  MyKernel1D, 0, 0);
-  MyKernel3D <<<config.block_count,
-                config.thread_per_block, 0, d.stream()>>> (cfg6, other_args...);
+template <typename T>
+__host__ __device__ inline const T& tf_max(const T& x, const T& y) {
+  return x < y ? y : x;
 }
 
-// See the test for this for more example:
-//
-https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/util/cuda_kernel_helper_test.cu.cc
-
-*/
-
-#define CUDA_1D_KERNEL_LOOP(i, n)                            \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
-       i += blockDim.x * gridDim.x)
-
-#define CUDA_AXIS_KERNEL_LOOP(i, n, axis)                                  \
-  for (int i = blockIdx.axis * blockDim.axis + threadIdx.axis; i < n.axis; \
-       i += blockDim.axis * gridDim.axis)
-
-#define DIV_UP(a, b) (((a) + (b)-1) / (b))
-
-namespace tensorflow {
-
-typedef Eigen::GpuDevice GPUDevice;
-
-struct CudaLaunchConfig {
-  // Logical number of thread that works on the elements. If each logical
-  // thread works on exactly a single element, this is the same as the working
-  // element count.
-  int virtual_thread_count = -1;
-  // Number of threads per block.
-  int thread_per_block = -1;
-  // Number of blocks for Cuda kernel launch.
-  int block_count = -1;
-};
-
-// Calculate the Cuda launch config we should use for a kernel launch.
-// This is assuming the kernel is quite simple and will largely be
-// memory-limited.
-// REQUIRES: work_element_count > 0.
-inline CudaLaunchConfig GetCudaLaunchConfig(int work_element_count,
-                                            const GPUDevice& d) {
-  CHECK_GT(work_element_count, 0);
-  CudaLaunchConfig config;
-  const int virtual_thread_count = work_element_count;
-  const int physical_thread_count = std::min(
-      d.getNumCudaMultiProcessors() * d.maxCudaThreadsPerMultiProcessor(),
-      virtual_thread_count);
-  const int thread_per_block = std::min(1024, d.maxCudaThreadsPerBlock());
-  const int block_count =
-      std::min(DIV_UP(physical_thread_count, thread_per_block),
-               d.getNumCudaMultiProcessors());
-
-  config.virtual_thread_count = virtual_thread_count;
-  config.thread_per_block = thread_per_block;
-  config.block_count = block_count;
-  return config;
+// Overloads of the above functions for float and double.
+__host__ __device__ inline float tf_min(float x, float y) {
+  return fminf(x, y);
 }
-
-// Calculate the Cuda launch config we should use for a kernel launch. This
-// variant takes the resource limits of func into account to maximize occupancy.
-// REQUIRES: work_element_count > 0.
-template <typename DeviceFunc>
-inline CudaLaunchConfig GetCudaLaunchConfig(int work_element_count,
-                                            const GPUDevice& d, DeviceFunc func,
-                                            size_t dynamic_shared_memory_size,
-                                            int block_size_limit) {
-  CHECK_GT(work_element_count, 0);
-  CudaLaunchConfig config;
-  int block_count = 0;
-  int thread_per_block = 0;
-
-  cudaError_t err = cudaOccupancyMaxPotentialBlockSize(
-      &block_count, &thread_per_block, func, dynamic_shared_memory_size,
-      block_size_limit);
-  CHECK_EQ(err, cudaSuccess);
-
-  block_count =
-      std::min(block_count, DIV_UP(work_element_count, thread_per_block));
-
-  config.virtual_thread_count = work_element_count;
-  config.thread_per_block = thread_per_block;
-  config.block_count = block_count;
-  return config;
+__host__ __device__ inline double tf_min(double x, double y) {
+  return fmin(x, y);
+}
+__host__ __device__ inline float tf_max(float x, float y) {
+  return fmaxf(x, y);
+}
+__host__ __device__ inline double tf_max(double x, double y) {
+  return fmax(x, y);
 }
 
-struct Cuda2DLaunchConfig {
-  dim3 virtual_thread_count = dim3(0, 0, 0);
-  dim3 thread_per_block = dim3(0, 0, 0);
-  dim3 block_count = dim3(0, 0, 0);
-};
-
-inline Cuda2DLaunchConfig GetCuda2DLaunchConfig(int xdim, int ydim,
-                                                const GPUDevice& d) {
-  Cuda2DLaunchConfig config;
-
-  if (xdim <= 0 || ydim <= 0) {
-    return config;
-  }
-
-  const int kThreadsPerBlock = 256;
-  int block_cols = std::min(xdim, kThreadsPerBlock);
-  // ok to round down here and just do more loops in the kernel
-  int block_rows = std::max(kThreadsPerBlock / block_cols, 1);
-
-  const int physical_thread_count =
-      d.getNumCudaMultiProcessors() * d.maxCudaThreadsPerMultiProcessor();
-
-  const int max_blocks = std::max(physical_thread_count / kThreadsPerBlock, 1);
-
-  config.virtual_thread_count = dim3(xdim, ydim, 1);
-  config.thread_per_block = dim3(block_cols, block_rows, 1);
-
-  int grid_x = std::min(DIV_UP(xdim, block_cols), max_blocks);
+__device__ inline Eigen::half CudaShuffleSync(unsigned mask, Eigen::half value,
+                                              int src_lane,
+                                              int width = warpSize) {
+  return Eigen::half(
+      CudaShuffleSync(mask, static_cast<uint16>(value), src_lane, width));
+}
 
-  config.block_count = dim3(
-      grid_x, std::min(max_blocks / grid_x, std::max(ydim / block_rows, 1)), 1);
-  return config;
+__device__ EIGEN_ALWAYS_INLINE Eigen::half CudaShuffleUpSync(
+    unsigned mask, Eigen::half value, int delta, int width = warpSize) {
+  return Eigen::half(
+      CudaShuffleUpSync(mask, static_cast<uint16>(value), delta, width));
 }
 
-// Calculate the Cuda 2D and 3D launch config we should use for a kernel launch.
-// This variant takes the resource limits of func into account to maximize
-// occupancy.
-using Cuda3DLaunchConfig = Cuda2DLaunchConfig;
+__device__ EIGEN_ALWAYS_INLINE Eigen::half CudaShuffleDownSync(
+    unsigned mask, Eigen::half value, int delta, int width = warpSize) {
+  return Eigen::half(
+      CudaShuffleDownSync(mask, static_cast<uint16>(value), delta, width));
+}
 
-template <typename DeviceFunc>
-inline Cuda3DLaunchConfig GetCuda3DLaunchConfig(
-    int xdim, int ydim, int zdim, const GPUDevice& d, DeviceFunc func,
-    size_t dynamic_shared_memory_size, int block_size_limit) {
-  Cuda3DLaunchConfig config;
+__device__ EIGEN_ALWAYS_INLINE Eigen::half CudaShuffleXorSync(
+    unsigned mask, Eigen::half value, int lane_mask, int width = warpSize) {
+  return Eigen::half(
+      CudaShuffleXorSync(mask, static_cast<uint16>(value), lane_mask, width));
+}
 
-  if (xdim <= 0 || ydim <= 0 || zdim <= 0) {
-    return config;
+namespace detail {
+// Overload of above function for half. Note that we don't have
+// atomicCAS() for anything less than 32 bits, so we need to include the
+// other 16 bits in the operation.
+//
+// This version is going to be very slow
+// under high concurrency, since most threads will be spinning on failing
+// their compare-and-swap tests. (The fact that we get false sharing on the
+// neighboring fp16 makes this even worse.) If you are doing a large reduction,
+// you are much better off with doing the intermediate steps in fp32 and then
+// switching to fp16 as late as you can in the calculations.
+//
+// Note: Assumes little endian.
+template <typename F>
+__device__ Eigen::half CudaAtomicCasHelper(Eigen::half* ptr, F accumulate) {
+#if defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__)
+  static_assert(__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__, "Not little endian");
+#endif
+  namespace half_impl = Eigen::half_impl;
+  intptr_t intptr = reinterpret_cast<intptr_t>(ptr);
+  assert(!(intptr & 0x1));  // should be 2-aligned.
+  if (intptr & 0x2) {
+    // The half is in the second part of the uint32 (upper 16 bits).
+    uint32* address = reinterpret_cast<uint32*>(intptr - 2);
+    uint32 result = CudaAtomicCasHelper(address, [accumulate](uint32 arg) {
+      unsigned short high = static_cast<unsigned short>(arg >> 16);
+      Eigen::half acc = accumulate(half_impl::raw_uint16_to_half(high));
+      return (static_cast<uint32>(acc.x) << 16) | (arg & 0xffff);
+    });
+    return half_impl::raw_uint16_to_half(static_cast<uint16>(result >> 16));
+  } else {
+    // The half is in the first part of the uint32 (lower 16 bits).
+    uint32* address = reinterpret_cast<uint32*>(intptr);
+    uint32 result = CudaAtomicCasHelper(address, [accumulate](uint32 arg) {
+      unsigned short low = static_cast<unsigned short>(arg & 0xffff);
+      Eigen::half acc = accumulate(half_impl::raw_uint16_to_half(low));
+      return (arg & 0xffff0000) | static_cast<uint32>(acc.x);
+    });
+    return half_impl::raw_uint16_to_half(static_cast<uint16>(result & 0xffff));
   }
-
-  int dev;
-  cudaGetDevice(&dev);
-  cudaDeviceProp deviceProp;
-  cudaGetDeviceProperties(&deviceProp, dev);
-  int xthreadlimit = deviceProp.maxThreadsDim[0];
-  int ythreadlimit = deviceProp.maxThreadsDim[1];
-  int zthreadlimit = deviceProp.maxThreadsDim[2];
-  int xgridlimit = deviceProp.maxGridSize[0];
-  int ygridlimit = deviceProp.maxGridSize[1];
-  int zgridlimit = deviceProp.maxGridSize[2];
-
-  int block_count = 0;
-  int thread_per_block = 0;
-  cudaError_t err = cudaOccupancyMaxPotentialBlockSize(
-      &block_count, &thread_per_block, func, dynamic_shared_memory_size,
-      block_size_limit);
-  CHECK_EQ(err, cudaSuccess);
-
-#define MIN3(a, b, c) std::min((a), std::min((b), (c)))
-  int threadsx = MIN3(xdim, thread_per_block, xthreadlimit);
-  int threadsy =
-      MIN3(ydim, std::max(thread_per_block / threadsx, 1), ythreadlimit);
-  int threadsz =
-      MIN3(zdim, std::max(thread_per_block / (threadsx * threadsy), 1),
-           zthreadlimit);
-
-  int blocksx = MIN3(block_count, DIV_UP(xdim, threadsx), xgridlimit);
-  int blocksy =
-      MIN3(DIV_UP(block_count, blocksx), DIV_UP(ydim, threadsy), ygridlimit);
-  int blocksz = MIN3(DIV_UP(block_count, (blocksx * blocksy)),
-                     DIV_UP(zdim, threadsz), zgridlimit);
-#undef MIN3
-
-  config.virtual_thread_count = dim3(xdim, ydim, zdim);
-  config.thread_per_block = dim3(threadsx, threadsy, threadsz);
-  config.block_count = dim3(blocksx, blocksy, blocksz);
-  return config;
 }
+}  // namespace detail
 
-template <typename DeviceFunc>
-inline Cuda2DLaunchConfig GetCuda2DLaunchConfig(
-    int xdim, int ydim, const GPUDevice& d, DeviceFunc func,
-    size_t dynamic_shared_memory_size, int block_size_limit) {
-  return GetCuda3DLaunchConfig(xdim, ydim, 1, d, func,
-                               dynamic_shared_memory_size, block_size_limit);
+__device__ inline Eigen::half CudaAtomicAdd(Eigen::half* ptr,
+                                            Eigen::half value) {
+  return detail::CudaAtomicCasHelper(
+      ptr, [value](Eigen::half a) { return a + value; });
 }
-
-// Returns a raw reference to the current cuda stream.  Required by a
-// number of kernel calls (for which StreamInterface* does not work), i.e.
-// CUB and certain cublas primitives.
-inline const cudaStream_t& GetCudaStream(OpKernelContext* context) {
-  const cudaStream_t* ptr = CHECK_NOTNULL(
-      reinterpret_cast<const cudaStream_t*>(context->op_device_context()
-                                                ->stream()
-                                                ->implementation()
-                                                ->CudaStreamMemberHack()));
-  return *ptr;
+__device__ inline Eigen::half CudaAtomicSub(Eigen::half* ptr,
+                                            Eigen::half value) {
+  return detail::CudaAtomicCasHelper(
+      ptr, [value](Eigen::half a) { return a - value; });
 }
 
 namespace cuda_helper {
-
 template <typename IntType>
 __device__ IntType upper_bound(IntType* first, IntType count, IntType val) {
   IntType* orig = first;
@@ -330,495 +164,8 @@ __device__ IntType upper_bound(IntType* first, IntType count, IntType val) {
 
   return first - orig;
 }
-
 }  // namespace cuda_helper
-
-template <typename T>
-__device__ __host__ inline T ldg(const T* address) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
-  return __ldg(address);
-#else
-  return *address;
-#endif
-}
-
-template <>
-__device__ __host__ inline std::complex<float> ldg(
-    const std::complex<float>* address) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
-  float2 mem = __ldg(reinterpret_cast<const float2*>(address));
-  return std::complex<float>(mem.x, mem.y);
-#else
-  return *address;
-#endif
-}
-
-template <>
-__device__ __host__ inline std::complex<double> ldg(
-    const std::complex<double>* address) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
-  double2 mem = __ldg(reinterpret_cast<const double2*>(address));
-  return std::complex<double>(mem.x, mem.y);
-#else
-  return *address;
-#endif
-}
-
-template <>
-__device__ __host__ inline Eigen::half ldg(const Eigen::half* address) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
-  return Eigen::half_impl::raw_uint16_to_half(
-      __ldg(reinterpret_cast<const uint16_t*>(address)));
-#else
-  return *address;
-#endif
-}
-
-template <>
-__device__ __host__ inline tensorflow::bfloat16 ldg(
-    const tensorflow::bfloat16* address) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
-  tensorflow::bfloat16 return_value;
-  asm volatile("ld.global.nc.u16 %0, [%1];"
-               : "=h"(return_value.value)
-               : "l"(address));
-  return return_value;
-#else
-  return *address;
-#endif
-}
-
-template <>
-__device__ __host__ inline bool ldg(const bool* address) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
-  return *reinterpret_cast<const bool*>(
-      __ldg(reinterpret_cast<const char*>(address)));
-#else
-  return *address;
-#endif
-}
-
-// CUDA provides atomic ops, but not for all types.  We provide wrappers
-// for some ops and provide implementation for all reasonable types.
-#define CUDA_ATOMIC_WRAPPER(op, T) \
-  __device__ __forceinline__ T CudaAtomic##op(T* address, T val)
-
-#define USE_CUDA_ATOMIC(op, T) \
-  CUDA_ATOMIC_WRAPPER(op, T) { return atomic##op(address, val); }
-
-// For atomicAdd.
-USE_CUDA_ATOMIC(Add, int32);
-USE_CUDA_ATOMIC(Add, uint32);
-USE_CUDA_ATOMIC(Add, uint64);
-USE_CUDA_ATOMIC(Add, float);
-
-// For atomicMax.
-USE_CUDA_ATOMIC(Max, int32);
-USE_CUDA_ATOMIC(Max, uint32);
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
-USE_CUDA_ATOMIC(Max, uint64);
-#else
-// The uint64 overload of atomicMax() is only available for __CUDA_ARCH__ >=
-// 350.  If not satisfied, we provide a custom implementation using atomicCAS().
-CUDA_ATOMIC_WRAPPER(Max, uint64) {
-  uint64* address_as_ull = reinterpret_cast<uint64*>(address);
-  uint64 old = *address_as_ull, assumed;
-
-  do {
-    assumed = old;
-    old = atomicCAS(address_as_ull, assumed, max(val, assumed));
-  } while (assumed != old);
-
-  return old;
-}
-#endif
-
-// Custom implementation of atomicAdd for double.
-// This implementation is copied from CUDA manual.
-CUDA_ATOMIC_WRAPPER(Add, double) {
-  uint64* address_as_ull = reinterpret_cast<uint64*>(address);
-  uint64 old = *address_as_ull, assumed;
-
-  do {
-    assumed = old;
-    old = atomicCAS(address_as_ull, assumed,
-                    __double_as_longlong(val + __longlong_as_double(assumed)));
-
-    // Note: uses integer comparison to avoid hang in case of NaN
-  } while (assumed != old);
-
-  return __longlong_as_double(old);
-}
-
-// Custom implementation of atomicAdd for std::complex<float>.
-// This implementation performs to atomic additions on the components.
-CUDA_ATOMIC_WRAPPER(Add, std::complex<float>) {
-#if defined(__CUDA_ARCH__)
-#if __CUDA_ARCH__ >= 350
-  float2* addr_as_float2 = reinterpret_cast<float2*>(address);
-  float2* val_as_float2 = reinterpret_cast<float2*>(&val);
-  CudaAtomicAdd(&(addr_as_float2->x), val_as_float2->x);
-  CudaAtomicAdd(&(addr_as_float2->y), val_as_float2->y);
-#else
-  static_assert(sizeof(std::complex<float>) == 2 * sizeof(float),
-                "Unable to compile CudaAtomicAdd for complex64 because "
-                "sizeof(complex64) != 2*sizeof(float32)");
-  float* addr_as_float = reinterpret_cast<float*>(address);
-  float* val_as_float = reinterpret_cast<float*>(&val);
-  CudaAtomicAdd(addr_as_float, *val_as_float);
-  CudaAtomicAdd(addr_as_float + 1, *(val_as_float + 1));
-#endif
-#endif
-  return *address;
-}
-
-// Custom implementation of atomicAdd for std::complex<double>.
-// This implementation performs to atomic additions on the components
-// using the double atomic wrapper above.
-CUDA_ATOMIC_WRAPPER(Add, complex128) {
-#if defined(__CUDA_ARCH__)
-#if __CUDA_ARCH__ >= 350
-  double2* addr_as_double2 = reinterpret_cast<double2*>(address);
-  double2* val_as_double2 = reinterpret_cast<double2*>(&val);
-  CudaAtomicAdd(&(addr_as_double2->x), val_as_double2->x);
-  CudaAtomicAdd(&(addr_as_double2->y), val_as_double2->y);
-#else
-  static_assert(sizeof(std::complex<double>) == 2 * sizeof(double),
-                "Unable to compile CudaAtomicAdd for complex128 because "
-                "sizeof(complex128) != 2*sizeof(float64)");
-  double* addr_as_double = reinterpret_cast<double*>(address);
-  double* val_as_double = reinterpret_cast<double*>(&val);
-  CudaAtomicAdd(addr_as_double, *val_as_double);
-  CudaAtomicAdd(addr_as_double + 1, *(val_as_double + 1));
-#endif
-#endif
-  return *address;
-}
-
-// Helper functions for CudaAtomicAdd(half*, half), below.
-//
-// Note that if __CUDA_ARCH__ >= 530, we could probably use __hadd2()
-// for a more efficient implementation, assuming that adding -0.0
-// will never harm the neighboring value. In this version, we take special
-// care to guarantee the bits of the untouched value are unchanged.
-inline __device__ uint32 add_to_low_half(uint32 val, float x) {
-  Eigen::half low_half;
-  low_half.x = static_cast<uint16>(val & 0xffffu);
-  low_half = static_cast<Eigen::half>(static_cast<float>(low_half) + x);
-  return (val & 0xffff0000u) | low_half.x;
-}
-
-inline __device__ uint32 add_to_high_half(uint32 val, float x) {
-  Eigen::half high_half;
-  high_half.x = static_cast<uint16>(val >> 16);
-  high_half = static_cast<Eigen::half>(static_cast<float>(high_half) + x);
-  return (val & 0xffffu) | (high_half.x << 16);
-}
-
-// Custom implementation of atomicAdd for half. Note that we don't have
-// atomicCAS() for anything less than 32 bits, so we need to include the
-// other 16 bits in the operation.
-//
-// Unlike the other atomic adds, this version is going to be very slow
-// under high concurrency, since most threads will be spinning on failing
-// their compare-and-swap tests. (The fact that we get false sharing on the
-// neighboring fp16 makes this even worse.) If you are doing a large reduction,
-// you are much better off with doing the intermediate steps in fp32 and then
-// switching to fp16 as late as you can in the calculations.
-//
-// Note: Assumes little endian.
-CUDA_ATOMIC_WRAPPER(Add, Eigen::half) {
-  float val_as_float(val);
-  intptr_t address_int = reinterpret_cast<intptr_t>(address);
-  if ((address_int & 0x2) == 0) {
-    // The half is in the first part of the uint32 (lower 16 bits).
-    uint32* address_as_uint32 = reinterpret_cast<uint32*>(address);
-    assert(((intptr_t)address_as_uint32 & 0x3) == 0);
-    uint32 old = *address_as_uint32, assumed;
-
-    do {
-      assumed = old;
-      old = atomicCAS(address_as_uint32, assumed,
-                      add_to_low_half(assumed, val_as_float));
-
-      // Note: uses integer comparison to avoid hang in case of NaN
-    } while (assumed != old);
-
-    Eigen::half ret;
-    ret.x = old & 0xffffu;
-    return ret;
-  } else {
-    // The half is in the second part of the uint32 (upper 16 bits).
-    uint32* address_as_uint32 = reinterpret_cast<uint32*>(address_int - 2);
-    assert(((intptr_t)address_as_uint32 & 0x3) == 0);
-    uint32 old = *address_as_uint32, assumed;
-
-    do {
-      assumed = old;
-      old = atomicCAS(address_as_uint32, assumed,
-                      add_to_high_half(assumed, val_as_float));
-
-      // Note: uses integer comparison to avoid hang in case of NaN
-    } while (assumed != old);
-
-    Eigen::half ret;
-    ret.x = old >> 16;
-    return ret;
-  }
-}
-
-template <typename T>
-__global__ void SetZero(const int nthreads, T* bottom_diff) {
-  CUDA_1D_KERNEL_LOOP(index, nthreads) { *(bottom_diff + index) = T(0); }
-}
-
-// For atomicSub.
-
-// Custom implementation for sub by just negating the value.
-#define WRAPPED_ATOMIC_SUB(T) \
-  CUDA_ATOMIC_WRAPPER(Sub, T) { return CudaAtomicAdd(address, -val); }
-
-WRAPPED_ATOMIC_SUB(uint64);
-WRAPPED_ATOMIC_SUB(int32);
-WRAPPED_ATOMIC_SUB(uint32);
-WRAPPED_ATOMIC_SUB(Eigen::half);
-WRAPPED_ATOMIC_SUB(float);
-WRAPPED_ATOMIC_SUB(double);
-
-CUDA_ATOMIC_WRAPPER(Sub, complex64) {
-  const std::complex<float> Tneg(-val.real(), -val.imag());
-  return CudaAtomicAdd(address, Tneg);
-}
-
-CUDA_ATOMIC_WRAPPER(Sub, complex128) {
-  const std::complex<double> Tneg(-val.real(), -val.imag());
-  return CudaAtomicAdd(address, Tneg);
-}
-
-#undef WRAPPED_ATOMIC_SUB
-
-// For atomicMul.
-CUDA_ATOMIC_WRAPPER(Mul, int32) {
-  int32 old = *address, assumed;
-  do {
-    assumed = old;
-    old = atomicCAS(address, assumed, val * assumed);
-  } while (assumed != old);
-  return old;
-}
-
-CUDA_ATOMIC_WRAPPER(Mul, uint32) {
-  uint32 old = *address, assumed;
-  do {
-    assumed = old;
-    old = atomicCAS(address, assumed, val * assumed);
-  } while (assumed != old);
-  return old;
-}
-
-CUDA_ATOMIC_WRAPPER(Mul, uint64) {
-  uint64 old = *address, assumed;
-  do {
-    assumed = old;
-    old = atomicCAS(address, assumed, val * assumed);
-  } while (assumed != old);
-  return old;
-}
-
-CUDA_ATOMIC_WRAPPER(Mul, float) {
-  int32* address_as_int = reinterpret_cast<int32*>(address);
-  int32 old = *address_as_int, assumed;
-  do {
-    assumed = old;
-    old = atomicCAS(address_as_int, assumed,
-                    __float_as_int(val * __int_as_float(assumed)));
-  } while (assumed != old);
-  return __int_as_float(old);
-}
-
-CUDA_ATOMIC_WRAPPER(Mul, double) {
-  uint64* address_as_ull = reinterpret_cast<uint64*>(address);
-  uint64 old = *address_as_ull, assumed;
-  do {
-    assumed = old;
-    old = atomicCAS(address_as_ull, assumed,
-                    __double_as_longlong(val * __longlong_as_double(assumed)));
-  } while (assumed != old);
-  return __longlong_as_double(old);
-}
-
-// For atomicDiv.
-CUDA_ATOMIC_WRAPPER(Div, int32) {
-  int32 old = *address, assumed;
-  do {
-    assumed = old;
-    old = atomicCAS(address, assumed, assumed / val);
-  } while (assumed != old);
-  return old;
-}
-
-CUDA_ATOMIC_WRAPPER(Div, uint32) {
-  uint32 old = *address, assumed;
-  do {
-    assumed = old;
-    old = atomicCAS(address, assumed, assumed / val);
-  } while (assumed != old);
-  return old;
-}
-
-CUDA_ATOMIC_WRAPPER(Div, uint64) {
-  uint64 old = *address, assumed;
-  do {
-    assumed = old;
-    old = atomicCAS(address, assumed, assumed / val);
-  } while (assumed != old);
-  return old;
-}
-
-CUDA_ATOMIC_WRAPPER(Div, float) {
-  int32* address_as_int = reinterpret_cast<int32*>(address);
-  int32 old = *address_as_int, assumed;
-  do {
-    assumed = old;
-    old = atomicCAS(address_as_int, assumed,
-                    __float_as_int(__int_as_float(assumed) / val));
-  } while (assumed != old);
-  return __int_as_float(old);
-}
-
-CUDA_ATOMIC_WRAPPER(Div, double) {
-  uint64* address_as_ull = reinterpret_cast<uint64*>(address);
-  uint64 old = *address_as_ull, assumed;
-  do {
-    assumed = old;
-    old = atomicCAS(address_as_ull, assumed,
-                    __double_as_longlong(__longlong_as_double(assumed) / val));
-  } while (assumed != old);
-  return __longlong_as_double(old);
-}
-
-#undef USE_CUDA_ATOMIC
-#undef CUDA_ATOMIC_WRAPPER
-
-template <typename T>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T tf_min(const T& x, const T& y) {
-  return x > y ? y : x;
-}
-
-template <typename T>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T tf_max(const T& x, const T& y) {
-  return x < y ? y : x;
-}
-
-__device__ EIGEN_ALWAYS_INLINE unsigned CudaBallot(unsigned mask,
-                                                   int predicate) {
-  return __ballot_sync(mask, predicate);
-}
-
-template <typename T>
-__device__ EIGEN_ALWAYS_INLINE T CudaShuffle(unsigned mask, T value,
-                                             int srcLane,
-                                             int width = warpSize) {
-  return __shfl_sync(mask, value, srcLane, width);
-}
-
-// Variant of the (undocumented) version from the CUDA SDK, but using unsigned
-// instead of float for lo and hi (which is incorrect with ftz, for example).
-// A bug has been filed with NVIDIA and will be fixed in the next CUDA release.
-// TODO(csigg): remove when the bug is fixed in the next CUDA release.
-__device__ EIGEN_ALWAYS_INLINE double CudaShuffle(unsigned mask, double value,
-                                                  int srcLane,
-                                                  int width = warpSize) {
-  unsigned lo, hi;
-  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(value));
-  hi = __shfl_sync(mask, hi, srcLane, width);
-  lo = __shfl_sync(mask, lo, srcLane, width);
-  asm volatile("mov.b64 %0, {%1,%2};" : "=d"(value) : "r"(lo), "r"(hi));
-  return value;
-}
-
-template <typename T>
-__device__ EIGEN_ALWAYS_INLINE T CudaShuffleUp(unsigned mask, T value,
-                                               int delta,
-                                               int width = warpSize) {
-  return __shfl_up_sync(mask, value, delta, width);
-}
-
-// Variant of the (undocumented) version from the CUDA SDK, but using unsigned
-// instead of float for lo and hi (which is incorrect with ftz, for example).
-// A bug has been filed with NVIDIA and will be fixed in the next CUDA release.
-// TODO(csigg): remove when the bug is fixed in the next CUDA release.
-__device__ EIGEN_ALWAYS_INLINE double CudaShuffleUp(unsigned mask, double value,
-                                                    int delta,
-                                                    int width = warpSize) {
-  unsigned lo, hi;
-  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(value));
-  hi = __shfl_up_sync(mask, hi, delta, width);
-  lo = __shfl_up_sync(mask, lo, delta, width);
-  asm volatile("mov.b64 %0, {%1,%2};" : "=d"(value) : "r"(lo), "r"(hi));
-  return value;
-}
-
-template <typename T>
-__device__ EIGEN_ALWAYS_INLINE T CudaShuffleDown(unsigned mask, T value,
-                                                 int delta,
-                                                 int width = warpSize) {
-  return __shfl_down_sync(mask, value, delta, width);
-}
-
-__device__ EIGEN_ALWAYS_INLINE Eigen::half CudaShuffleDown(
-    unsigned mask, Eigen::half value, int delta, int width = warpSize) {
-  return Eigen::half(
-      __shfl_down_sync(mask, static_cast<uint16>(value), delta, width));
-}
-
-// Variant of the (undocumented) version from the CUDA SDK, but using unsigned
-// instead of float for lo and hi (which is incorrect with ftz, for example).
-// A bug has been filed with NVIDIA and will be fixed in the next CUDA release.
-// TODO(csigg): remove when the bug is fixed in the next CUDA release.
-__device__ EIGEN_ALWAYS_INLINE double CudaShuffleDown(unsigned mask,
-                                                      double value, int delta,
-                                                      int width = warpSize) {
-  unsigned lo, hi;
-  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(value));
-  hi = __shfl_down_sync(mask, hi, delta, width);
-  lo = __shfl_down_sync(mask, lo, delta, width);
-  asm volatile("mov.b64 %0, {%1,%2};" : "=d"(value) : "r"(lo), "r"(hi));
-  return value;
-}
-
-template <typename T>
-__device__ EIGEN_ALWAYS_INLINE T CudaShuffleXor(unsigned mask, T value,
-                                                int laneMask,
-                                                int width = warpSize) {
-  return __shfl_xor_sync(mask, value, laneMask, width);
-}
-
-__device__ EIGEN_ALWAYS_INLINE Eigen::half CudaShuffleXor(
-    unsigned mask, Eigen::half value, int laneMask, int width = warpSize) {
-  return Eigen::half(
-      __shfl_xor_sync(mask, static_cast<uint16>(value), laneMask, width));
-}
-
-// Variant of the (undocumented) version from the CUDA SDK, but using unsigned
-// instead of float for lo and hi (which is incorrect with ftz, for example).
-// A bug has been filed with NVIDIA and will be fixed in the next CUDA release.
-// TODO(csigg): remove when the bug is fixed in the next CUDA release.
-__device__ EIGEN_ALWAYS_INLINE double CudaShuffleXor(unsigned mask,
-                                                     double value, int laneMask,
-                                                     int width = warpSize) {
-  unsigned lo, hi;
-  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(value));
-  hi = __shfl_xor_sync(mask, hi, laneMask, width);
-  lo = __shfl_xor_sync(mask, lo, laneMask, width);
-  asm volatile("mov.b64 %0, {%1,%2};" : "=d"(value) : "r"(lo), "r"(hi));
-  return value;
-}
-
 }  // namespace tensorflow
 
-#undef DIV_UP
-
 #endif  // GOOGLE_CUDA
-
 #endif  // TENSORFLOW_CORE_UTIL_CUDA_KERNEL_HELPER_H_
diff --git a/tensorflow/core/util/cuda_kernel_helper_test.cu.cc b/tensorflow/core/util/cuda_kernel_helper_test.cu.cc
index 6991554effd9088c04bfcb71f274b82408507463..732ed33ede17bc90d3301d3f1eee6302a96028d7 100644
--- a/tensorflow/core/util/cuda_kernel_helper_test.cu.cc
+++ b/tensorflow/core/util/cuda_kernel_helper_test.cu.cc
@@ -52,11 +52,11 @@ __global__ void Count1D(CudaLaunchConfig config, int bufsize, int* outbuf) {
   }
 }
 __global__ void Count2D(Cuda2DLaunchConfig config, int bufsize, int* outbuf) {
-  CUDA_AXIS_KERNEL_LOOP(x, config.virtual_thread_count, x) {
+  CUDA_AXIS_KERNEL_LOOP(x, config.virtual_thread_count.x, X) {
     if (x < 0) {  // x might overflow when testing extreme case
       break;
     }
-    CUDA_AXIS_KERNEL_LOOP(y, config.virtual_thread_count, y) {
+    CUDA_AXIS_KERNEL_LOOP(y, config.virtual_thread_count.y, Y) {
       if (y < 0) {  // y might overflow when testing extreme case
         break;
       }
@@ -66,15 +66,15 @@ __global__ void Count2D(Cuda2DLaunchConfig config, int bufsize, int* outbuf) {
   }
 }
 __global__ void Count3D(Cuda3DLaunchConfig config, int bufsize, int* outbuf) {
-  CUDA_AXIS_KERNEL_LOOP(x, config.virtual_thread_count, x) {
+  CUDA_AXIS_KERNEL_LOOP(x, config.virtual_thread_count.x, X) {
     if (x < 0) {  // x might overflow when testing extreme case
       break;
     }
-    CUDA_AXIS_KERNEL_LOOP(y, config.virtual_thread_count, y) {
+    CUDA_AXIS_KERNEL_LOOP(y, config.virtual_thread_count.y, Y) {
       if (y < 0) {  // y might overflow when testing extreme case
         break;
       }
-      CUDA_AXIS_KERNEL_LOOP(z, config.virtual_thread_count, z) {
+      CUDA_AXIS_KERNEL_LOOP(z, config.virtual_thread_count.z, Z) {
         if (z < 0) {  // z might overflow when testing extreme case
           break;
         }
@@ -87,6 +87,44 @@ __global__ void Count3D(Cuda3DLaunchConfig config, int bufsize, int* outbuf) {
   }
 }
 
+__global__ void CudaShuffleGetSrcLaneTest(unsigned* failure_count) {
+  unsigned lane_id = CudaLaneId();
+  for (int width = warpSize; width > 1; width /= 2) {
+    auto check_result = [&](const char* op_name, int param, unsigned actual,
+                            unsigned expected) {
+      if (actual != expected) {
+        printf("Cuda%sGetSrcLane(%d, %d) for lane %d returned %d, not %d\n",
+               op_name, param, width, lane_id, actual, expected);
+        CudaAtomicAdd(failure_count, 1);
+      }
+    };
+    for (int src_lane = -warpSize; src_lane <= warpSize; ++src_lane) {
+      unsigned actual_lane = detail::CudaShuffleGetSrcLane(src_lane, width);
+      unsigned expect_lane =
+          CudaShuffleSync(kCudaWarpAll, lane_id, src_lane, width);
+      check_result("Shuffle", src_lane, actual_lane, expect_lane);
+    }
+    for (unsigned delta = 0; delta <= warpSize; ++delta) {
+      unsigned actual_lane = detail::CudaShuffleUpGetSrcLane(delta, width);
+      unsigned expect_lane =
+          CudaShuffleUpSync(kCudaWarpAll, lane_id, delta, width);
+      check_result("ShuffleUp", delta, actual_lane, expect_lane);
+    }
+    for (unsigned delta = 0; delta <= warpSize; ++delta) {
+      unsigned actual_lane = detail::CudaShuffleDownGetSrcLane(delta, width);
+      unsigned expect_lane =
+          CudaShuffleDownSync(kCudaWarpAll, lane_id, delta, width);
+      check_result("ShuffleDown", delta, actual_lane, expect_lane);
+    }
+    for (int lane_lane = warpSize; lane_lane > 0; lane_lane /= 2) {
+      unsigned actual_lane = detail::CudaShuffleXorGetSrcLane(lane_lane, width);
+      unsigned expect_lane =
+          CudaShuffleXorSync(kCudaWarpAll, lane_id, lane_lane, width);
+      check_result("ShuffleXor", lane_lane, actual_lane, expect_lane);
+    }
+  }
+}
+
 }  // namespace
 
 class CudaLaunchConfigTest : public ::testing::Test {
@@ -94,7 +132,7 @@ class CudaLaunchConfigTest : public ::testing::Test {
   const int bufsize = 1024;
   int* outbuf = nullptr;
   Eigen::CudaStreamDevice stream;
-  GPUDevice d = GPUDevice(&stream);
+  Eigen::GpuDevice d = Eigen::GpuDevice(&stream);
 
   virtual void SetUp() {
     cudaError_t err = cudaMallocManaged(&outbuf, sizeof(int) * bufsize);
@@ -111,27 +149,27 @@ class CudaLaunchConfigTest : public ::testing::Test {
 TEST_F(CudaLaunchConfigTest, GetCudaLaunchConfig) {
   CudaLaunchConfig cfg;
 
-  // test valid inputs
-  #define TEST_LAUNCH_PARAMETER(work_element_count)                             \
-    cfg = GetCudaLaunchConfig(bufsize, d);                                      \
-    SetOutbufZero<<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>>     \
-                                                                (cfg, outbuf);  \
-    CUDA_ASSERT_SUCCESS                                                         \
-    cfg = GetCudaLaunchConfig(work_element_count, d);                           \
-    Count1D<<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>> (         \
-        cfg, bufsize, outbuf);                                                  \
-    CUDA_EXPECT_SUCCESS                                                         \
-    EXPECT_EQ(work_element_count, std::accumulate(outbuf, outbuf + bufsize, 0));\
-                                                                                \
-    cfg = GetCudaLaunchConfig(bufsize, d, SetOutbufZero, 0, 0);                 \
-    SetOutbufZero<<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>>     \
-                                                                (cfg, outbuf);  \
-    CUDA_ASSERT_SUCCESS                                                         \
-    cfg = GetCudaLaunchConfig(work_element_count, d, Count1D, 0, 0);            \
-    Count1D<<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>> (         \
-        cfg, bufsize, outbuf);                                                  \
-    CUDA_EXPECT_SUCCESS                                                         \
-    EXPECT_EQ(work_element_count, std::accumulate(outbuf, outbuf + bufsize, 0))
+// test valid inputs
+#define TEST_LAUNCH_PARAMETER(work_element_count)                              \
+  cfg = GetCudaLaunchConfig(bufsize, d);                                       \
+  SetOutbufZero<<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>>(     \
+      cfg, outbuf);                                                            \
+  CUDA_ASSERT_SUCCESS                                                          \
+  cfg = GetCudaLaunchConfig(work_element_count, d);                            \
+  Count1D<<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>>(           \
+      cfg, bufsize, outbuf);                                                   \
+  CUDA_EXPECT_SUCCESS                                                          \
+  EXPECT_EQ(work_element_count, std::accumulate(outbuf, outbuf + bufsize, 0)); \
+                                                                               \
+  cfg = GetCudaLaunchConfig(bufsize, d, SetOutbufZero, 0, 0);                  \
+  SetOutbufZero<<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>>(     \
+      cfg, outbuf);                                                            \
+  CUDA_ASSERT_SUCCESS                                                          \
+  cfg = GetCudaLaunchConfig(work_element_count, d, Count1D, 0, 0);             \
+  Count1D<<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>>(           \
+      cfg, bufsize, outbuf);                                                   \
+  CUDA_EXPECT_SUCCESS                                                          \
+  EXPECT_EQ(work_element_count, std::accumulate(outbuf, outbuf + bufsize, 0))
 
   TEST_LAUNCH_PARAMETER(128);
   TEST_LAUNCH_PARAMETER(129);
@@ -143,7 +181,7 @@ TEST_F(CudaLaunchConfigTest, GetCudaLaunchConfig) {
   TEST_LAUNCH_PARAMETER(8192);
   TEST_LAUNCH_PARAMETER(123456);
   TEST_LAUNCH_PARAMETER(1 << 30);
-  #undef TEST_LAUNCH_PARAMETER
+#undef TEST_LAUNCH_PARAMETER
 }
 
 bool operator==(const Cuda2DLaunchConfig& a, const Cuda2DLaunchConfig& b) {
@@ -162,27 +200,27 @@ TEST_F(CudaLaunchConfigTest, GetCuda2DLaunchConfig) {
   Cuda2DLaunchConfig cfg;
   CudaLaunchConfig cfg1d;
 
-  // test valid inputs
-  #define TEST_LAUNCH_PARAMETER(dimx, dimy)                                     \
-    cfg1d = GetCudaLaunchConfig(bufsize, d);                                    \
-    SetOutbufZero<<<cfg1d.block_count, cfg1d.thread_per_block, 0, d.stream()>>> \
-                                                                (cfg1d, outbuf);\
-    CUDA_ASSERT_SUCCESS                                                         \
-    cfg = GetCuda2DLaunchConfig(dimx, dimy, d);                                 \
-    Count2D<<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>> (         \
-        cfg, bufsize, outbuf);                                                  \
-    CUDA_EXPECT_SUCCESS                                                         \
-    EXPECT_EQ(dimx * dimy, std::accumulate(outbuf, outbuf + bufsize, 0));       \
-                                                                                \
-    cfg1d = GetCudaLaunchConfig(bufsize, d, SetOutbufZero, 0, 0);               \
-    SetOutbufZero<<<cfg1d.block_count, cfg1d.thread_per_block, 0, d.stream()>>> \
-                                                                (cfg1d, outbuf);\
-    CUDA_ASSERT_SUCCESS                                                         \
-    cfg = GetCuda2DLaunchConfig(dimx, dimy, d, Count2D, 0, 0);                  \
-    Count2D<<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>> (         \
-        cfg, bufsize, outbuf);                                                  \
-    CUDA_EXPECT_SUCCESS                                                         \
-    EXPECT_EQ(dimx * dimy, std::accumulate(outbuf, outbuf + bufsize, 0))
+// test valid inputs
+#define TEST_LAUNCH_PARAMETER(dimx, dimy)                                      \
+  cfg1d = GetCudaLaunchConfig(bufsize, d);                                     \
+  SetOutbufZero<<<cfg1d.block_count, cfg1d.thread_per_block, 0, d.stream()>>>( \
+      cfg1d, outbuf);                                                          \
+  CUDA_ASSERT_SUCCESS                                                          \
+  cfg = GetCuda2DLaunchConfig(dimx, dimy, d);                                  \
+  Count2D<<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>>(           \
+      cfg, bufsize, outbuf);                                                   \
+  CUDA_EXPECT_SUCCESS                                                          \
+  EXPECT_EQ(dimx* dimy, std::accumulate(outbuf, outbuf + bufsize, 0));         \
+                                                                               \
+  cfg1d = GetCudaLaunchConfig(bufsize, d, SetOutbufZero, 0, 0);                \
+  SetOutbufZero<<<cfg1d.block_count, cfg1d.thread_per_block, 0, d.stream()>>>( \
+      cfg1d, outbuf);                                                          \
+  CUDA_ASSERT_SUCCESS                                                          \
+  cfg = GetCuda2DLaunchConfig(dimx, dimy, d, Count2D, 0, 0);                   \
+  Count2D<<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>>(           \
+      cfg, bufsize, outbuf);                                                   \
+  CUDA_EXPECT_SUCCESS                                                          \
+  EXPECT_EQ(dimx* dimy, std::accumulate(outbuf, outbuf + bufsize, 0))
 
   TEST_LAUNCH_PARAMETER(128, 128);
   TEST_LAUNCH_PARAMETER(129, 64);
@@ -195,24 +233,24 @@ TEST_F(CudaLaunchConfigTest, GetCuda2DLaunchConfig) {
   TEST_LAUNCH_PARAMETER(123456, 12);
   TEST_LAUNCH_PARAMETER(1, 1 << 30);
   TEST_LAUNCH_PARAMETER(1 << 30, 1);
-  #undef TEST_LAUNCH_PARAMETER
+#undef TEST_LAUNCH_PARAMETER
 }
 
 TEST_F(CudaLaunchConfigTest, GetCuda3DLaunchConfig) {
   Cuda3DLaunchConfig cfg;
   CudaLaunchConfig cfg1d;
 
-  // test valid inputs
-  #define TEST_LAUNCH_PARAMETER(dimx, dimy, dimz)                               \
-    cfg1d = GetCudaLaunchConfig(bufsize, d, SetOutbufZero, 0, 0);               \
-    SetOutbufZero<<<cfg1d.block_count, cfg1d.thread_per_block, 0, d.stream()>>> \
-                                                                (cfg1d, outbuf);\
-    CUDA_ASSERT_SUCCESS                                                         \
-    cfg = GetCuda3DLaunchConfig(dimx, dimy, dimz, d, Count3D, 0, 0);            \
-    Count3D<<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>> (         \
-        cfg, bufsize, outbuf);                                                  \
-    CUDA_EXPECT_SUCCESS                                                         \
-    EXPECT_EQ(dimx * dimy * dimz, std::accumulate(outbuf, outbuf + bufsize, 0))
+// test valid inputs
+#define TEST_LAUNCH_PARAMETER(dimx, dimy, dimz)                                \
+  cfg1d = GetCudaLaunchConfig(bufsize, d, SetOutbufZero, 0, 0);                \
+  SetOutbufZero<<<cfg1d.block_count, cfg1d.thread_per_block, 0, d.stream()>>>( \
+      cfg1d, outbuf);                                                          \
+  CUDA_ASSERT_SUCCESS                                                          \
+  cfg = GetCuda3DLaunchConfig(dimx, dimy, dimz, d, Count3D, 0, 0);             \
+  Count3D<<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>>(           \
+      cfg, bufsize, outbuf);                                                   \
+  CUDA_EXPECT_SUCCESS                                                          \
+  EXPECT_EQ(dimx* dimy* dimz, std::accumulate(outbuf, outbuf + bufsize, 0))
 
   TEST_LAUNCH_PARAMETER(128, 128, 128);
   TEST_LAUNCH_PARAMETER(129, 64, 1024);
@@ -226,7 +264,17 @@ TEST_F(CudaLaunchConfigTest, GetCuda3DLaunchConfig) {
   TEST_LAUNCH_PARAMETER(1, 1, 1 << 30);
   TEST_LAUNCH_PARAMETER(1, 1 << 30, 1);
   TEST_LAUNCH_PARAMETER(1 << 30, 1, 1);
-  #undef TEST_LAUNCH_PARAMETER
+#undef TEST_LAUNCH_PARAMETER
+}
+
+TEST(CudaDeviceFunctionsTest, ShuffleGetSrcLane) {
+  unsigned* failure_count;
+  ASSERT_EQ(cudaMallocManaged(&failure_count, sizeof(unsigned)), cudaSuccess);
+  *failure_count = 0;
+  CudaShuffleGetSrcLaneTest<<<1, 32>>>(failure_count);
+  ASSERT_EQ(cudaDeviceSynchronize(), cudaSuccess);
+  ASSERT_EQ(*failure_count, 0);
+  cudaFree(failure_count);
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/util/cuda_launch_config.h b/tensorflow/core/util/cuda_launch_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..3ea33ee6cf2195cc0192c59d694672f0d4c69a56
--- /dev/null
+++ b/tensorflow/core/util/cuda_launch_config.h
@@ -0,0 +1,284 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_UTIL_CUDA_LAUNCH_CONFIG_H_
+#define TENSORFLOW_CORE_UTIL_CUDA_LAUNCH_CONFIG_H_
+
+#if GOOGLE_CUDA
+
+#include <algorithm>
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "cuda/include/cuda.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/stream_executor.h"
+#include "tensorflow/core/platform/types.h"
+
+// Usage of GetCudaLaunchConfig, GetCuda2DLaunchConfig, and
+// GetCuda3DLaunchConfig:
+//
+// There are two versions of GetCudaLaunchConfig and GetCuda2DLaunchConfig, one
+// version uses heuristics without any knowledge of the device kernel, the other
+// version uses cudaOccupancyMaxPotentialBlockSize to determine the theoretical
+// launch parameters that maximize occupancy. Currently, only the maximum
+// occupancy version of GetCuda3DLaunchConfig is available.
+//
+// For large number of work elements, the convention is that each kernel would
+// iterate through its assigned range. The return value of GetCudaLaunchConfig
+// is struct CudaLaunchConfig, which contains all the information needed for the
+// kernel launch, including: virtual number of threads, the number of threads
+// per block and number of threads per block used inside <<< >>> of a kernel
+// launch. GetCuda2DLaunchConfig and GetCuda3DLaunchConfig does the same thing
+// as CudaLaunchConfig. The only difference is the dimension. The macros
+// CUDA_1D_KERNEL_LOOP and CUDA_AXIS_KERNEL_LOOP might be used to do inner loop.
+//
+/* Sample code:
+
+__global__ void MyKernel1D(CudaLaunchConfig config, other_args...) {
+  CUDA_1D_KERNEL_LOOP(x, config.virtual_thread_count) {
+    do_your_job_here;
+  }
+}
+
+__global__ void MyKernel2D(Cuda2DLaunchConfig config, other_args...) {
+  CUDA_AXIS_KERNEL_LOOP(x, config.virtual_thread_count, x) {
+    CUDA_AXIS_KERNEL_LOOP(y, config.virtual_thread_count, y) {
+      do_your_job_here;
+    }
+  }
+}
+
+__global__ void MyKernel3D(Cuda3DLaunchConfig config, other_args...) {
+  CUDA_AXIS_KERNEL_LOOP(x, config.virtual_thread_count, x) {
+    CUDA_AXIS_KERNEL_LOOP(y, config.virtual_thread_count, y) {
+      CUDA_AXIS_KERNEL_LOOP(z, config.virtual_thread_count, z) {
+        do_your_job_here;
+      }
+    }
+  }
+}
+
+void MyDriverFunc(const Eigen::GpuDevice &d) {
+  // use heuristics
+  CudaLaunchConfig cfg1 = GetCudaLaunchConfig(10240, d);
+  MyKernel1D <<<config.block_count,
+                config.thread_per_block, 0, d.stream()>>> (cfg1, other_args...);
+  Cuda2DLaunchConfig cfg2 = GetCuda2DLaunchConfig(10240, 10240, d);
+  MyKernel2D <<<config.block_count,
+                config.thread_per_block, 0, d.stream()>>> (cfg2, other_args...);
+  Cuda3DLaunchConfig cfg3 = GetCuda3DLaunchConfig(4096, 4096, 100, d);
+  MyKernel3D <<<config.block_count,
+                config.thread_per_block, 0, d.stream()>>> (cfg3, other_args...);
+
+  // maximize occupancy
+  CudaLaunchConfig cfg4 = GetCudaLaunchConfig(10240, d, MyKernel1D, 0, 0 );
+  MyKernel1D <<<config.block_count,
+                config.thread_per_block, 0, d.stream()>>> (cfg4, other_args...);
+  Cuda2DLaunchConfig cfg5 = GetCuda2DLaunchConfig(10240, 10240, d,
+                                                  MyKernel1D, 0, 0);
+  MyKernel2D <<<config.block_count,
+                config.thread_per_block, 0, d.stream()>>> (cfg5, other_args...);
+  Cuda3DLaunchConfig cfg6 = GetCuda3DLaunchConfig(4096, 4096, 100, d,
+                                                  MyKernel1D, 0, 0);
+  MyKernel3D <<<config.block_count,
+                config.thread_per_block, 0, d.stream()>>> (cfg6, other_args...);
+}
+
+// See the test for this for more example:
+//
+https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/util/cuda_kernel_helper_test.cu.cc
+
+*/
+
+namespace tensorflow {
+
+inline int DivUp(int a, int b) { return (a + b - 1) / b; }
+
+struct CudaLaunchConfig {
+  // Logical number of thread that works on the elements. If each logical
+  // thread works on exactly a single element, this is the same as the working
+  // element count.
+  int virtual_thread_count = -1;
+  // Number of threads per block.
+  int thread_per_block = -1;
+  // Number of blocks for Cuda kernel launch.
+  int block_count = -1;
+};
+
+// Calculate the Cuda launch config we should use for a kernel launch.
+// This is assuming the kernel is quite simple and will largely be
+// memory-limited.
+// REQUIRES: work_element_count > 0.
+inline CudaLaunchConfig GetCudaLaunchConfig(int work_element_count,
+                                            const Eigen::GpuDevice& d) {
+  CHECK_GT(work_element_count, 0);
+  CudaLaunchConfig config;
+  const int virtual_thread_count = work_element_count;
+  const int physical_thread_count = std::min(
+      d.getNumCudaMultiProcessors() * d.maxCudaThreadsPerMultiProcessor(),
+      virtual_thread_count);
+  const int thread_per_block = std::min(1024, d.maxCudaThreadsPerBlock());
+  const int block_count =
+      std::min(DivUp(physical_thread_count, thread_per_block),
+               d.getNumCudaMultiProcessors());
+
+  config.virtual_thread_count = virtual_thread_count;
+  config.thread_per_block = thread_per_block;
+  config.block_count = block_count;
+  return config;
+}
+
+// Calculate the Cuda launch config we should use for a kernel launch. This
+// variant takes the resource limits of func into account to maximize occupancy.
+// REQUIRES: work_element_count > 0.
+template <typename DeviceFunc>
+inline CudaLaunchConfig GetCudaLaunchConfig(int work_element_count,
+                                            const Eigen::GpuDevice& d,
+                                            DeviceFunc func,
+                                            size_t dynamic_shared_memory_size,
+                                            int block_size_limit) {
+  CHECK_GT(work_element_count, 0);
+  CudaLaunchConfig config;
+  int block_count = 0;
+  int thread_per_block = 0;
+
+  cudaError_t err = cudaOccupancyMaxPotentialBlockSize(
+      &block_count, &thread_per_block, func, dynamic_shared_memory_size,
+      block_size_limit);
+  CHECK_EQ(err, cudaSuccess);
+
+  block_count =
+      std::min(block_count, DivUp(work_element_count, thread_per_block));
+
+  config.virtual_thread_count = work_element_count;
+  config.thread_per_block = thread_per_block;
+  config.block_count = block_count;
+  return config;
+}
+
+struct Cuda2DLaunchConfig {
+  dim3 virtual_thread_count = dim3(0, 0, 0);
+  dim3 thread_per_block = dim3(0, 0, 0);
+  dim3 block_count = dim3(0, 0, 0);
+};
+
+inline Cuda2DLaunchConfig GetCuda2DLaunchConfig(int xdim, int ydim,
+                                                const Eigen::GpuDevice& d) {
+  Cuda2DLaunchConfig config;
+
+  if (xdim <= 0 || ydim <= 0) {
+    return config;
+  }
+
+  const int kThreadsPerBlock = 256;
+  int block_cols = std::min(xdim, kThreadsPerBlock);
+  // ok to round down here and just do more loops in the kernel
+  int block_rows = std::max(kThreadsPerBlock / block_cols, 1);
+
+  const int physical_thread_count =
+      d.getNumCudaMultiProcessors() * d.maxCudaThreadsPerMultiProcessor();
+
+  const int max_blocks = std::max(physical_thread_count / kThreadsPerBlock, 1);
+
+  config.virtual_thread_count = dim3(xdim, ydim, 1);
+  config.thread_per_block = dim3(block_cols, block_rows, 1);
+
+  int grid_x = std::min(DivUp(xdim, block_cols), max_blocks);
+
+  config.block_count = dim3(
+      grid_x, std::min(max_blocks / grid_x, std::max(ydim / block_rows, 1)), 1);
+  return config;
+}
+
+// Calculate the Cuda 2D and 3D launch config we should use for a kernel launch.
+// This variant takes the resource limits of func into account to maximize
+// occupancy.
+using Cuda3DLaunchConfig = Cuda2DLaunchConfig;
+
+template <typename DeviceFunc>
+inline Cuda3DLaunchConfig GetCuda3DLaunchConfig(
+    int xdim, int ydim, int zdim, const Eigen::GpuDevice& d, DeviceFunc func,
+    size_t dynamic_shared_memory_size, int block_size_limit) {
+  Cuda3DLaunchConfig config;
+
+  if (xdim <= 0 || ydim <= 0 || zdim <= 0) {
+    return config;
+  }
+
+  int dev;
+  cudaGetDevice(&dev);
+  cudaDeviceProp deviceProp;
+  cudaGetDeviceProperties(&deviceProp, dev);
+  int xthreadlimit = deviceProp.maxThreadsDim[0];
+  int ythreadlimit = deviceProp.maxThreadsDim[1];
+  int zthreadlimit = deviceProp.maxThreadsDim[2];
+  int xgridlimit = deviceProp.maxGridSize[0];
+  int ygridlimit = deviceProp.maxGridSize[1];
+  int zgridlimit = deviceProp.maxGridSize[2];
+
+  int block_count = 0;
+  int thread_per_block = 0;
+  cudaError_t err = cudaOccupancyMaxPotentialBlockSize(
+      &block_count, &thread_per_block, func, dynamic_shared_memory_size,
+      block_size_limit);
+  CHECK_EQ(err, cudaSuccess);
+
+  auto min3 = [](int a, int b, int c) { return std::min(a, std::min(b, c)); };
+
+  int threadsx = min3(xdim, thread_per_block, xthreadlimit);
+  int threadsy =
+      min3(ydim, std::max(thread_per_block / threadsx, 1), ythreadlimit);
+  int threadsz =
+      min3(zdim, std::max(thread_per_block / (threadsx * threadsy), 1),
+           zthreadlimit);
+
+  int blocksx = min3(block_count, DivUp(xdim, threadsx), xgridlimit);
+  int blocksy =
+      min3(DivUp(block_count, blocksx), DivUp(ydim, threadsy), ygridlimit);
+  int blocksz = min3(DivUp(block_count, (blocksx * blocksy)),
+                     DivUp(zdim, threadsz), zgridlimit);
+
+  config.virtual_thread_count = dim3(xdim, ydim, zdim);
+  config.thread_per_block = dim3(threadsx, threadsy, threadsz);
+  config.block_count = dim3(blocksx, blocksy, blocksz);
+  return config;
+}
+
+template <typename DeviceFunc>
+inline Cuda2DLaunchConfig GetCuda2DLaunchConfig(
+    int xdim, int ydim, const Eigen::GpuDevice& d, DeviceFunc func,
+    size_t dynamic_shared_memory_size, int block_size_limit) {
+  return GetCuda3DLaunchConfig(xdim, ydim, 1, d, func,
+                               dynamic_shared_memory_size, block_size_limit);
+}
+
+// Returns a raw reference to the current cuda stream.  Required by a
+// number of kernel calls (for which StreamInterface* does not work), i.e.
+// CUB and certain cublas primitives.
+inline const cudaStream_t& GetCudaStream(OpKernelContext* context) {
+  const cudaStream_t* ptr = CHECK_NOTNULL(
+      reinterpret_cast<const cudaStream_t*>(context->op_device_context()
+                                                ->stream()
+                                                ->implementation()
+                                                ->CudaStreamMemberHack()));
+  return *ptr;
+}
+
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
+
+#endif  // TENSORFLOW_CORE_UTIL_CUDA_KERNEL_HELPER_H_
diff --git a/tensorflow/core/util/example_proto_fast_parsing_test.cc b/tensorflow/core/util/example_proto_fast_parsing_test.cc
index 9b6a8e12511448b72e17a0b20a4418c4a5cd2c7a..13e41c17f7c7df5ad581bd3f6a39051641139258 100644
--- a/tensorflow/core/util/example_proto_fast_parsing_test.cc
+++ b/tensorflow/core/util/example_proto_fast_parsing_test.cc
@@ -57,6 +57,7 @@ void TestCorrectness(const string& serialized) {
   Example example;
   Example fast_example;
   EXPECT_TRUE(example.ParseFromString(serialized));
+  example.DiscardUnknownFields();
   EXPECT_TRUE(TestFastParse(serialized, &fast_example));
   EXPECT_EQ(example.DebugString(), fast_example.DebugString());
   if (example.DebugString() != fast_example.DebugString()) {
diff --git a/tensorflow/core/util/example_proto_helper.cc b/tensorflow/core/util/example_proto_helper.cc
index 41f56d2daa48e651f5ac4051deae9c05ef1ed859..e156a3bc8f0f01acc543e9b385bd9782870be52a 100644
--- a/tensorflow/core/util/example_proto_helper.cc
+++ b/tensorflow/core/util/example_proto_helper.cc
@@ -247,8 +247,9 @@ Status SingleExampleProtoToTensors(
       bool types_match;
       TF_RETURN_IF_ERROR(CheckTypesMatch(f, dtype, &types_match));
       if (!types_match) {
-        return errors::InvalidArgument("Name: ", example_name, ", Feature: ",
-                                       key, ".  Data types don't match. ",
+        return errors::InvalidArgument("Name: ", example_name,
+                                       ", Feature: ", key,
+                                       ".  Data types don't match. ",
                                        "Expected type: ", DataTypeString(dtype),
                                        "  Feature is: ", ProtoDebugString(f));
       }
@@ -278,8 +279,9 @@ Status SingleExampleProtoToTensors(
       bool types_match;
       TF_RETURN_IF_ERROR(CheckTypesMatch(f, dtype, &types_match));
       if (!types_match) {
-        return errors::InvalidArgument("Name: ", example_name, ", Feature: ",
-                                       key, ".  Data types don't match. ",
+        return errors::InvalidArgument("Name: ", example_name,
+                                       ", Feature: ", key,
+                                       ".  Data types don't match. ",
                                        "Expected type: ", DataTypeString(dtype),
                                        "  Feature is: ", ProtoDebugString(f));
       }
diff --git a/tensorflow/core/util/memmapped_file_system_test.cc b/tensorflow/core/util/memmapped_file_system_test.cc
index 616eb5dac32188688ac01cf49ff583dc1623d5ad..504d2d353f8f76f77e4efd3e4a6a6edcaa200711 100644
--- a/tensorflow/core/util/memmapped_file_system_test.cc
+++ b/tensorflow/core/util/memmapped_file_system_test.cc
@@ -144,8 +144,8 @@ TEST(MemmappedFileSystemTest, ProxyToDefault) {
   TF_ASSERT_OK(memmapped_env.NewAppendableFile(filename, &writable_file_temp));
   // Making sure to clean up after the test finishes.
   const auto adh = [&memmapped_env, &filename](WritableFile* f) {
-      delete f;
-      TF_CHECK_OK(memmapped_env.DeleteFile(filename));
+    delete f;
+    TF_CHECK_OK(memmapped_env.DeleteFile(filename));
   };
   std::unique_ptr<WritableFile, decltype(adh)> writable_file(
       writable_file_temp.release(), adh);
diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h
index 2caf5fc56dafb5a8879db8026a78bc7bf46346a4..4467373c0060bb4dd25108891e2ff51d903a2453 100644
--- a/tensorflow/core/util/mkl_util.h
+++ b/tensorflow/core/util/mkl_util.h
@@ -35,7 +35,7 @@ limitations under the License.
 #include "tensorflow/core/util/padding.h"
 #include "tensorflow/core/util/tensor_format.h"
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 #include "mkldnn.hpp"
 
 using mkldnn::engine;
@@ -210,31 +210,32 @@ class MklShape {
     CHECK_EQ(dnnDelete_F32(convert), E_SUCCESS);
   }
 
-// The following methods are used for serializing and de-serializing the
-// contents of the mklshape object.
-// The data is serialized in this order
-// isMklTensor_
-// dimension_
-// sizes_
-// strides_
-// mklLayout_
-// tfLayout_
-// tf_to_mkl_dim_map_
+  // The following methods are used for serializing and de-serializing the
+  // contents of the mklshape object.
+  // The data is serialized in this order
+  // isMklTensor_
+  // dimension_
+  // sizes_
+  // strides_
+  // mklLayout_
+  // tfLayout_
+  // tf_to_mkl_dim_map_
 
 #define SIZE_OF_MKL_DNN_BUF \
   (dnnLayoutSerializationBufferSize_F32())  // Size of buffer needed to
                                             // serialize dnn_layout pointer
 
-// Size of buffer to hold the serialized object, the size is computed as follows
-// sizeof(isMklTensor_) + sizeof(dimension_) + sizeof(sizes_) + sizeof(strides_)
-// + sizeof(mklLayout_ buffer) + sizeof(tfLayout_ buffer)
-// + sizeof(tf_to_mkl_dim_map_)
+  // Size of buffer to hold the serialized object, the size is computed as
+  // follows sizeof(isMklTensor_) + sizeof(dimension_) + sizeof(sizes_) +
+  // sizeof(strides_)
+  // + sizeof(mklLayout_ buffer) + sizeof(tfLayout_ buffer)
+  // + sizeof(tf_to_mkl_dim_map_)
 
 #define SIZE_OF_MKL_SERIAL_DATA(dims) \
   (2 * sizeof(size_t) + 3 * dims * sizeof(size_t) + 2 * SIZE_OF_MKL_DNN_BUF)
 
-// First we need to define some macro for offsets into the serial buffer where
-// different elements of Mklshape is written/read from
+  // First we need to define some macro for offsets into the serial buffer where
+  // different elements of Mklshape is written/read from
 
 #define IS_MKL_TENSOR_OFFSET 0
 // Location from start of buffer where isMklTensor_ is serialized
@@ -324,7 +325,7 @@ class MklShape {
       nullptr;  // TF dimension corresponding to this MKL dimension
 };
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 
 // Forward decl
 TensorFormat MklDnnDataFormatToTFDataFormat(memory::format format);
@@ -388,7 +389,7 @@ class MklDnnShape {
 
   /// Equality function for MklDnnShape objects
   /// @return true if both are equal; false otherwise.
-  inline bool operator == (const MklDnnShape& input_shape) const {
+  inline bool operator==(const MklDnnShape& input_shape) const {
     if (this->IsMklTensor() != input_shape.IsMklTensor()) {
       return false;
     }
@@ -406,7 +407,7 @@ class MklDnnShape {
 
   /// Equality operator for MklDnnShape and TFShape.
   /// Returns: true if TF shapes for both are the same, false otherwise
-  inline bool operator == (const TensorShape& input_shape) const {
+  inline bool operator==(const TensorShape& input_shape) const {
     if (!this->IsMklTensor()) {
       return false;
     }
@@ -425,7 +426,7 @@ class MklDnnShape {
   inline size_t GetDimension(char dimension) const {
     int index = GetMklDnnTensorDimIndex(dimension);
     CHECK(index >= 0 && index < this->GetDimension())
-      << "Invalid index from the dimension: " << index << ", " << dimension;
+        << "Invalid index from the dimension: " << index << ", " << dimension;
     return this->DimSize(index);
   }
 
@@ -659,7 +660,7 @@ class MklDnnShape {
 
 typedef std::vector<MklShape> MklShapeList;
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 typedef std::vector<MklDnnShape> MklDnnShapeList;
 #endif
 
@@ -673,7 +674,7 @@ inline bool AreAllMklTensors(const MklShapeList& shapes) {
   return true;
 }
 
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
 template <typename T>
 inline Tensor ConvertMklToTF(OpKernelContext* context, const Tensor& mkl_tensor,
                              const MklShape& mkl_shape) {
@@ -705,8 +706,8 @@ inline Tensor ConvertMklToTF(OpKernelContext* context, const Tensor& mkl_tensor,
   Tensor output_tensor;
   TensorShape output_shape;
 
-  TF_CHECK_OK(Status(error::Code::UNIMPLEMENTED,
-                     "Unimplemented conversion function"));
+  TF_CHECK_OK(
+      Status(error::Code::UNIMPLEMENTED, "Unimplemented conversion function"));
 
   return output_tensor;
 }
@@ -724,7 +725,7 @@ inline void GetMklShape(OpKernelContext* ctext, int n, MklShape* mklshape) {
           sizeof(uint8));
 }
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 inline void GetMklShape(OpKernelContext* ctext, int n, MklDnnShape* mklshape) {
   mklshape->DeSerializeMklDnnShape(
       ctext->input(GetTensorMetaDataIndex(n, ctext->num_inputs()))
@@ -749,7 +750,7 @@ inline void GetMklInputList(OpKernelContext* ctext, StringPiece name,
 }
 
 
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
 
 inline void GetMklShapeList(OpKernelContext* ctext, StringPiece name,
                             MklShapeList* mkl_shapes) {
@@ -779,7 +780,7 @@ inline void GetMklShapeList(OpKernelContext* ctext, StringPiece name,
 
 #endif
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 /// Get shape of input tensor pointed by 'input_idx' in TensorShape format.
 /// If the input tensor is in MKL layout, then obtains TensorShape from
 /// MklShape.
@@ -814,7 +815,7 @@ inline void AllocateOutputSetMklShape(OpKernelContext* ctext, int n,
       second_tensor->flat<uint8>().size() * sizeof(uint8));
 }
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 // Allocate the second output tensor that will contain
 // the MKL shape serialized
 inline void AllocateOutputSetMklShape(OpKernelContext* ctext, int n,
@@ -851,7 +852,7 @@ inline void AllocateOutputSetMklShape(OpKernelContext* ctext, int n,
       second_tensor->flat<uint8>().size() * sizeof(uint8));
 }
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 // Allocate the output tensor, create a second output tensor that will contain
 // the MKL shape serialized
 inline void AllocateOutputSetMklShape(OpKernelContext* ctext, int n,
@@ -875,7 +876,7 @@ inline void AllocateOutputSetMklShape(OpKernelContext* ctext, int n,
 
 // Allocates a temp tensor and returns the data buffer for temporary storage.
 // Currently
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 template <typename T>
 inline void AllocTmpBuffer(OpKernelContext* context, Tensor* tensor_out,
                            const memory::primitive_desc& pd, void** buf_out) {
@@ -973,8 +974,8 @@ inline int64 GetMklTensorDim(const MklShape& mkl_shape, char dimension) {
   return mkl_shape.dim_size(index);
 }
 
-inline void CopyMklTensorInToOut(OpKernelContext* context,
-                                 int idx_in, int idx_out) {
+inline void CopyMklTensorInToOut(OpKernelContext* context, int idx_in,
+                                 int idx_out) {
   int num_inputs = context->num_inputs();
   int num_outputs = context->num_outputs();
   int idx_data_in = GetTensorDataIndex(idx_in, num_inputs);
@@ -994,9 +995,9 @@ inline void CopyMklTensorInToOut(OpKernelContext* context,
   context->set_output(idx_meta_out, meta_output);
 }
 
-#ifndef INTEL_MKL_DNN
-inline void CopyTfTensorInToOutWithShape(OpKernelContext* context,
-                                         int idx_in, int idx_out,
+#ifdef INTEL_MKL_ML
+inline void CopyTfTensorInToOutWithShape(OpKernelContext* context, int idx_in,
+                                         int idx_out,
                                          const TensorShape& shape) {
   int num_inputs = context->num_inputs();
   int num_outputs = context->num_outputs();
@@ -1013,8 +1014,8 @@ inline void CopyTfTensorInToOutWithShape(OpKernelContext* context,
   context->set_output(idx_data_out, output);
 }
 #else
-inline void CopyTfTensorInToOutWithShape(OpKernelContext* context,
-                                         int idx_in, int idx_out,
+inline void CopyTfTensorInToOutWithShape(OpKernelContext* context, int idx_in,
+                                         int idx_out,
                                          const TensorShape& shape) {
   int num_inputs = context->num_inputs();
   int num_outputs = context->num_outputs();
@@ -1032,10 +1033,10 @@ inline void CopyTfTensorInToOutWithShape(OpKernelContext* context,
 }
 #endif
 
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
 
-inline void ForwardTfTensorInToOut(OpKernelContext* context,
-                                  int idx_in, int idx_out) {
+inline void ForwardTfTensorInToOut(OpKernelContext* context, int idx_in,
+                                   int idx_out) {
   int num_inputs = context->num_inputs();
   int num_outputs = context->num_outputs();
   int idx_data_in = GetTensorDataIndex(idx_in, num_inputs);
@@ -1053,8 +1054,8 @@ inline void ForwardTfTensorInToOut(OpKernelContext* context,
 
 #else
 
-inline void ForwardTfTensorInToOut(OpKernelContext* context,
-                                  int idx_in, int idx_out) {
+inline void ForwardTfTensorInToOut(OpKernelContext* context, int idx_in,
+                                   int idx_out) {
   int num_inputs = context->num_inputs();
   int num_outputs = context->num_outputs();
   int idx_data_in = GetTensorDataIndex(idx_in, num_inputs);
@@ -1072,8 +1073,8 @@ inline void ForwardTfTensorInToOut(OpKernelContext* context,
 
 #endif
 
-inline void ForwardMklTensorInToOut(OpKernelContext* context,
-                                   int idx_in, int idx_out) {
+inline void ForwardMklTensorInToOut(OpKernelContext* context, int idx_in,
+                                    int idx_out) {
   int num_inputs = context->num_inputs();
   int num_outputs = context->num_outputs();
   int idx_data_in = GetTensorDataIndex(idx_in, num_inputs);
@@ -1090,10 +1091,10 @@ inline void ForwardMklTensorInToOut(OpKernelContext* context,
   }
 }
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 inline void ForwardMklTensorInToOutWithMklShape(OpKernelContext* context,
-                                             int idx_in, int idx_out,
-                                             const MklDnnShape& mkl_shape) {
+                                                int idx_in, int idx_out,
+                                                const MklDnnShape& mkl_shape) {
   int num_inputs = context->num_inputs();
   int num_outputs = context->num_outputs();
   int idx_data_in = GetTensorDataIndex(idx_in, num_inputs);
@@ -1132,7 +1133,7 @@ inline void SetDummyMklShapeOutput(OpKernelContext* context,
   AllocateOutputSetMklShape(context, idx_data_out, mkl_shape_output);
 }
 
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
 // We don't need these functions in MKLDNN. We have defined equality operator
 // on MklDnnShape class directly.
 
@@ -1216,11 +1217,11 @@ inline void MklNHWCToNCHW(const Tensor& input, Tensor** output) {
   int64 H = input.dim_size(1);
   int64 W = input.dim_size(2);
   int64 C = input.dim_size(3);
-  int64 stride_n = H*W*C;
-# pragma omp parallel for num_threads(16)
+  int64 stride_n = H * W * C;
+#pragma omp parallel for num_threads(16)
   for (int64 n = 0; n < N; ++n) {
-    mkl_somatcopy('R', 'T', H*W, C, 1, buf_in + n*stride_n, C,
-        buf_out + n*stride_n, H*W);
+    mkl_somatcopy('R', 'T', H * W, C, 1, buf_in + n * stride_n, C,
+                  buf_out + n * stride_n, H * W);
   }
 }
 
@@ -1232,17 +1233,17 @@ inline void MklNCHWToNHWC(const Tensor& input, Tensor** output) {
   int64 H = (*output)->dim_size(1);
   int64 W = (*output)->dim_size(2);
   int64 C = (*output)->dim_size(3);
-  int64 stride_n = H*W*C;
-# pragma omp parallel for num_threads(16)
+  int64 stride_n = H * W * C;
+#pragma omp parallel for num_threads(16)
   for (int64 n = 0; n < N; ++n) {
-    mkl_somatcopy('R', 'T', C, H*W, 1, buf_in + n*stride_n, H*W,
-        buf_out + n*stride_n, C);
+    mkl_somatcopy('R', 'T', C, H * W, 1, buf_in + n * stride_n, H * W,
+                  buf_out + n * stride_n, C);
   }
 }
 
 // -------------------------------------------------------------------
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 
 /// Return MKL-DNN data type (memory::data_type) for input type T
 ///
@@ -1279,10 +1280,11 @@ inline memory::format TFDataFormatToMklDnnDataFormat(TensorFormat format) {
 /// @return: Tensorflow data format corresponding to memory::format
 ///          Fails with an error if invalid data format.
 inline TensorFormat MklDnnDataFormatToTFDataFormat(memory::format format) {
-  if (format == memory::format::nhwc) return FORMAT_NHWC;
-  else if (format == memory::format::nchw) return FORMAT_NCHW;
-  TF_CHECK_OK(Status(error::Code::INVALID_ARGUMENT,
-                     "Unsupported data format"));
+  if (format == memory::format::nhwc)
+    return FORMAT_NHWC;
+  else if (format == memory::format::nchw)
+    return FORMAT_NCHW;
+  TF_CHECK_OK(Status(error::Code::INVALID_ARGUMENT, "Unsupported data format"));
 
   // Return to prevent compiler warnings, otherwise TF_CHECK_OK will ensure
   // that we don't come here.
@@ -1425,7 +1427,6 @@ inline memory::desc CreateBlockedMemDescHelper(const memory::dims& dim,
   return memory::desc(md);
 }
 
-
 /*
  * Class to represent all the resources corresponding to a tensor in TensorFlow
  * that are required to execute an operation (such as Convolution).
@@ -1494,7 +1495,7 @@ class MklDnnData {
   /// @return: memory::desc object corresponding to blocked memory format
   ///          for given dimensions and strides.
   static inline memory::desc CreateBlockedMemDesc(const memory::dims& dim,
-                                                 const memory::dims& strides) {
+                                                  const memory::dims& strides) {
     return CreateBlockedMemDescHelper(dim, strides, MklDnnType<T>());
   }
 
@@ -1563,7 +1564,6 @@ class MklDnnData {
     return user_memory_->get_primitive_desc();
   }
 
-
   /// Get function for descriptor of user memory.
   inline memory::desc GetUsrMemDesc() {
     // This is ugly. Why MKL-DNN does not provide desc() method of const type??
@@ -1634,7 +1634,8 @@ class MklDnnData {
   /// @return: true in case reorder of input is needed; false, otherwise.
   inline bool IsReorderNeeded(const memory::format& target_format) const {
     CHECK_NOTNULL(user_memory_);
-    return target_format != user_memory_->get_primitive_desc().desc().data.format;
+    return target_format !=
+           user_memory_->get_primitive_desc().desc().data.format;
   }
 
   /// Function to create a reorder from memory pointed by from to memory pointed
@@ -1753,7 +1754,7 @@ class MklDnnData {
   }
 };
 
-#endif  // INTEL_MKL_DNN
+#endif  // INTEL_MKL_ML
 
 }  // namespace tensorflow
 #endif  // INTEL_MKL
diff --git a/tensorflow/core/util/mkl_util_test.cc b/tensorflow/core/util/mkl_util_test.cc
index 8b73eadb40046518179fcaaa5c244aa7f3d52ebe..cd1d0713ad58b594005847f48943a228743e530d 100644
--- a/tensorflow/core/util/mkl_util_test.cc
+++ b/tensorflow/core/util/mkl_util_test.cc
@@ -22,7 +22,7 @@ limitations under the License.
 namespace tensorflow {
 namespace {
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 
 TEST(MklUtilTest, MklDnnTfShape) {
   auto cpu_engine = engine(engine::cpu, 0);
@@ -84,7 +84,7 @@ TEST(MklUtilTest, MklDnnBlockedFormatTest) {
   EXPECT_EQ(b_md2.data.format, mkldnn_blocked);
 }
 
-#endif  // INTEL_MKL_DNN
+#endif  // INTEL_MKL_ML
 }  // namespace
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/util/presized_cuckoo_map.h b/tensorflow/core/util/presized_cuckoo_map.h
index e7dab830f0ec9e3401d621f04358d3ee62cb0b63..f88ad2faaff344832d65b04357c3d8c2665ebad5 100644
--- a/tensorflow/core/util/presized_cuckoo_map.h
+++ b/tensorflow/core/util/presized_cuckoo_map.h
@@ -67,7 +67,7 @@ inline uint64 multiply_high_u64(uint64 x, uint64 y) {
   return prod_hi + (prod_mid1 >> 32) + (prod_mid2 >> 32) + carry;
 #endif
 }
-}
+}  // namespace presized_cuckoo_map
 
 template <class value>
 class PresizedCuckooMap {
diff --git a/tensorflow/core/util/reporter_test.cc b/tensorflow/core/util/reporter_test.cc
index 1cb07718feee820c334d8f5183cafb2de0cb009b..575c27d4ef72ec33c4b9352de59fc806b12d6385 100644
--- a/tensorflow/core/util/reporter_test.cc
+++ b/tensorflow/core/util/reporter_test.cc
@@ -29,8 +29,8 @@ namespace {
 
 // Tests of all the error paths in log_reader.cc follow:
 static void ExpectHasSubstr(StringPiece s, StringPiece expected) {
-  EXPECT_TRUE(StringPiece(s).contains(expected)) << s << " does not contain "
-                                                 << expected;
+  EXPECT_TRUE(StringPiece(s).contains(expected))
+      << s << " does not contain " << expected;
 }
 
 TEST(TestReporter, NoLogging) {
diff --git a/tensorflow/core/util/sparse/sparse_tensor.h b/tensorflow/core/util/sparse/sparse_tensor.h
index f2401a0af4e60f66c606e86e90a37bcf09eb6308..258ee418c145bae161c7603d4249875fb687c94a 100644
--- a/tensorflow/core/util/sparse/sparse_tensor.h
+++ b/tensorflow/core/util/sparse/sparse_tensor.h
@@ -20,6 +20,7 @@ limitations under the License.
 #include <numeric>
 #include <vector>
 
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
@@ -31,7 +32,6 @@ limitations under the License.
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/util/sparse/dim_comparator.h"
 #include "tensorflow/core/util/sparse/group_iterator.h"
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
 namespace tensorflow {
 namespace sparse {
@@ -59,8 +59,8 @@ class SparseTensor {
         shape_(shape.begin(), shape.end()),
         order_(order.begin(), order.end()),
         dims_(GetDimsFromIx(ix)) {
-    CHECK_EQ(ix.dtype(), DT_INT64) << "indices must be type int64 but got: "
-                                   << ix.dtype();
+    CHECK_EQ(ix.dtype(), DT_INT64)
+        << "indices must be type int64 but got: " << ix.dtype();
     CHECK(TensorShapeUtils::IsVector(vals.shape()))
         << "vals must be a vec, but got: " << vals.shape().DebugString();
     CHECK_EQ(ix.shape().dim_size(0), vals.shape().dim_size(0))
diff --git a/tensorflow/core/util/sparse/sparse_tensor_test.cc b/tensorflow/core/util/sparse/sparse_tensor_test.cc
index efdd97fd3d6ffa5c1f66f2a0950d7bd44ba01eb1..85de0320857e307ea54594c2eff611b9e413945b 100644
--- a/tensorflow/core/util/sparse/sparse_tensor_test.cc
+++ b/tensorflow/core/util/sparse/sparse_tensor_test.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <string>
 #include <vector>
 
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
@@ -25,7 +26,6 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
 namespace tensorflow {
 namespace sparse {
diff --git a/tensorflow/core/util/stream_executor_util.h b/tensorflow/core/util/stream_executor_util.h
index 6a5ddec04c9d6c2f723e0caa7343103f09c63183..f7767ace716782e53a2023bea7acc7b2f3c6604c 100644
--- a/tensorflow/core/util/stream_executor_util.h
+++ b/tensorflow/core/util/stream_executor_util.h
@@ -41,9 +41,10 @@ class StreamExecutorUtil {
   // This assumes that the error codes between the two implementations
   // match.
   static Status ConvertStatus(const perftools::gputools::port::Status& s) {
-    return s.ok() ? Status::OK() : Status(static_cast<tensorflow::error::Code>(
-                                              static_cast<int>(s.code())),
-                                          s.error_message());
+    return s.ok() ? Status::OK()
+                  : Status(static_cast<tensorflow::error::Code>(
+                               static_cast<int>(s.code())),
+                           s.error_message());
   }
 };
 
diff --git a/tensorflow/core/util/tensor_bundle/tensor_bundle.cc b/tensorflow/core/util/tensor_bundle/tensor_bundle.cc
index 579b70ab5149f05749205f24a0c6e64c95f12dfd..462b420976e63ca63079fd652fdb12c5ef2a1404 100644
--- a/tensorflow/core/util/tensor_bundle/tensor_bundle.cc
+++ b/tensorflow/core/util/tensor_bundle/tensor_bundle.cc
@@ -913,8 +913,8 @@ Status BundleReader::LookupSlice(StringPiece full_tensor_key,
 Status BundleReader::GetSliceValue(StringPiece full_tensor_key,
                                    const BundleEntryProto& full_tensor_entry,
                                    const TensorSlice& slice_spec, Tensor* val) {
-  using checkpoint::TensorSliceSet;
   using checkpoint::RegisterTensorSlice;
+  using checkpoint::TensorSliceSet;
   DCHECK_GE(full_tensor_entry.slices_size(), 0);
 
   const TensorShape full_shape(TensorShape(full_tensor_entry.shape()));
diff --git a/tensorflow/core/util/tensor_slice_reader_cache.cc b/tensorflow/core/util/tensor_slice_reader_cache.cc
index 0f009d7de57a3cf1471c1ba694d3a771bc00635c..424f8098a9c1e3cec3851be06d04d49bed93e9af 100644
--- a/tensorflow/core/util/tensor_slice_reader_cache.cc
+++ b/tensorflow/core/util/tensor_slice_reader_cache.cc
@@ -55,7 +55,7 @@ const TensorSliceReader* TensorSliceReaderCache::GetReader(
     TensorSliceReader::OpenTableFunction open_function, int preferred_shard) {
   mutex_lock l(mu_);
 
-#if defined(__GXX_RTTI) ||  defined(_CPPRTTI)
+#if defined(__GXX_RTTI) || defined(_CPPRTTI)
   // Get the function pointer from the open_function value.
   TensorSliceReaderCache::OpenFuncType* func_ptr =
       open_function.target<TensorSliceReaderCache::OpenFuncType>();
diff --git a/tensorflow/core/util/tensor_slice_set.cc b/tensorflow/core/util/tensor_slice_set.cc
index 4217df90ca147ccc17cadf6c46c6e4ef4524f12b..7c1d325c0a54e7ba5261f645a2962970fa2d3630 100644
--- a/tensorflow/core/util/tensor_slice_set.cc
+++ b/tensorflow/core/util/tensor_slice_set.cc
@@ -188,9 +188,9 @@ Status RegisterTensorSlice(
     }
     if (type != tss->type()) {
       return errors::Internal("Incompatible tensor types detected for tensor ",
-                              name, ": existing = ",
-                              DataTypeString(tss->type()), ", new = ",
-                              DataTypeString(type));
+                              name,
+                              ": existing = ", DataTypeString(tss->type()),
+                              ", new = ", DataTypeString(type));
     }
   }
   // Register the tensor slices without the actual data.
diff --git a/tensorflow/core/util/tensor_slice_util.h b/tensorflow/core/util/tensor_slice_util.h
index c7edae66b267d4cbd88d497c745b4d81802ab3a9..8f5a6f1d93591e94ec759d343ec26146c67552c0 100644
--- a/tensorflow/core/util/tensor_slice_util.h
+++ b/tensorflow/core/util/tensor_slice_util.h
@@ -139,9 +139,9 @@ static bool CopyDataFromTensorSliceToTensorSlice(const TensorShape& shape,
                                                  const TensorSlice& slice_d,
                                                  const SrcT* ptr_s,
                                                  DstT* ptr_d) {
-  CHECK_LE(shape.dims(), kTensorSliceMaxRank) << "Only tensors of size up to "
-                                              << kTensorSliceMaxRank
-                                              << " are supported";
+  CHECK_LE(shape.dims(), kTensorSliceMaxRank)
+      << "Only tensors of size up to " << kTensorSliceMaxRank
+      << " are supported";
   // We need to compute the intersection of the two slices.
   TensorSlice inter;
   if (!slice_s.Intersect(slice_d, &inter)) {
diff --git a/tensorflow/core/util/tensor_slice_writer.h b/tensorflow/core/util/tensor_slice_writer.h
index bdb4921e1bbf8611d84420c1e52d01fa39c25264..2888c66d10fa3c2ab0eaf755a23da3eb3fcd6b09 100644
--- a/tensorflow/core/util/tensor_slice_writer.h
+++ b/tensorflow/core/util/tensor_slice_writer.h
@@ -101,8 +101,8 @@ Status TensorSliceWriter::Add(const string& name, const TensorShape& shape,
   // The tensor and the slice have to be compatible
   if (shape.dims() != slice.dims()) {
     return errors::Internal("Incompatible tensor shape and slice: ", "shape = ",
-                            shape.DebugString(), ", slice = ",
-                            slice.DebugString());
+                            shape.DebugString(),
+                            ", slice = ", slice.DebugString());
   }
   DataType dt = DataTypeToEnum<T>::value;
   // We need to add an entry for "name" if there isn't an entry already.
@@ -114,9 +114,9 @@ Status TensorSliceWriter::Add(const string& name, const TensorShape& shape,
     CHECK_EQ(name, ssm.name()) << ProtoShortDebugString(ssm);
     TensorShape ssm_shape(ssm.shape());
     if (!shape.IsSameSize(ssm_shape)) {
-      return errors::Internal("Mismatching shapes: existing tensor = ",
-                              ssm_shape.DebugString(), ", trying to add name ",
-                              name, ", shape = ", shape.DebugString());
+      return errors::Internal(
+          "Mismatching shapes: existing tensor = ", ssm_shape.DebugString(),
+          ", trying to add name ", name, ", shape = ", shape.DebugString());
     }
     if (dt != ssm.type()) {
       return errors::Internal(
diff --git a/tensorflow/docs_src/about/bib.md b/tensorflow/docs_src/about/bib.md
index c9f0c532c62791a9fcf854f11fd2f330955ee7d6..5593a3d95c435df38174fde5db37f4dd3437acd4 100644
--- a/tensorflow/docs_src/about/bib.md
+++ b/tensorflow/docs_src/about/bib.md
@@ -60,7 +60,7 @@ author={
     Lukasz~Kaiser and
     Manjunath~Kudlur and
     Josh~Levenberg and
-    Dan~Man\'{e} and
+    Dandelion~Man\'{e} and
     Rajat~Monga and
     Sherry~Moore and
     Derek~Murray and
diff --git a/tensorflow/docs_src/api_guides/python/TPUEstimator.md b/tensorflow/docs_src/api_guides/python/TPUEstimator.md
new file mode 100644
index 0000000000000000000000000000000000000000..d74d7f3181c9cf44e6c97e13742db682858f4694
--- /dev/null
+++ b/tensorflow/docs_src/api_guides/python/TPUEstimator.md
@@ -0,0 +1,396 @@
+# Using TPUs
+
+This document walks through the principal TensorFlow APIs necessary to make
+effective use of a [Cloud TPU](https://cloud.google.com/tpu/), and highlights
+the differences between regular TensorFlow usage, and usage on a TPU.
+
+This doc is aimed at users who:
+
+* Are familiar with TensorFlow's `Estimator` and `Dataset` APIs
+* Have maybe [tried out a Cloud TPU](https://cloud.google.com/tpu/docs/quickstart)
+  using an existing model.
+* Have, perhaps, skimmed the code of an example TPU model
+  [[1]](https://github.com/tensorflow/models/blob/master/official/mnist/mnist_tpu.py)
+  [[2]](https://github.com/tensorflow/tpu-demos/tree/master/cloud_tpu/models).
+* Are interested in porting an existing `Estimator` model to
+  run on Cloud TPUs
+
+## TPUEstimator
+
+@{tf.estimator.Estimator$Estimators} are TensorFlow's model-level abstraction.
+Standard `Estimators` can drive models on CPU and GPUs. You must use
+@{tf.contrib.tpu.TPUEstimator} to drive a model on TPUs.
+
+Refer to TensorFlow's Getting Started section for an introduction to the basics
+of using a @{$get_started/premade_estimators$pre-made `Estimator`}, and
+@{$get_started/custom_estimators$custom `Estimator`s}.
+
+The `TPUEstimator` class differs somewhat from the `Estimator` class.
+
+The simplest way to maintain a model that can be run both on CPU/GPU or on a
+Cloud TPU is to define the model's inference phase (from inputs to predictions)
+outside of the `model_fn`. Then maintain separate implementations of the
+`Estimator` setup and `model_fn`, both wrapping this inference step. For an
+example of this pattern compare the `mnist.py` and `mnist_tpu.py` implementation in
+[tensorflow/models](https://github.com/tensorflow/models/tree/master/official/mnist).
+
+### Running a `TPUEstimator` locally
+
+To create a standard `Estimator` you call the constructor, and pass it a
+`model_fn`, for example:
+
+```
+my_estimator = tf.estimator.Estimator(
+  model_fn=my_model_fn)
+```
+
+The changes required to use a @{tf.contrib.tpu.TPUEstimator} on your local
+machine are relatively minor. The constructor requires two additional arguments.
+You should set the `use_tpu` argument to `False`, and pass a
+@{tf.contrib.tpu.RunConfig} as the `config` argument, as shown below:
+
+``` python
+my_tpu_estimator = tf.contrib.tpu.TPUEstimator(
+    model_fn=my_model_fn,
+    config=tf.contrib.tpu.RunConfig()
+    use_tpu=False)
+```
+
+Just this simple change will allow you to run a `TPUEstimator` locally.
+The majority of example TPU models can be run in this local mode,
+by setting the command line flags as follows:
+
+
+```
+$> python mnist_tpu.py --use_tpu=false --master=''
+```
+
+Note: This `use_tpu=False` argument is useful for trying out the `TPUEstimator`
+API. It is not meant to be a complete TPU compatibility test. Successfully
+running a model locally in a `TPUEstimator` does not guarantee that it will
+work on a TPU.
+
+
+### Building a `tpu.RunConfig`
+
+While the default `RunConfig` is sufficient  for local training, these settings
+cannot be ignored in real usage.
+
+A more typical setup for a `RunConfig`, that can be switched to use a Cloud
+TPU, might be as follows:
+
+``` python
+import tempfile
+import subprocess
+
+class FLAGS(object):
+  use_tpu=False
+  tpu_name=None
+  # Use a local temporary path for the `model_dir`
+  model_dir = tempfile.mkdtemp()
+  # Number of training steps to run on the Cloud TPU before returning control.
+  iterations = 50
+  # A single Cloud TPU has 8 shards.
+  num_shards = 8
+
+if FLAGS.use_tpu:
+    my_project_name = subprocess.check_output([
+        'gcloud','config','get-value','project'])
+    my_zone = subprocess.check_output([
+        'gcloud','config','get-value','compute/zone'])
+    cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
+            tpu_names=[FLAGS.tpu_name],
+            zone=my_zone,
+            project=my_project)
+    master = tpu_cluster_resolver.get_master()
+else:
+    master = ''
+
+my_tpu_run_config = tf.contrib.tpu.RunConfig(
+    master=master,
+    evaluation_master=master,
+    model_dir=FLAGS.model_dir,
+    session_config=tf.ConfigProto(
+        allow_soft_placement=True, log_device_placement=True),
+    tpu_config=tf.contrib.tpu.TPUConfig(FLAGS.iterations,
+                                        FLAGS.num_shards),
+)
+```
+
+Then you must pass the @{tf.contrib.tpu.RunConfig} to the constructor:
+
+``` python
+my_tpu_estimator = tf.contrib.tpu.TPUEstimator(
+    model_fn=my_model_fn,
+    config = my_tpu_run_config,
+    use_tpu=FLAGS.use_tpu)
+```
+
+Typically the `FLAGS` would be set by command line arguments. To switch from
+training locally to training on a cloud TPU you would need to:
+
+  1) Set `FLAGS.use_tpu` to `True`
+  1) Set `FLAGS.tpu_name` so the
+     `tf.contrib.cluster_resolver.TPUClusterResolver` can find it
+  1) Set `FLAGS.model_dir` to a Google Cloud Storage bucket url (`gs://`).
+
+
+## Optimizer
+
+When training on a cloud TPU you **must** wrap the optimizer in a
+@{tf.contrib.tpu.CrossShardOptimizer}, which uses an `allreduce` to aggregate
+gradients and broadcast the result to each shard (each TPU core).
+
+The `CrossShardOptimizer` is not compatible with local training. So, to have
+the same code run both locally and on a Cloud TPU, add lines like the following:
+
+``` python
+optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
+if FLAGS.use_tpu:
+  optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)
+```
+
+If you prefer to avoid a global `FLAGS` variable in your model code, one
+approach is to set the optimizer as one of the `Estimator`'s params,
+as follows:
+
+``` python
+my_tpu_estimator = tf.contrib.tpu.TPUEstimator(
+    model_fn=my_model_fn,
+    config = my_tpu_run_config,
+    use_tpu=FLAGS.use_tpu,
+    params={'optimizer':optimizer})
+```
+
+## Model Function
+
+This section details the changes you must make to the model function
+(`model_fn()`) to make it `TPUEstimator` compatible.
+
+### Static shapes
+
+During regular usage TensorFlow attempts to determine the shapes of each
+`tf.Tensor` during graph construction. During execution any unknown shape
+dimensions are determined dynamically,
+see @{$programmers_guide/tensors#shape$Tensor Shapes} for more details.
+
+To run on Cloud TPUs TensorFlow models are compiled using @{$xla$XLA}.
+XLA uses a similar system for determining shapes at compile time. XLA requires
+that all tensor dimensions be statically defined at compile time. All shapes
+must evaluate to a constant, and not depend on external data, or stateful
+operations like variables or a random number generator.
+
+
+### Summaries
+
+Remove any use of `tf.summary` from your model.
+
+@{$summaries_and_tensorboard$TensorBoard summaries} are a great way see inside
+your model. A minimal set of basic summaries are automatically recorded by the
+`TPUEstimator`, to `event` files in the `model_dir`. Custom summaries, however,
+are currently unsupported when training on a Cloud TPU. So while the
+`TPUEstimator` will still run locally with summaries, it will fail if used on a
+TPU.
+
+### Metrics
+
+Build your evaluation metrics dictionary in a stand-alone `metric_fn`.
+
+<!-- TODO(markdaoust) link to programmers_guide/metrics when it exists -->
+
+Evaluation metrics are an essential part of training a model. These are fully
+supported on Cloud TPUs, but with a slightly different syntax.
+
+A standard @{tf.metrics} returns two tensors. The first returns the running
+average of the metric value, while the second updates the running average and
+returns the value for this batch:
+
+```
+running_average, current_batch = tf.metrics.accuracy(labels, predictions)
+```
+
+In a standard `Estimator` you create a dictionary of these pairs, and return it
+as part of the `EstimatorSpec`.
+
+```python
+my_metrics = {'accuracy': tf.metrics.accuracy(labels, predictions)}
+
+return tf.estimator.EstimatorSpec(
+  ...
+  eval_metric_ops=my_metrics
+)
+```
+
+In a `TPUEstimator` you instead pass a function (which returns a metrics
+dictionary) and a list of argument tensors, as shown below:
+
+```python
+def my_metric_fn(labels, predictions):
+   return {'accuracy': tf.metrics.accuracy(labels, predictions)}
+
+return tf.contrib.tpu.TPUEstimatorSpec(
+  ...
+  eval_metrics=(my_metric_fn, [labels, predictions])
+)
+```
+
+### Use `TPUEstimatorSpec`
+
+`TPUEstimatorSpec` do not support hooks, and require function wrappers for
+some fields.
+
+An `Estimator`'s `model_fn` must return an `EstimatorSpec`. An `EstimatorSpec`
+is a simple structure of named fields containing all the `tf.Tensors` of the
+model that the `Estimator` may need to interact with.
+
+`TPUEstimators` use a @{tf.contrib.tpu.TPUEstimatorSpec}. There are a few
+differences between it and a standard @{tf.estimator.EstimatorSpec}:
+
+
+*  The `eval_metric_ops` must be wrapped into a `metrics_fn`, this field is
+   renamed `eval_metrics` ([see above](#metrics)).
+*  The @{tf.train.SessionRunHook$hooks} are unsupported, so these fields are
+   omitted.
+*  The @{tf.train.Scaffold$`scaffold`}, if used, must also be wrapped in a
+   function. This field is renamed to `scaffold_fn`.
+
+`Scaffold` and `Hooks` are for advanced usage, and can typically be omitted.
+
+## Input functions
+
+Input functions work mainly unchanged as they run on the host computer, not the
+Cloud TPU itself. This section explains the two necessary adjustments.
+
+### Params argument
+
+<!-- TODO(markdaoust) link to input_fn doc when it exists -->
+
+The `input_fn` for a standard `Estimator` _can_ include a
+`params` argument; the `input_fn` for a `TPUEstimator` *must* include a
+`params` argument. This is necessary to allow the estimator to set the batch
+size for each replica of the input stream. So the minimum signature for an
+`input_fn` for a `TPUEstimator` is:
+
+```
+def my_input_fn(params):
+  pass
+```
+
+Where `params['batch-size']` will contain the batch size.
+
+### Static shapes and batch size
+
+The input pipeline generated by your `input_fn` is run on CPU. So it is mostly
+free strict static shape requirements imposed by the XLA/TPU environment. The
+one requirement is that the batches of data fed from your input pipeline to
+the TPU have a static shape, as determined by the standard TensorFlow shape
+inference algorithm. Intermediate tensors are free to have a dynamic shapes.
+If shape inference has failed, but the shape is known it is possible to
+impose the correct shape using `tf.set_shape()`. 
+
+In the example below the shape
+inference algorithm fails, but it is corrected using `set_shape`:
+
+```
+>>> x = tf.zeros(tf.constant([1,2,3])+1)
+>>> x.shape
+
+TensorShape([Dimension(None), Dimension(None), Dimension(None)])
+
+>>> x.set_shape([2,3,4])
+```
+
+In many cases the batch size is the only unknown dimension.
+
+A typical input pipeline, using `tf.data`, will usually produce batches of a
+fixed size. The last batch of a finite `Dataset`, however, is typically smaller,
+containing just the remaining elements. Since a `Dataset` does not know its own
+length or finiteness, the standard @{tf.data.Dataset.batch$`batch`} method
+cannot determine if all batches will have a fixed size batch on its own:
+
+```
+>>> params = {'batch_size':32}
+>>> ds = tf.data.Dataset.from_tensors([0, 1, 2])
+>>> ds = ds.repeat().batch(params['batch-size'])
+>>> ds
+
+<BatchDataset shapes: (?, 3), types: tf.int32>
+```
+
+The most straightforward fix is to
+@{tf.data.Dataset.apply$apply} @{tf.contrib.data.batch_and_drop_remainder}
+as follows:
+
+```
+>>> params = {'batch_size':32}
+>>> ds = tf.data.Dataset.from_tensors([0, 1, 2])
+>>> ds = ds.repeat().apply(
+...     tf.contrib.data.batch_and_drop_remainder(params['batch-size']))
+>>> ds
+
+ <_RestructuredDataset shapes: (32, 3), types: tf.int32>
+```
+
+The one downside to this approach is that, as the name implies, this batching
+method throws out any fractional batch at the end of the dataset. This is fine
+for an infinitely repeating dataset being used for training, but could be a
+problem if you want to train for an exact number of epochs.
+
+To do an exact 1-epoch of _evaluation_ you can work around this by manually
+padding the length of the batches, and setting the padding entries to have zero
+weight when creating your `tf.metrics`.
+
+## Datasets
+
+Efficient use of the `tf.data.Dataset` API is critical when using a Cloud
+TPU, as it is impossible to use the Cloud TPU's unless you can feed it data
+quickly enough. See @{$datasets_performance} for details on dataset performance.
+
+For all but the simplest experimentation (using
+@{tf.data.Dataset.from_tensor_slices} or other in-graph data) you will need to
+store all data files read by the `TPUEstimator`'s `Dataset` in Google Cloud
+Storage Buckets.
+
+<!--TODO(markdaoust): link to the `TFRecord` doc when it exists.-->
+
+For most use-cases, we recommend converting your data into `TFRecord`
+format and using a @{tf.data.TFRecordDataset} to read it. This, however, is not
+a hard requirement and you can use other dataset readers
+(`FixedLengthRecordDataset` or `TextLineDataset`) if you prefer.
+
+Small datasets can be loaded entirely into memory using
+@{tf.data.Dataset.cache}.
+
+Regardless of the data format used, it is strongly recommended that you
+@{$performance_guide#use_large_files$use large files}, on the order of
+100MB. This is especially important in this networked setting as the overhead
+of opening a file is significantly higher.
+
+It is also important, regardless of the type of reader used, to enable buffering
+using the `buffer_size` argument to the constructor. This argument is specified
+in bytes. A minimum of a few MB (`buffer_size=8*1024*1024`) is recommended so
+that data is available when needed.
+
+The TPU-demos repo includes
+[a script](https://github.com/tensorflow/tpu-demos/blob/master/cloud_tpu/datasets/imagenet_to_gcs.py)
+for downloading the imagenet dataset and converting it to an appropriate format.
+This together with the imagenet
+[models](https://github.com/tensorflow/tpu-demos/tree/master/cloud_tpu/models)
+included in the repo demonstrate all of these best-practices.
+
+
+## What Next
+
+For details on how to actually set up and run a Cloud TPU see:
+
+ * [Google Cloud TPU Documentation](https://cloud.google.com/tpu/docs/)
+
+This document is by no means exhaustive. The best source of more detail on how
+to make a Cloud TPU compatible model are the example models published in:
+
+ * The [TPU Demos Repository.](https://github.com/tensorflow/tpu-demos/)
+
+For more information about tuning TensorFlow code for performance see:
+
+ * The @{$performance$Performance Section.}
+
diff --git a/tensorflow/docs_src/api_guides/python/regression_examples.md b/tensorflow/docs_src/api_guides/python/regression_examples.md
index 45cb9d829cfbc1b1efb735cc1ea27e33159db724..dae50a8f032bae9421bc01d1ac4043fdaae30080 100644
--- a/tensorflow/docs_src/api_guides/python/regression_examples.md
+++ b/tensorflow/docs_src/api_guides/python/regression_examples.md
@@ -229,4 +229,4 @@ passed through to the `model_fn` when the `model_fn` is called.
 The `model_fn` returns an
 @{tf.estimator.EstimatorSpec$`EstimatorSpec`} which is a simple structure
 indicating to the `Estimator` which operations should be run to accomplish
-varions tasks.
+various tasks.
diff --git a/tensorflow/docs_src/community/welcome.md b/tensorflow/docs_src/community/welcome.md
index a3abf2550757e825ae2d023018def919de1bcd8f..d2d3f9edaed9fc3c921a98c95ae24ce168e00216 100644
--- a/tensorflow/docs_src/community/welcome.md
+++ b/tensorflow/docs_src/community/welcome.md
@@ -12,7 +12,6 @@ The source code for TensorFlow is on
 Before contributing to TensorFlow source code, please review the
 [Contribution guidelines](https://github.com/tensorflow/tensorflow/blob/master/CONTRIBUTING.md).
 
-
 ### Projects developed by the TensorFlow community
 
 The TensorFlow community has created many great projects around TensorFlow, including:
@@ -65,5 +64,6 @@ please read the following list carefully:
     [TensorFlow issues tracker](https://github.com/tensorflow/tensorflow/issues)
     on GitHub.  For example, use the issue tracker to request a
     new operation in TensorFlow.
-
+  * To report vulnerabilities, please follow our
+    [vulnerability disclosure guidelines](https://github.com/tensorflow/tensorflow/blob/master/SECURITY.md).
 
diff --git a/tensorflow/docs_src/get_started/checkpoints.md b/tensorflow/docs_src/get_started/checkpoints.md
index 680e1c0d3f58166a4f6b352816914f5220d84996..dfa2110e691167f54e6ea8b7a832f0a88d0ec41a 100644
--- a/tensorflow/docs_src/get_started/checkpoints.md
+++ b/tensorflow/docs_src/get_started/checkpoints.md
@@ -16,7 +16,7 @@ This document focuses on checkpoints. For details on SavedModel, see the
 ## Sample code
 
 This document relies on the same
-[https://github.com/tensorflow/models/blob/master/samples/core/get_started/premade_estimator.py](Iris classification example) detailed in @{$premade_estimators$Getting Started with TensorFlow}.
+[Iris classification example](https://github.com/tensorflow/models/blob/master/samples/core/get_started/premade_estimator.py) detailed in @{$premade_estimators$Getting Started with TensorFlow}.
 To download and access the example, invoke the following two commands:
 
 ```shell
diff --git a/tensorflow/docs_src/get_started/custom_estimators.md b/tensorflow/docs_src/get_started/custom_estimators.md
index 6343cc4ee454c7242b98497a37e9852b4e9873ae..79c4ee75d01c745d9e492c5db9df11a93eca0477 100644
--- a/tensorflow/docs_src/get_started/custom_estimators.md
+++ b/tensorflow/docs_src/get_started/custom_estimators.md
@@ -15,7 +15,7 @@ git clone https://github.com/tensorflow/models/
 cd models/samples/core/get_started
 ```
 
-In this document we wil be looking at
+In this document we will be looking at
 [`custom_estimator.py`](https://github.com/tensorflow/models/blob/master/samples/core/get_started/custom_estimator.py).
 You can run it with the following command:
 
diff --git a/tensorflow/docs_src/get_started/datasets_quickstart.md b/tensorflow/docs_src/get_started/datasets_quickstart.md
index ecfbf160f0de2414f6cffa07d159a3e26733e3a6..a8a2ab6e56130c7805d48477301c63d88f87489c 100644
--- a/tensorflow/docs_src/get_started/datasets_quickstart.md
+++ b/tensorflow/docs_src/get_started/datasets_quickstart.md
@@ -169,7 +169,7 @@ the number of examples in the `Dataset` ensures that the data is completely
 shuffled. The Iris data set only contains 150 examples.
 
 The @{tf.data.Dataset.repeat$`repeat`} method has the `Dataset` restart when
-it reaches the end. To limit the number of epochss, set the `count` argument.
+it reaches the end. To limit the number of epochs, set the `count` argument.
 
 The @{tf.data.Dataset.repeat$`batch`} method collects a number of examples and
 stacks them, to create batches. This adds a dimension to their shape. The new
@@ -282,7 +282,7 @@ produce the necessary `(features, label)` pairs.
 
 We will start by building a function to parse a single line.
 
-The following `iris_data.parse_line` function acomplishes this taks using the
+The following `iris_data.parse_line` function accomplishes this task using the
 @{tf.decode_csv} function, and some simple python code:
 
 We must parse each of the lines in the dataset in order to generate the
diff --git a/tensorflow/docs_src/get_started/feature_columns.md b/tensorflow/docs_src/get_started/feature_columns.md
index e3308ed716d63f10bf0e9dda858c23eef30709a6..ad3e1fe3e3a4e3f5278e76bcaa0fc8eee2faf374 100644
--- a/tensorflow/docs_src/get_started/feature_columns.md
+++ b/tensorflow/docs_src/get_started/feature_columns.md
@@ -461,8 +461,8 @@ permitting a richer palette of numbers for every cell, an embedding column
 contains far fewer cells than an indicator column.
 
 Let's look at an example comparing indicator and embedding columns. Suppose our
-input examples consists of different words from a limited palette of only 81
-words. Further suppose that the data set provides provides the following input
+input examples consist of different words from a limited palette of only 81
+words. Further suppose that the data set provides the following input
 words in 4 separate examples:
 
 * `"dog"`
diff --git a/tensorflow/docs_src/get_started/premade_estimators.md b/tensorflow/docs_src/get_started/premade_estimators.md
index dbc35065abf22c88c325c4edc370b6da91c4df5b..4f01f997c33c211e8cff81b6b268bb320aa794df 100644
--- a/tensorflow/docs_src/get_started/premade_estimators.md
+++ b/tensorflow/docs_src/get_started/premade_estimators.md
@@ -2,37 +2,39 @@
 # Getting Started with TensorFlow
 
 This document introduces the TensorFlow programming environment and shows you
-how to write the Iris classification problem in TensorFlow.
+how to solve the Iris classification problem in TensorFlow.
 
-Prior to reading this document, do the following:
+## Prerequisites
+
+Prior to using the sample code in this document, you'll need to do the
+following:
 
 * @{$install$Install TensorFlow}.
 * If you installed TensorFlow with virtualenv or Anaconda, activate your
   TensorFlow environment.
-* To keep the data import simple, our Iris example uses Pandas. You can
-  install Pandas with:
+* Install or upgrade pandas by issuing the following command:
 
-      `pip install pandas`
+        pip install pandas
 
 ## Getting the sample code
 
-Take the following steps to get the sample code for this program:
+Take the following steps to get the sample code we'll be going through:
 
-1. Clone the TensorFlow Models repository from github by entering the following
+1. Clone the TensorFlow Models repository from GitHub by entering the following
    command:
 
-       `git clone https://github.com/tensorflow/models`
+        git clone https://github.com/tensorflow/models
 
 1. Change directory within that branch to the location containing the examples
    used in this document:
 
-       `cd models/samples/core/get_started/`
+        cd models/samples/core/get_started/
 
 The program described in this document is
 [`premade_estimator.py`](https://github.com/tensorflow/models/blob/master/samples/core/get_started/premade_estimator.py).
 This program uses
 [`iris_data.py`](https://github.com/tensorflow/models/blob/master/samples/core/get_started/iris_data.py)
-To fetch its training data.
+to fetch its training data.
 
 ### Running the program
 
@@ -45,7 +47,7 @@ python premade_estimator.py
 The program should output training logs followed by some predictions against
 the test set. For example, the first line in the following output shows that
 the model thinks there is a 99.6% chance that the first example in the test
-set is a Setosa. Since the test set `expected "Setosa"`, this appears to be
+set is a Setosa. Since the test set expected Setosa, this appears to be
 a good prediction.
 
 ``` None
@@ -61,9 +63,9 @@ If the program generates errors instead of answers, ask yourself the following
 questions:
 
 * Did you install TensorFlow properly?
-* Are you using the correct version of tensorflow?
+* Are you using the correct version of TensorFlow?
 * Did you activate the environment you installed TensorFlow in? (This is
-  only relevant in certain installation environments.)
+  only relevant in certain installation mechanisms.)
 
 ## The programming stack
 
@@ -74,18 +76,15 @@ provides a programming stack consisting of multiple API layers:
 <div style="width:100%; margin:auto; margin-bottom:10px; margin-top:20px;">
 <img style="width:100%" src="../images/tensorflow_programming_environment.png">
 </div>
-<div style="text-align: center">
-The TensorFlow Programming Environment
-</div>
 
 We strongly recommend writing TensorFlow programs with the following APIs:
 
-* @{tf.estimator$Estimators}, which represent a complete model.
+* @{$programmers_guide/estimators$Estimators}, which represent a complete model.
   The Estimator API provides methods to train the model, to judge the model's
   accuracy, and to generate predictions.
 * @{$get_started/datasets_quickstart$Datasets}, which build a data input
   pipeline. The Dataset API has methods to load and manipulate data, and feed
-  it into your model. The Datasets API meshes well with the Estimators API.
+  it into your model. The Dataset API meshes well with the Estimators API.
 
 ## Classifying irises: an overview
 
@@ -120,7 +119,7 @@ individual Iris flowers:
 * petal length
 * petal width
 
-Our model will represent these features as float32 numerical data.
+Our model will represent these features as `float32` numerical data.
 
 The label identifies the Iris species, which must be one of the following:
 
@@ -154,9 +153,6 @@ The following figure illustrates the features, hidden layers, and predictions
   alt="A diagram of the network architecture: Inputs, 2 hidden layers, and outputs"
   src="../images/custom_estimators/full_network.png">
 </div>
-<div style="text-align: center">
-The Model.
-</div>
 
 ### Inference
 
@@ -174,12 +170,12 @@ example is an Iris Versicolor.
 
 ## Overview of programming with Estimators
 
-An Estimator is TensorFlow's high level representation of a complete model. It
+An Estimator is TensorFlow's high-level representation of a complete model. It
 handles the details of initialization, logging, saving and restoring, and many
 other features so you can concentrate on your model. For more details see
 @{$programmers_guide/estimators}.
 
-An "Estimator" is any class derived from @{tf.estimator.Estimator}. TensorFlow
+An Estimator is any class derived from @{tf.estimator.Estimator}. TensorFlow
 provides a collection of
 [pre-made Estimators](https://developers.google.com/machine-learning/glossary/#pre-made_Estimator)
 (for example, `LinearRegressor`) to implement common ML algorithms. Beyond
@@ -199,7 +195,7 @@ following tasks:
 * Call one or more methods on the Estimator object, passing the appropriate
   input function as the source of the data.
 
-Let's see how those tasks are implemented in Iris.
+Let's see how those tasks are implemented for Iris classification.
 
 ## Create input functions
 
@@ -209,17 +205,30 @@ evaluating, and prediction.
 An **input function** is a function that returns a @{tf.data.Dataset} object
 which outputs the following two-element tuple:
 
-* "features" - A Python dictionary in which:
+* [`features`](https://developers.google.com/machine-learning/glossary/#feature) - A Python dictionary in which:
     * Each key is the name of a feature.
     * Each value is an array containing all of that feature's values.
-* "label" - An array containing the values of the
+* `label` - An array containing the values of the
   [label](https://developers.google.com/machine-learning/glossary/#label) for
   every example.
 
-Your input function may generate the "features" dictionary and "label" list any
-way you like. However, we recommend using TensorFlow's @{tf.data.Dataset} API,
-which can deftly parse all sorts of data. At a high-level,
-the @{tf.data.Dataset} API consists of the following classes:
+Just to demonstrate the format of the input function, here's a simple
+implementation:
+
+```python
+def input_evaluation_set():
+    features = {'SepalLength': np.array([6.4, 5.0]),
+                'SepalWidth':  np.array([2.8, 2.3]),
+                'PetalLength': np.array([5.6, 3.3]),
+                'PetalWidth':  np.array([2.2, 1.0])}
+    labels = np.array([2, 1])
+    return features, labels
+```
+
+Your input function may generate the `features` dictionary and `label` list any
+way you like. However, we recommend using TensorFlow's Dataset API, which can
+parse all sorts of data. At a high level, the Dataset API consists of the
+following classes:
 
 <div style="width:80%; margin:auto; margin-bottom:10px; margin-top:20px;">
 <img style="width:100%"
@@ -227,23 +236,23 @@ the @{tf.data.Dataset} API consists of the following classes:
   src="../images/dataset_classes.png">
 </div>
 
+Where the individual members are:
 
-Where:
-
-* Dataset: Base class containing methods to create and transform datasets. Also
-  allows you to initialize a dataset from data in memory, or from a Python
-  generator.
-* TextLineDataset: Reads lines from text files.
-* TFRecordDataset: Reads records from TFRecord files.
-* FixedLengthRecordDataset: Reads fixed size records from binary files.
-* Iterator: Provides a way to access one data set element at a time.
+* `Dataset` - Base class containing methods to create and transform
+  datasets. Also allows you to initialize a dataset from data in memory, or from
+  a Python generator.
+* `TextLineDataset` - Reads lines from text files.
+* `TFRecordDataset` - Reads records from TFRecord files.
+* `FixedLengthRecordDataset` - Reads fixed size records from binary files.
+* `Iterator` - Provides a way to access one data set element at a time.
 
 The Dataset API can handle a lot of common cases for you. For example,
 using the Dataset API, you can easily read in records from a large collection
 of files in parallel and join them into a single stream.
 
-To keep things simple in this example we are going to load the data with pandas,
-and build our input pipeline from this in-memory data.
+To keep things simple in this example we are going to load the data with
+[pandas](https://pandas.pydata.org/), and build our input pipeline from this
+in-memory data.
 
 Here is the input function used for training in this program, which is available
 in [`iris_data.py`](https://github.com/tensorflow/models/blob/master/samples/core/get_started/iris_data.py):
@@ -258,9 +267,9 @@ def train_input_fn(features, labels, batch_size):
     return dataset.shuffle(1000).repeat().batch(batch_size)
 ```
 
-## Define the Feature Columns
+## Define the feature columns
 
-A [**Feature Column**](https://developers.google.com/machine-learning/glossary/#feature_columns)
+A [**feature column**](https://developers.google.com/machine-learning/glossary/#feature_columns)
 is an object describing how the model should use raw input data from the
 features dictionary. When you build an Estimator model, you pass it a list of
 feature columns that describes each of the features you want the model to use.
@@ -270,7 +279,7 @@ to the model.
 For Iris, the 4 raw features are numeric values, so we'll build a list of
 feature columns to tell the Estimator model to represent each of the four
 features as 32-bit floating-point values. Therefore, the code to create the
-Feature Column is simply:
+feature column is:
 
 ```python
 # Feature columns describe how to use the input.
@@ -279,29 +288,29 @@ for key in train_x.keys():
     my_feature_columns.append(tf.feature_column.numeric_column(key=key))
 ```
 
-Feature Columns can be far more sophisticated than those we're showing here.
-We detail feature columns @{$get_started/feature_columns$later on} in
-getting started.
+Feature columns can be far more sophisticated than those we're showing here.  We
+detail feature columns @{$get_started/feature_columns$later on} in our Getting
+Started guide.
 
 Now that we have the description of how we want the model to represent the raw
 features, we can build the estimator.
 
 
-## Instantiate an Estimator
+## Instantiate an estimator
 
 The Iris problem is a classic classification problem. Fortunately, TensorFlow
 provides several pre-made classifier Estimators, including:
 
-* @{tf.estimator.DNNClassifier}—for deep models that perform multi-class
+* @{tf.estimator.DNNClassifier} for deep models that perform multi-class
   classification.
-* @{tf.estimator.DNNLinearCombinedClassifier}—for wide-n-deep models.
-* @{tf.estimator.LinearClassifier}— for classifiers based on linear models.
+* @{tf.estimator.DNNLinearCombinedClassifier} for wide & deep models.
+* @{tf.estimator.LinearClassifier} for classifiers based on linear models.
 
 For the Iris problem, `tf.estimator.DNNClassifier` seems like the best choice.
 Here's how we instantiated this Estimator:
 
 ```python
-# Build 2 hidden layer DNN with 10, 10 units respectively.
+# Build a DNN with 2 hidden layers and 10 nodes in each hidden layer.
 classifier = tf.estimator.DNNClassifier(
     feature_columns=my_feature_columns,
     # Two hidden layers of 10 nodes each.
@@ -363,7 +372,7 @@ Test set accuracy: 0.967
 
 We now have a trained model that produces good evaluation results.
 We can now use the trained model to predict the species of an Iris flower
-based on some unlabeled measurments. As with training and evaluation, we make
+based on some unlabeled measurements. As with training and evaluation, we make
 predictions using a single function call:
 
 ```python
diff --git a/tensorflow/docs_src/install/install_c.md b/tensorflow/docs_src/install/install_c.md
index ba1a4118aece1f42822f7cd084feed50c5cf6ebb..a783205b4a2d24182de6496e0173635990120185 100644
--- a/tensorflow/docs_src/install/install_c.md
+++ b/tensorflow/docs_src/install/install_c.md
@@ -38,7 +38,7 @@ enable TensorFlow for C:
          OS="linux" # Change to "darwin" for macOS
          TARGET_DIRECTORY="/usr/local"
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.5.0-rc1.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.6.0-rc0.tar.gz" |
            sudo tar -C $TARGET_DIRECTORY -xz
 
      The `tar` command extracts the TensorFlow C library into the `lib`
diff --git a/tensorflow/docs_src/install/install_go.md b/tensorflow/docs_src/install/install_go.md
index 87cc647317a11fab0d9d0219dd5764af3dcb2ecc..5249e04615b506186a12807bb71ec4079db8156c 100644
--- a/tensorflow/docs_src/install/install_go.md
+++ b/tensorflow/docs_src/install/install_go.md
@@ -38,7 +38,7 @@ steps to install this library and enable TensorFlow for Go:
          TF_TYPE="cpu" # Change to "gpu" for GPU support
          TARGET_DIRECTORY='/usr/local'
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.5.0-rc1.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.6.0-rc0.tar.gz" |
          sudo tar -C $TARGET_DIRECTORY -xz
 
      The `tar` command extracts the TensorFlow C library into the `lib`
diff --git a/tensorflow/docs_src/install/install_java.md b/tensorflow/docs_src/install/install_java.md
index 37e109a6e4bdee97ad02bc7aceb2c0c24e1ec7ec..0c6c773e62483b2272cf3b80da0932b4b800bb71 100644
--- a/tensorflow/docs_src/install/install_java.md
+++ b/tensorflow/docs_src/install/install_java.md
@@ -36,7 +36,7 @@ following to the project's `pom.xml` to use the TensorFlow Java APIs:
 <dependency>
   <groupId>org.tensorflow</groupId>
   <artifactId>tensorflow</artifactId>
-  <version>1.5.0-rc1</version>
+  <version>1.6.0-rc0</version>
 </dependency>
 ```
 
@@ -65,7 +65,7 @@ As an example, these steps will create a Maven project that uses TensorFlow:
                <dependency>
                  <groupId>org.tensorflow</groupId>
                  <artifactId>tensorflow</artifactId>
-                 <version>1.5.0-rc1</version>
+                 <version>1.6.0-rc0</version>
                </dependency>
              </dependencies>
          </project>
@@ -123,12 +123,12 @@ instead:
 <dependency>
   <groupId>org.tensorflow</groupId>
   <artifactId>libtensorflow</artifactId>
-  <version>1.5.0-rc1</version>
+  <version>1.6.0-rc0</version>
 </dependency>
 <dependency>
   <groupId>org.tensorflow</groupId>
   <artifactId>libtensorflow_jni_gpu</artifactId>
-  <version>1.5.0-rc1</version>
+  <version>1.6.0-rc0</version>
 </dependency>
 ```
 
@@ -147,7 +147,7 @@ refer to the simpler instructions above instead.
 Take the following steps to install TensorFlow for Java on Linux or macOS:
 
   1. Download
-     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.5.0-rc1.jar),
+     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.6.0-rc0.jar),
      which is the TensorFlow Java Archive (JAR).
 
   2. Decide whether you will run TensorFlow for Java on CPU(s) only or with
@@ -166,7 +166,7 @@ Take the following steps to install TensorFlow for Java on Linux or macOS:
          OS=$(uname -s | tr '[:upper:]' '[:lower:]')
          mkdir -p ./jni
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.5.0-rc1.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.6.0-rc0.tar.gz" |
            tar -xz -C ./jni
 
 ### Install on Windows
@@ -174,10 +174,10 @@ Take the following steps to install TensorFlow for Java on Linux or macOS:
 Take the following steps to install TensorFlow for Java on Windows:
 
   1. Download
-     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.5.0-rc1.jar),
+     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.6.0-rc0.jar),
      which is the TensorFlow Java Archive (JAR).
   2. Download the following Java Native Interface (JNI) file appropriate for
-     [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.5.0-rc1.zip).
+     [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.6.0-rc0.zip).
   3. Extract this .zip file.
 
 
@@ -225,7 +225,7 @@ must be part of your `classpath`. For example, you can include the
 downloaded `.jar` in your `classpath` by using the `-cp` compilation flag
 as follows:
 
-<pre><b>javac -cp libtensorflow-1.5.0-rc1.jar HelloTF.java</b></pre>
+<pre><b>javac -cp libtensorflow-1.6.0-rc0.jar HelloTF.java</b></pre>
 
 
 ### Running
@@ -239,11 +239,11 @@ two files are available to the JVM:
 For example, the following command line executes the `HelloTF` program on Linux
 and macOS X:
 
-<pre><b>java -cp libtensorflow-1.5.0-rc1.jar:. -Djava.library.path=./jni HelloTF</b></pre>
+<pre><b>java -cp libtensorflow-1.6.0-rc0.jar:. -Djava.library.path=./jni HelloTF</b></pre>
 
 And the following command line executes the `HelloTF` program on Windows:
 
-<pre><b>java -cp libtensorflow-1.5.0-rc1.jar;. -Djava.library.path=jni HelloTF</b></pre>
+<pre><b>java -cp libtensorflow-1.6.0-rc0.jar;. -Djava.library.path=jni HelloTF</b></pre>
 
 If the program prints <tt>Hello from <i>version</i></tt>, you've successfully
 installed TensorFlow for Java and are ready to use the API.  If the program
diff --git a/tensorflow/docs_src/install/install_linux.md b/tensorflow/docs_src/install/install_linux.md
index 03f12dff08cb3483666df4b8553b97fc1c4f34f9..105b225177315db07b1117c3ece4b77dd2b60cb2 100644
--- a/tensorflow/docs_src/install/install_linux.md
+++ b/tensorflow/docs_src/install/install_linux.md
@@ -31,13 +31,13 @@ If you are installing TensorFlow with GPU support using one of the
 mechanisms described in this guide, then the following NVIDIA software
 must be installed on your system:
 
-  * CUDA® Toolkit 8.0. For details, see
+  * CUDA® Toolkit 9.0. For details, see
     [NVIDIA's documentation](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/#axzz4VZnqTJ2A).
     Ensure that you append the relevant Cuda pathnames to the
     `LD_LIBRARY_PATH` environment variable as described in the
     NVIDIA documentation.
-  * The NVIDIA drivers associated with CUDA Toolkit 8.0.
-  * cuDNN v6.0. For details, see
+  * The NVIDIA drivers associated with CUDA Toolkit 9.0.
+  * cuDNN v7.0. For details, see
     [NVIDIA's documentation](https://developer.nvidia.com/cudnn).
     Ensure that you create the `CUDA_HOME` environment variable as
     described in the NVIDIA documentation.
@@ -188,7 +188,7 @@ Take the following steps to install TensorFlow with Virtualenv:
      Virtualenv environment:
 
      <pre>(tensorflow)$ <b>pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0rc1-cp34-cp34m-linux_x86_64.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.6.0rc0-cp34-cp34m-linux_x86_64.whl</b></pre>
 
 If you encounter installation problems, see
 [Common Installation Problems](#common_installation_problems).
@@ -293,7 +293,7 @@ take the following steps:
 
      <pre>
      $ <b>sudo pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0rc1-cp34-cp34m-linux_x86_64.whl</b>
+     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.6.0rc0-cp34-cp34m-linux_x86_64.whl</b>
      </pre>
 
      If this step fails, see
@@ -480,7 +480,7 @@ Take the following steps to install TensorFlow in an Anaconda environment:
 
      <pre>
      (tensorflow)$ <b>pip install --ignore-installed --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0rc1-cp34-cp34m-linux_x86_64.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.6.0rc0-cp34-cp34m-linux_x86_64.whl</b></pre>
 
 
 <a name="ValidateYourInstallation"></a>
@@ -648,14 +648,14 @@ This section documents the relevant values for Linux installations.
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0rc1-cp27-none-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.6.0rc0-cp27-none-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.5.0rc1-cp27-none-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.6.0rc0-cp27-none-linux_x86_64.whl
 </pre>
 
 Note that GPU support requires the NVIDIA hardware and software described in
@@ -667,14 +667,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0rc1-cp34-cp34m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.6.0rc0-cp34-cp34m-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.5.0rc1-cp34-cp34m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.6.0rc0-cp34-cp34m-linux_x86_64.whl
 </pre>
 
 Note that GPU support requires the NVIDIA hardware and software described in
@@ -686,14 +686,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0rc1-cp35-cp35m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.6.0rc0-cp35-cp35m-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.5.0rc1-cp35-cp35m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.6.0rc0-cp35-cp35m-linux_x86_64.whl
 </pre>
 
 
@@ -705,14 +705,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0rc1-cp36-cp36m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.6.0rc0-cp36-cp36m-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.5.0rc1-cp36-cp36m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.6.0rc0-cp36-cp36m-linux_x86_64.whl
 </pre>
 
 
diff --git a/tensorflow/docs_src/install/install_mac.md b/tensorflow/docs_src/install/install_mac.md
index e13ddadab7b0e2ed96bdaf5600b3479a4b5eec55..a6ea548cfbdb3070c19b5c19ebc903ca76a4656a 100644
--- a/tensorflow/docs_src/install/install_mac.md
+++ b/tensorflow/docs_src/install/install_mac.md
@@ -115,7 +115,7 @@ Take the following steps to install TensorFlow with Virtualenv:
      TensorFlow in the active Virtualenv is as follows:
 
      <pre> $ <b>pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0rc1-py2-none-any.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.6.0rc0-py3-none-any.whl</b></pre>
 
 If you encounter installation problems, see
 [Common Installation Problems](#common-installation-problems).
@@ -238,7 +238,7 @@ take the following steps:
      issue the following command:
 
      <pre> $ <b>sudo pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0rc1-py2-none-any.whl</b> </pre>
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.6.0rc0-py3-none-any.whl</b> </pre>
 
      If the preceding command fails, see
      [installation problems](#common-installation-problems).
@@ -347,7 +347,7 @@ Take the following steps to install TensorFlow in an Anaconda environment:
      TensorFlow for Python 2.7:
 
      <pre> (<i>targetDirectory</i>)$ <b>pip install --ignore-installed --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0rc1-py2-none-any.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.6.0rc0-py2-none-any.whl</b></pre>
 
 
 <a name="ValidateYourInstallation"></a>
@@ -520,7 +520,7 @@ This section documents the relevant values for Mac OS installations.
 
 
 <pre>
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0rc1-py2-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.6.0rc0-py2-none-any.whl
 </pre>
 
 
@@ -528,5 +528,5 @@ https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0rc1-py2-none-a
 
 
 <pre>
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0rc1-py3-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.6.0rc0-py3-none-any.whl
 </pre>
diff --git a/tensorflow/docs_src/install/install_sources.md b/tensorflow/docs_src/install/install_sources.md
index f494cc7a7c0575fd7950b6fe28d7671e1f25725f..36dffd85dced6b18b6c073856df681006fcb075e 100644
--- a/tensorflow/docs_src/install/install_sources.md
+++ b/tensorflow/docs_src/install/install_sources.md
@@ -133,7 +133,7 @@ The following NVIDIA <i>hardware</i> must be installed on your system:
 
 The following NVIDIA <i>software</i> must be installed on your system:
 
-  * NVIDIA's Cuda Toolkit (>= 7.0). We recommend version 8.0.
+  * NVIDIA's Cuda Toolkit (>= 7.0). We recommend version 9.0.
     For details, see
     [NVIDIA's documentation](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/#axzz4VZnqTJ2A).
     Ensure that you append the relevant Cuda pathnames to the
@@ -272,8 +272,6 @@ Found possible Python library paths:
 Please input the desired Python library path to use.  Default is [/usr/lib/python2.7/dist-packages]
 
 Using python library path: /usr/local/lib/python2.7/dist-packages
-Do you wish to build TensorFlow with MKL support? [y/N]
-No MKL support will be enabled for TensorFlow
 Please specify optimization flags to use during compilation when bazel option "--config=opt" is specified [Default is -march=native]:
 Do you wish to use jemalloc as the malloc implementation? [Y/n]
 jemalloc enabled
@@ -291,11 +289,11 @@ Do you wish to build TensorFlow with CUDA support? [y/N] <b>Y</b>
 CUDA support will be enabled for TensorFlow
 Do you want to use clang as CUDA compiler? [y/N]
 nvcc will be used as CUDA compiler
-Please specify the Cuda SDK version you want to use, e.g. 7.0. [Leave empty to default to CUDA 8.0]: <b>8.0</b>
-Please specify the location where CUDA 8.0 toolkit is installed. Refer to README.md for more details. [Default is /usr/local/cuda]:
+Please specify the Cuda SDK version you want to use, e.g. 7.0. [Leave empty to default to CUDA 9.0]: <b>9.0</b>
+Please specify the location where CUDA 9.0 toolkit is installed. Refer to README.md for more details. [Default is /usr/local/cuda]:
 Please specify which gcc should be used by nvcc as the host compiler. [Default is /usr/bin/gcc]:
-Please specify the cuDNN version you want to use. [Leave empty to default to cuDNN 6.0]: <b>6</b>
-Please specify the location where cuDNN 6 library is installed. Refer to README.md for more details. [Default is /usr/local/cuda]:
+Please specify the cuDNN version you want to use. [Leave empty to default to cuDNN 7.0]: <b>7</b>
+Please specify the location where cuDNN 7 library is installed. Refer to README.md for more details. [Default is /usr/local/cuda]:
 Please specify a list of comma-separated Cuda compute capabilities you want to build with.
 You can find the compute capability of your device at: https://developer.nvidia.com/cuda-gpus.
 Please note that each additional compute capability significantly increases your build time and binary size.
@@ -361,10 +359,10 @@ Invoke `pip install` to install that pip package.
 The filename of the `.whl` file depends on your platform.
 For example, the following command will install the pip package
 
-for TensorFlow 1.5.0rc1 on Linux:
+for TensorFlow 1.6.0rc0 on Linux:
 
 <pre>
-$ <b>sudo pip install /tmp/tensorflow_pkg/tensorflow-1.5.0rc1-py2-none-any.whl</b>
+$ <b>sudo pip install /tmp/tensorflow_pkg/tensorflow-1.6.0rc0-py2-none-any.whl</b>
 </pre>
 
 ## Validate your installation
@@ -462,9 +460,10 @@ Stack Overflow and specify the `tensorflow` tag.
 **Linux**
 <table>
 <tr><th>Version:</th><th>CPU/GPU:</th><th>Python Version:</th><th>Compiler:</th><th>Build Tools:</th><th>cuDNN:</th><th>CUDA:</th></tr>
-
-<tr><td>tensorflow-1.5.0-rc1</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.8.0</td><td>N/A</td><td>N/A</td></tr>
-<tr><td>tensorflow_gpu-1.5.0-rc1</td><td>GPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.8.0</td><td>7</td><td>9</td></tr>
+<tr><td>tensorflow-1.6.0rc0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.9.0</td><td>N/A</td><td>N/A</td></tr>
+<tr><td>tensorflow_gpu-1.6.0rc0</td><td>GPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.9.0</td><td>7</td><td>9</td></tr>
+<tr><td>tensorflow-1.5.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.8.0</td><td>N/A</td><td>N/A</td></tr>
+<tr><td>tensorflow_gpu-1.5.0</td><td>GPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.8.0</td><td>7</td><td>9</td></tr>
 <tr><td>tensorflow-1.4.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.5.4</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow_gpu-1.4.0</td><td>GPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.5.4</td><td>6</td><td>8</td></tr>
 <tr><td>tensorflow-1.3.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.4.5</td><td>N/A</td><td>N/A</td></tr>
@@ -480,7 +479,8 @@ Stack Overflow and specify the `tensorflow` tag.
 **Mac**
 <table>
 <tr><th>Version:</th><th>CPU/GPU:</th><th>Python Version:</th><th>Compiler:</th><th>Build Tools:</th><th>cuDNN:</th><th>CUDA:</th></tr>
-<tr><td>tensorflow-1.5.0-rc1</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.8.1</td><td>N/A</td><td>N/A</td></tr>
+<tr><td>tensorflow-1.6.0rc0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.8.1</td><td>N/A</td><td>N/A</td></tr>
+<tr><td>tensorflow-1.5.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.8.1</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow-1.4.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.5.4</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow-1.3.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.4.5</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow-1.2.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.4.5</td><td>N/A</td><td>N/A</td></tr>
@@ -493,8 +493,10 @@ Stack Overflow and specify the `tensorflow` tag.
 **Windows**
 <table>
 <tr><th>Version:</th><th>CPU/GPU:</th><th>Python Version:</th><th>Compiler:</th><th>Build Tools:</th><th>cuDNN:</th><th>CUDA:</th></tr>
-<tr><td>tensorflow-1.5.0-rc1</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
-<tr><td>tensorflow_gpu-1.5.0-rc1</td><td>GPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>7</td><td>9</td></tr>
+<tr><td>tensorflow-1.6.0rc0</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
+<tr><td>tensorflow_gpu-1.6.0rc0</td><td>GPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>7</td><td>9</td></tr>
+<tr><td>tensorflow-1.5.0</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
+<tr><td>tensorflow_gpu-1.5.0</td><td>GPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>7</td><td>9</td></tr>
 <tr><td>tensorflow-1.4.0</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow_gpu-1.4.0</td><td>GPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>6</td><td>8</td></tr>
 <tr><td>tensorflow-1.3.0</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
diff --git a/tensorflow/docs_src/install/install_windows.md b/tensorflow/docs_src/install/install_windows.md
index 8d0eb7966fdf17be1c259627a64803f0a392943a..86a111c2ec1ee1e126b3e96c83eae68da7da7e41 100644
--- a/tensorflow/docs_src/install/install_windows.md
+++ b/tensorflow/docs_src/install/install_windows.md
@@ -30,13 +30,13 @@ If you are installing TensorFlow with GPU support using one of the mechanisms
 described in this guide, then the following NVIDIA software must be
 installed on your system:
 
-  * CUDA® Toolkit 8.0. For details, see
+  * CUDA® Toolkit 9.0. For details, see
     [NVIDIA's
     documentation](http://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/)
     Ensure that you append the relevant Cuda pathnames to the `%PATH%`
     environment variable as described in the NVIDIA documentation.
-  * The NVIDIA drivers associated with CUDA Toolkit 8.0.
-  * cuDNN v6.0. For details, see
+  * The NVIDIA drivers associated with CUDA Toolkit 9.0.
+  * cuDNN v7.0. For details, see
     [NVIDIA's documentation](https://developer.nvidia.com/cudnn).
     Note that cuDNN is typically installed in a different location from the
     other CUDA DLLs. Ensure that you add the directory where you installed
diff --git a/tensorflow/docs_src/performance/performance_guide.md b/tensorflow/docs_src/performance/performance_guide.md
index 10e7ad7ada533c8da5e5b871b38809b90604685e..cd47fc2803bc1429d28bd0ae4c2ad68e632a6f03 100644
--- a/tensorflow/docs_src/performance/performance_guide.md
+++ b/tensorflow/docs_src/performance/performance_guide.md
@@ -498,7 +498,7 @@ For TensorFlow source versions after 1.3.0:
 ```bash
 ./configure
 # Pick the desired options
-bazel build --config=mkl -c opt //tensorflow/tools/pip_package:build_pip_package
+bazel build --config=mkl --config=opt //tensorflow/tools/pip_package:build_pip_package
 
 ```
 
diff --git a/tensorflow/docs_src/programmers_guide/graphs.md b/tensorflow/docs_src/programmers_guide/graphs.md
index 2b4896c381052b5a3fb97385a18dbff82c2c0d89..9049a5a9f3d44e255188c6c41cdb12a619464379 100644
--- a/tensorflow/docs_src/programmers_guide/graphs.md
+++ b/tensorflow/docs_src/programmers_guide/graphs.md
@@ -125,14 +125,14 @@ an operation:
   @{tf.Tensor} accepts an optional `name` argument. For example,
   `tf.constant(42.0, name="answer")` creates a new @{tf.Operation} named
   `"answer"` and returns a @{tf.Tensor} named `"answer:0"`. If the default graph
-  already contained an operation named `"answer"`, the TensorFlow would append
+  already contains an operation named `"answer"`, then TensorFlow would append
   `"_1"`, `"_2"`, and so on to the name, in order to make it unique.
 
 * The @{tf.name_scope} function makes it possible to add a **name scope** prefix
   to all operations created in a particular context. The current name scope
   prefix is a `"/"`-delimited list of the names of all active @{tf.name_scope}
   context managers. If a name scope has already been used in the current
-  context, TensorFlow appens `"_1"`, `"_2"`, and so on. For example:
+  context, TensorFlow appends `"_1"`, `"_2"`, and so on. For example:
 
   ```python
   c_0 = tf.constant(0, name="c")  # => operation named "c"
diff --git a/tensorflow/docs_src/programmers_guide/saved_model.md b/tensorflow/docs_src/programmers_guide/saved_model.md
index 9f50be5b31cd8b61b81426f50aa9ef9beb3138f2..f27a658342b8d33407e1c6ed5799a10c2305a74c 100644
--- a/tensorflow/docs_src/programmers_guide/saved_model.md
+++ b/tensorflow/docs_src/programmers_guide/saved_model.md
@@ -285,7 +285,7 @@ with tf.Session(graph=tf.Graph()) as sess:
 ```
 
 
-### Loading a Savedmodel in C++
+### Loading a SavedModel in C++
 
 The C++ version of the SavedModel
 [loader](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/cc/saved_model/loader.h)
@@ -303,6 +303,30 @@ LoadSavedModel(session_options, run_options, export_dir, {kSavedModelTagTrain},
                &bundle);
 ```
 
+### Loading and Serving a SavedModel in TensorFlow Serving
+
+You can easily load and serve a SavedModel with the TensorFlow Serving Model
+Server binary. See [instructions](https://www.tensorflow.org/serving/setup#installing_using_apt-get)
+on how to install the server, or build it if you wish.
+
+Once you have the Model Server, run it with:
+```
+tensorflow_model_server --port=port-numbers --model_name=your-model-name --model_base_path=your_model_base_path
+```
+Set the port and model_name flags to values of your choosing. The
+model_base_path flag expects to be to a base directory, with each version of
+your model residing in a numerically named subdirectory. If you only have a
+single version of your model, simply place it in a subdirectory like so:
+* Place the model in /tmp/model/0001
+* Set model_base_path to /tmp/model
+
+Store different versions of your model in numerically named subdirectories of a
+common base directory. For example, suppose the base directory is `/tmp/model`.
+If you have only one version of your model, store it in `/tmp/model/0001`. If
+you have two versions of your model, store the second version in
+`/tmp/model/0002`, and so on.  Set the `--model-base_path` flag to the base
+directory (`/tmp/model`, in this example).  TensorFlow Model Server will serve
+the model in the highest numbered subdirectory of that base directory.
 
 ### Standard constants
 
diff --git a/tensorflow/examples/android/build.gradle b/tensorflow/examples/android/build.gradle
index f7bdf8b816a8191770bc1ad59b890041b8e39912..0767726aa9a248fb073fbd4114f47d1b4ed6901b 100644
--- a/tensorflow/examples/android/build.gradle
+++ b/tensorflow/examples/android/build.gradle
@@ -56,10 +56,12 @@ def nativeOutDir = 'libs/' + cpuType
 def nativeBuildRule = 'buildNativeBazel'
 def demoLibPath = '../../../bazel-bin/tensorflow/examples/android/libtensorflow_demo.so'
 def inferenceLibPath = '../../../bazel-bin/tensorflow/contrib/android/libtensorflow_inference.so'
+
+// Override for Makefile builds.
 if (nativeBuildSystem == 'makefile') {
     nativeBuildRule = 'buildNativeMake'
-    demoLibPath = '../../../tensorflow/contrib/makefile/gen/lib/libtensorflow_demo.so'
-    inferenceLibPath = '../../../tensorflow/contrib/makefile/gen/lib/libtensorflow_inference.so'
+    demoLibPath = '../../../tensorflow/contrib/makefile/gen/lib/android_' + cpuType + '/libtensorflow_demo.so'
+    inferenceLibPath = '../../../tensorflow/contrib/makefile/gen/lib/android_' + cpuType + '/libtensorflow_inference.so'
 }
 
 // If building with Bazel, this is the location of the bazel binary.
@@ -154,7 +156,8 @@ task buildNativeMake(type: Exec) {
          '-s',  \
          'tensorflow/contrib/makefile/sub_makefiles/android/Makefile.in',  \
          '-t',  \
-         'libtensorflow_inference.so libtensorflow_demo.so'  \
+         'libtensorflow_inference.so libtensorflow_demo.so all'  \
+         , '-a', cpuType  \
          //, '-T'  // Uncomment to skip protobuf and speed up subsequent builds.
 }
 
diff --git a/tensorflow/examples/android/src/org/tensorflow/demo/LegacyCameraConnectionFragment.java b/tensorflow/examples/android/src/org/tensorflow/demo/LegacyCameraConnectionFragment.java
index a317273acdff016c824031e06c413ecc01f82ec8..bc0c738e53739a516bef268e6551cbb7741a6298 100644
--- a/tensorflow/examples/android/src/org/tensorflow/demo/LegacyCameraConnectionFragment.java
+++ b/tensorflow/examples/android/src/org/tensorflow/demo/LegacyCameraConnectionFragment.java
@@ -81,8 +81,11 @@ public class LegacyCameraConnectionFragment extends Fragment {
 
           try {
             Camera.Parameters parameters = camera.getParameters();
-            parameters.setFocusMode(Camera.Parameters.FOCUS_MODE_CONTINUOUS_PICTURE);
-
+            List<String> focusModes = parameters.getSupportedFocusModes();
+            if (focusModes != null 
+                && focusModes.contains(Camera.Parameters.FOCUS_MODE_CONTINUOUS_PICTURE)) {
+              parameters.setFocusMode(Camera.Parameters.FOCUS_MODE_CONTINUOUS_PICTURE);
+            }
             List<Camera.Size> cameraSizes = parameters.getSupportedPreviewSizes();
             Size[] sizes = new Size[cameraSizes.size()];
             int i = 0;
diff --git a/tensorflow/examples/android/src/org/tensorflow/demo/tracking/MultiBoxTracker.java b/tensorflow/examples/android/src/org/tensorflow/demo/tracking/MultiBoxTracker.java
index 2fe2ba539edc84e80baf36b6d1ac1e192bc92163..af6af2bc8f508a70aa7e44a7236f0e7ea5e3d71c 100644
--- a/tensorflow/examples/android/src/org/tensorflow/demo/tracking/MultiBoxTracker.java
+++ b/tensorflow/examples/android/src/org/tensorflow/demo/tracking/MultiBoxTracker.java
@@ -199,7 +199,7 @@ public class MultiBoxTracker {
       final int w,
       final int h,
       final int rowStride,
-      final int sensorOrienation,
+      final int sensorOrientation,
       final byte[] frame,
       final long timestamp) {
     if (objectTracker == null && !initialized) {
@@ -209,7 +209,7 @@ public class MultiBoxTracker {
       objectTracker = ObjectTracker.getInstance(w, h, rowStride, true);
       frameWidth = w;
       frameHeight = h;
-      this.sensorOrientation = sensorOrienation;
+      this.sensorOrientation = sensorOrientation;
       initialized = true;
 
       if (objectTracker == null) {
diff --git a/tensorflow/examples/how_tos/reading_data/fully_connected_reader.py b/tensorflow/examples/how_tos/reading_data/fully_connected_reader.py
index fa4c1c0da5f31863aa4d99b6ec84e1e50e1a1551..461fb1c5173f66278eb585d30bd8749a58fb6245 100644
--- a/tensorflow/examples/how_tos/reading_data/fully_connected_reader.py
+++ b/tensorflow/examples/how_tos/reading_data/fully_connected_reader.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Train and Eval the MNIST network.
 
 This version is like fully_connected_feed.py but uses data converted
@@ -65,6 +64,7 @@ def decode(serialized_example):
 
   return image, label
 
+
 def augment(image, label):
   # OPTIONAL: Could reshape into a 28x28 image and apply distortions
   # here.  Since we are not applying any distortions in this
@@ -72,12 +72,14 @@ def augment(image, label):
   # into a vector, we don't bother.
   return image, label
 
+
 def normalize(image, label):
   # Convert from [0, 255] -> [-0.5, 0.5] floats.
   image = tf.cast(image, tf.float32) * (1. / 255) - 0.5
 
   return image, label
 
+
 def inputs(train, batch_size, num_epochs):
   """Reads input data num_epochs times.
 
@@ -98,9 +100,10 @@ def inputs(train, batch_size, num_epochs):
     over the dataset once. On the other hand there is no special initialization
     required.
   """
-  if not num_epochs: num_epochs = None
-  filename = os.path.join(FLAGS.train_dir,
-                          TRAIN_FILE if train else VALIDATION_FILE)
+  if not num_epochs:
+    num_epochs = None
+  filename = os.path.join(FLAGS.train_dir, TRAIN_FILE
+                          if train else VALIDATION_FILE)
 
   with tf.name_scope('input'):
     # TFRecordDataset opens a protobuf and reads entries line by line
@@ -127,13 +130,11 @@ def run_training():
   # Tell TensorFlow that the model will be built into the default Graph.
   with tf.Graph().as_default():
     # Input images and labels.
-    image_batch, label_batch = inputs(train=True, batch_size=FLAGS.batch_size,
-                               num_epochs=FLAGS.num_epochs)
+    image_batch, label_batch = inputs(
+        train=True, batch_size=FLAGS.batch_size, num_epochs=FLAGS.num_epochs)
 
     # Build a Graph that computes predictions from the inference model.
-    logits = mnist.inference(image_batch,
-                             FLAGS.hidden1,
-                             FLAGS.hidden2)
+    logits = mnist.inference(image_batch, FLAGS.hidden1, FLAGS.hidden2)
 
     # Add to the Graph the loss calculation.
     loss = mnist.loss(logits, label_batch)
@@ -152,7 +153,7 @@ def run_training():
       sess.run(init_op)
       try:
         step = 0
-        while True: #train until OutOfRangeError
+        while True:  #train until OutOfRangeError
           start_time = time.time()
 
           # Run one step of the model.  The return values are
@@ -168,10 +169,12 @@ def run_training():
           # Print an overview fairly often.
           if step % 100 == 0:
             print('Step %d: loss = %.2f (%.3f sec)' % (step, loss_value,
-                                                     duration))
+                                                       duration))
           step += 1
       except tf.errors.OutOfRangeError:
-        print('Done training for %d epochs, %d steps.' % (FLAGS.num_epochs, step))
+        print('Done training for %d epochs, %d steps.' % (FLAGS.num_epochs,
+                                                          step))
+
 
 def main(_):
   run_training()
@@ -183,37 +186,27 @@ if __name__ == '__main__':
       '--learning_rate',
       type=float,
       default=0.01,
-      help='Initial learning rate.'
-  )
+      help='Initial learning rate.')
   parser.add_argument(
       '--num_epochs',
       type=int,
       default=2,
-      help='Number of epochs to run trainer.'
-  )
+      help='Number of epochs to run trainer.')
   parser.add_argument(
       '--hidden1',
       type=int,
       default=128,
-      help='Number of units in hidden layer 1.'
-  )
+      help='Number of units in hidden layer 1.')
   parser.add_argument(
       '--hidden2',
       type=int,
       default=32,
-      help='Number of units in hidden layer 2.'
-  )
-  parser.add_argument(
-      '--batch_size',
-      type=int,
-      default=100,
-      help='Batch size.'
-  )
+      help='Number of units in hidden layer 2.')
+  parser.add_argument('--batch_size', type=int, default=100, help='Batch size.')
   parser.add_argument(
       '--train_dir',
       type=str,
       default='/tmp/data',
-      help='Directory with the training data.'
-  )
+      help='Directory with the training data.')
   FLAGS, unparsed = parser.parse_known_args()
   tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tensorflow/examples/label_image/label_image.py b/tensorflow/examples/label_image/label_image.py
index d62b73384c4969dc56a2f91d89719ba02a8f9431..1c1bd57d715ae39539d9cec1347caa702a8261b9 100644
--- a/tensorflow/examples/label_image/label_image.py
+++ b/tensorflow/examples/label_image/label_image.py
@@ -23,6 +23,7 @@ import sys
 import numpy as np
 import tensorflow as tf
 
+
 def load_graph(model_file):
   graph = tf.Graph()
   graph_def = tf.GraphDef()
@@ -34,22 +35,26 @@ def load_graph(model_file):
 
   return graph
 
-def read_tensor_from_image_file(file_name, input_height=299, input_width=299,
-				input_mean=0, input_std=255):
+
+def read_tensor_from_image_file(file_name,
+                                input_height=299,
+                                input_width=299,
+                                input_mean=0,
+                                input_std=255):
   input_name = "file_reader"
   output_name = "normalized"
   file_reader = tf.read_file(file_name, input_name)
   if file_name.endswith(".png"):
-    image_reader = tf.image.decode_png(file_reader, channels = 3,
-                                       name='png_reader')
+    image_reader = tf.image.decode_png(
+        file_reader, channels=3, name="png_reader")
   elif file_name.endswith(".gif"):
-    image_reader = tf.squeeze(tf.image.decode_gif(file_reader,
-                                                  name='gif_reader'))
+    image_reader = tf.squeeze(
+        tf.image.decode_gif(file_reader, name="gif_reader"))
   elif file_name.endswith(".bmp"):
-    image_reader = tf.image.decode_bmp(file_reader, name='bmp_reader')
+    image_reader = tf.image.decode_bmp(file_reader, name="bmp_reader")
   else:
-    image_reader = tf.image.decode_jpeg(file_reader, channels = 3,
-                                        name='jpeg_reader')
+    image_reader = tf.image.decode_jpeg(
+        file_reader, channels=3, name="jpeg_reader")
   float_caster = tf.cast(image_reader, tf.float32)
   dims_expander = tf.expand_dims(float_caster, 0)
   resized = tf.image.resize_bilinear(dims_expander, [input_height, input_width])
@@ -59,6 +64,7 @@ def read_tensor_from_image_file(file_name, input_height=299, input_width=299,
 
   return result
 
+
 def load_labels(label_file):
   label = []
   proto_as_ascii_lines = tf.gfile.GFile(label_file).readlines()
@@ -66,6 +72,7 @@ def load_labels(label_file):
     label.append(l.rstrip())
   return label
 
+
 if __name__ == "__main__":
   file_name = "tensorflow/examples/label_image/data/grace_hopper.jpg"
   model_file = \
@@ -110,11 +117,12 @@ if __name__ == "__main__":
     output_layer = args.output_layer
 
   graph = load_graph(model_file)
-  t = read_tensor_from_image_file(file_name,
-                                  input_height=input_height,
-                                  input_width=input_width,
-                                  input_mean=input_mean,
-                                  input_std=input_std)
+  t = read_tensor_from_image_file(
+      file_name,
+      input_height=input_height,
+      input_width=input_width,
+      input_mean=input_mean,
+      input_std=input_std)
 
   input_name = "import/" + input_layer
   output_name = "import/" + output_layer
@@ -122,8 +130,9 @@ if __name__ == "__main__":
   output_operation = graph.get_operation_by_name(output_name)
 
   with tf.Session(graph=graph) as sess:
-    results = sess.run(output_operation.outputs[0],
-                      {input_operation.outputs[0]: t})
+    results = sess.run(output_operation.outputs[0], {
+        input_operation.outputs[0]: t
+    })
   results = np.squeeze(results)
 
   top_k = results.argsort()[-5:][::-1]
diff --git a/tensorflow/examples/learn/text_classification.py b/tensorflow/examples/learn/text_classification.py
index eb117c39a122f4f6c108dd18f8f8035edf05eaa1..e4e61862b02f9827f42c8d0052a7be8a57502dd8 100644
--- a/tensorflow/examples/learn/text_classification.py
+++ b/tensorflow/examples/learn/text_classification.py
@@ -34,8 +34,7 @@ MAX_LABEL = 15
 WORDS_FEATURE = 'words'  # Name of the input words feature.
 
 
-def estimator_spec_for_softmax_classification(
-    logits, labels, mode):
+def estimator_spec_for_softmax_classification(logits, labels, mode):
   """Returns EstimatorSpec instance for softmax classification."""
   predicted_classes = tf.argmax(logits, 1)
   if mode == tf.estimator.ModeKeys.PREDICT:
@@ -53,8 +52,8 @@ def estimator_spec_for_softmax_classification(
     return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
 
   eval_metric_ops = {
-      'accuracy': tf.metrics.accuracy(
-          labels=labels, predictions=predicted_classes)
+      'accuracy':
+          tf.metrics.accuracy(labels=labels, predictions=predicted_classes)
   }
   return tf.estimator.EstimatorSpec(
       mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)
@@ -67,8 +66,7 @@ def bag_of_words_model(features, labels, mode):
   bow_embedding_column = tf.feature_column.embedding_column(
       bow_column, dimension=EMBEDDING_SIZE)
   bow = tf.feature_column.input_layer(
-      features,
-      feature_columns=[bow_embedding_column])
+      features, feature_columns=[bow_embedding_column])
   logits = tf.layers.dense(bow, MAX_LABEL, activation=None)
 
   return estimator_spec_for_softmax_classification(
@@ -110,9 +108,9 @@ def main(unused_argv):
   # Prepare training and testing data
   dbpedia = tf.contrib.learn.datasets.load_dataset(
       'dbpedia', test_with_fake_data=FLAGS.test_with_fake_data)
-  x_train = pandas.Series(dbpedia.train.data[:,1])
+  x_train = pandas.Series(dbpedia.train.data[:, 1])
   y_train = pandas.Series(dbpedia.train.target)
-  x_test = pandas.Series(dbpedia.test.data[:,1])
+  x_test = pandas.Series(dbpedia.test.data[:, 1])
   y_test = pandas.Series(dbpedia.test.target)
 
   # Process vocabulary
@@ -152,10 +150,7 @@ def main(unused_argv):
 
   # Predict.
   test_input_fn = tf.estimator.inputs.numpy_input_fn(
-      x={WORDS_FEATURE: x_test},
-      y=y_test,
-      num_epochs=1,
-      shuffle=False)
+      x={WORDS_FEATURE: x_test}, y=y_test, num_epochs=1, shuffle=False)
   predictions = classifier.predict(input_fn=test_input_fn)
   y_predicted = np.array(list(p['class'] for p in predictions))
   y_predicted = y_predicted.reshape(np.array(y_test).shape)
diff --git a/tensorflow/examples/tutorials/mnist/mnist_softmax.py b/tensorflow/examples/tutorials/mnist/mnist_softmax.py
index fb3ac942039e670fb5ca975c5d9835ba065190a2..47dd6a1947811765101529826c2b24d9798fef1f 100644
--- a/tensorflow/examples/tutorials/mnist/mnist_softmax.py
+++ b/tensorflow/examples/tutorials/mnist/mnist_softmax.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """A very simple MNIST classifier.
 
 See extensive documentation at
@@ -67,12 +66,19 @@ def main(_):
   # Test trained model
   correct_prediction = tf.equal(tf.argmax(y, 1), y_)
   accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
-  print(sess.run(accuracy, feed_dict={x: mnist.test.images,
-                                      y_: mnist.test.labels}))
+  print(sess.run(
+      accuracy, feed_dict={
+          x: mnist.test.images,
+          y_: mnist.test.labels
+      }))
+
 
 if __name__ == '__main__':
   parser = argparse.ArgumentParser()
-  parser.add_argument('--data_dir', type=str, default='/tmp/tensorflow/mnist/input_data',
-                      help='Directory for storing input data')
+  parser.add_argument(
+      '--data_dir',
+      type=str,
+      default='/tmp/tensorflow/mnist/input_data',
+      help='Directory for storing input data')
   FLAGS, unparsed = parser.parse_known_args()
   tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tensorflow/examples/tutorials/word2vec/word2vec_basic.py b/tensorflow/examples/tutorials/word2vec/word2vec_basic.py
index d055d157454d4cb351e8db59eec484f212893fe5..f6906b0f79b86910b5354bea420d00f62ff0caf8 100644
--- a/tensorflow/examples/tutorials/word2vec/word2vec_basic.py
+++ b/tensorflow/examples/tutorials/word2vec/word2vec_basic.py
@@ -270,12 +270,6 @@ with tf.Session(graph=graph) as session:
         run_metadata=run_metadata)
     average_loss += loss_val
 
-    # Add returned summaries to writer in each step.
-    writer.add_summary(summary, step)
-    # Add metadata to visualize the graph for the last run.
-    if step == (num_steps - 1):
-      writer.add_run_metadata(run_metadata, 'step%d' % step)
-
     # Add returned summaries to writer in each step.
     writer.add_summary(summary, step)
     # Add metadata to visualize the graph for the last run.
diff --git a/tensorflow/examples/udacity/Dockerfile b/tensorflow/examples/udacity/Dockerfile
index 3ca58566c1ddb4c2446f7d9b19ee31fb8b603909..00eb853e527c922121fae6dc5eab42c589b0b238 100644
--- a/tensorflow/examples/udacity/Dockerfile
+++ b/tensorflow/examples/udacity/Dockerfile
@@ -8,7 +8,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
 
-RUN pip install scikit-learn pyreadline Pillow
+RUN pip install scikit-learn pyreadline Pillow imageio
 RUN rm -rf /notebooks/*
 ADD *.ipynb /notebooks/
 WORKDIR /notebooks
diff --git a/tensorflow/go/graph.go b/tensorflow/go/graph.go
index fc087d9d995dfe031e61fd0fa15d649c2ee35cc9..08943a527cbdc072b12b066240c213be45ffd54c 100644
--- a/tensorflow/go/graph.go
+++ b/tensorflow/go/graph.go
@@ -173,7 +173,11 @@ type OpSpec struct {
 	// operation.
 	Attrs map[string]interface{}
 
-	// Other possible fields: Device, ColocateWith, ControlInputs.
+	// Operations that must be executed before executing the operation
+	// being added.
+	ControlDependencies []*Operation
+
+	// Other possible fields: Device, ColocateWith.
 }
 
 // AddOperation adds an operation to g.
@@ -204,6 +208,9 @@ func (g *Graph) AddOperation(args OpSpec) (*Operation, error) {
 			}
 		}
 	}
+	for _, in := range args.ControlDependencies {
+		C.TF_AddControlInput(cdesc, in.c)
+	}
 	status := newStatus()
 	for name, value := range args.Attrs {
 		if err := setAttr(cdesc, status, name, value); err != nil {
diff --git a/tensorflow/go/op/scope.go b/tensorflow/go/op/scope.go
index a9ec79463a00022bf85bf00032df9004648525ae..13de4294dc2ebdfff9bb68d277c09239d0bc8593 100644
--- a/tensorflow/go/op/scope.go
+++ b/tensorflow/go/op/scope.go
@@ -33,10 +33,11 @@ import (
 // A Scope object and all its derivates (e.g., obtained from Scope.SubScope)
 // are not safe for concurrent use by multiple goroutines.
 type Scope struct {
-	graph     *tf.Graph
-	namemap   map[string]int
-	namespace string
-	err       *scopeErr
+	graph               *tf.Graph
+	namemap             map[string]int
+	namespace           string
+	controlDependencies []*tf.Operation
+	err                 *scopeErr
 }
 
 // scopeErr is used to share errors between all derivatives of a root scope.
@@ -80,6 +81,7 @@ func (s *Scope) AddOperation(args tf.OpSpec) *tf.Operation {
 	if s.namespace != "" {
 		args.Name = s.namespace + "/" + args.Name
 	}
+	args.ControlDependencies = append(args.ControlDependencies, s.controlDependencies...)
 	op, err := s.graph.AddOperation(args)
 	if err != nil {
 		s.UpdateErr(args.Type, err)
@@ -103,6 +105,28 @@ func (s *Scope) SubScope(namespace string) *Scope {
 	}
 }
 
+// WithControlDependencies returns a new Scope which will cause all operations
+// added to the graph to execute only after all the provided operations have
+// executed first (in addition to any other control dependencies in s).
+func (s *Scope) WithControlDependencies(ops ...*tf.Operation) *Scope {
+	// Force a copy of the control dependencies into a new underlying array on
+	// every call.  We cannot alias the same underlying array as `ops`, otherwise
+	// the user could modify that array after calling s.WithControlDependencies,
+	// which would be confusing.  We cannot alias the same underlying array as the
+	// original `s.controlDependencies`, since Scopes form a logical tree, and
+	// other calls to s.WithControlDependencies could stomp on each other.
+	deps := make([]*tf.Operation, 0, len(s.controlDependencies)+len(ops))
+	deps = append(deps, s.controlDependencies...)
+	deps = append(deps, ops...)
+	return &Scope{
+		graph:               s.graph,
+		namemap:             s.namemap,
+		namespace:           s.namespace,
+		controlDependencies: deps,
+		err:                 s.err,
+	}
+}
+
 // Err returns the error, if any, encountered during the construction
 // of the Graph managed by s.
 //
diff --git a/tensorflow/go/op/scope_test.go b/tensorflow/go/op/scope_test.go
index 6fb5d32e503c7c9a5a48747844da15be81b1de2d..b58a61de98b0f5b04959e1eca35c6b6c4d77e42b 100644
--- a/tensorflow/go/op/scope_test.go
+++ b/tensorflow/go/op/scope_test.go
@@ -69,6 +69,49 @@ func TestScopeSubScopeErrors(t *testing.T) {
 	}
 }
 
+func TestControlDependencies(t *testing.T) {
+	var (
+		s        = NewScope()
+		zero     = Const(s.SubScope("zero"), int32(0))
+		one      = Const(s.SubScope("one"), int32(1))
+		variable = VarHandleOp(s, tf.Int32, tf.ScalarShape())
+		init     = AssignVariableOp(s, variable, zero)
+		update   = AssignAddVariableOp(s, variable, one)
+		readDeps = []*tf.Operation{update}
+	)
+	// We intend for `read` to have a control dependency on `update`.
+	s = s.WithControlDependencies(readDeps...)
+	// Ensure that Scope.WithControlDependencies makes a copy of the underlying
+	// array, rather than just holding a slice reference to the same user-supplied
+	// underlying array.  If the copy is correctly performed, overwriting
+	// readDeps[0] should have no effect on control dependencies for `read`.
+	readDeps[0] = init
+	read := ReadVariableOp(s, variable, tf.Int32)
+
+	graph, err := s.Finalize()
+	if err != nil {
+		t.Fatal(err)
+	}
+	sess, err := tf.NewSession(graph, nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if _, err = sess.Run(nil, nil, []*tf.Operation{init}); err != nil {
+		t.Fatal(err)
+	}
+	// Without the control dependency, the read operation may not see the
+	// update.
+	for i := int32(0); i < 10; i++ {
+		out, err := sess.Run(nil, []tf.Output{read}, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+		if got, want := out[0].Value().(int32), i+1; got != want {
+			t.Errorf("Got %d, want %d", got, want)
+		}
+	}
+}
+
 func TestScopeFinalize(t *testing.T) {
 	var (
 		root = NewScope()
diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go
index 5b19c90238ef3bb1361a5e2476e94dd06e76d128..cb47651d7b3199cc804b2a3e89aaf2cead7b75c1 100644
--- a/tensorflow/go/op/wrappers.go
+++ b/tensorflow/go/op/wrappers.go
@@ -8729,31 +8729,6 @@ func IRFFT2D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Out
 	return op.Output(0)
 }
 
-// Compute the pairwise cross product.
-//
-// `a` and `b` must be the same shape; they can either be simple 3-element vectors,
-// or any shape where the innermost dimension is 3. In the latter case, each pair
-// of corresponding 3-element vectors is cross-multiplied independently.
-//
-// Arguments:
-//	a: A tensor containing 3-element vectors.
-//	b: Another tensor, of same type and shape as `a`.
-//
-// Returns Pairwise cross product of the vectors in `a` and `b`.
-func Cross(scope *Scope, a tf.Output, b tf.Output) (product tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Cross",
-		Input: []tf.Input{
-			a, b,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Transforms a vector of brain.Example protos (as strings) into typed tensors.
 //
 // Arguments:
@@ -21290,6 +21265,31 @@ func StatsAggregatorSummary(scope *Scope, iterator tf.Output) (summary tf.Output
 	return op.Output(0)
 }
 
+// Compute the pairwise cross product.
+//
+// `a` and `b` must be the same shape; they can either be simple 3-element vectors,
+// or any shape where the innermost dimension is 3. In the latter case, each pair
+// of corresponding 3-element vectors is cross-multiplied independently.
+//
+// Arguments:
+//	a: A tensor containing 3-element vectors.
+//	b: Another tensor, of same type and shape as `a`.
+//
+// Returns Pairwise cross product of the vectors in `a` and `b`.
+func Cross(scope *Scope, a tf.Output, b tf.Output) (product tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Cross",
+		Input: []tf.Input{
+			a, b,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Performs a padding as a preprocess during a convolution.
 //
 // Similar to FusedResizeAndPadConv2d, this op allows for an optimized
diff --git a/tensorflow/java/maven/libtensorflow/pom.xml b/tensorflow/java/maven/libtensorflow/pom.xml
index 6285ee0483d9171d6cdb9b4dbf2675bafb953038..a9ce5372aeb32b6957359fdcaa9da01c732c9f9f 100644
--- a/tensorflow/java/maven/libtensorflow/pom.xml
+++ b/tensorflow/java/maven/libtensorflow/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.5.0-rc1</version>
+    <version>1.5.0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>libtensorflow</artifactId>
diff --git a/tensorflow/java/maven/libtensorflow_jni/pom.xml b/tensorflow/java/maven/libtensorflow_jni/pom.xml
index b0e5c44fecc9bf3a95ac3d4e36d9f98d74d3b2bb..fe34ca83ff30373fa4f3c4f345323bad40a8754e 100644
--- a/tensorflow/java/maven/libtensorflow_jni/pom.xml
+++ b/tensorflow/java/maven/libtensorflow_jni/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.5.0-rc1</version>
+    <version>1.5.0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>libtensorflow_jni</artifactId>
diff --git a/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml b/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml
index 02c5dca13f4d292718afca7e99bac82710e1949f..390152808eb0c4abebe093ed5db39faf37fcafe3 100644
--- a/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml
+++ b/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.5.0-rc1</version>
+    <version>1.5.0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>libtensorflow_jni_gpu</artifactId>
diff --git a/tensorflow/java/maven/pom.xml b/tensorflow/java/maven/pom.xml
index 949597ca7f1e7a05cf6c0e5a15cb5307b00859a1..524ec45f48bb91d09dfb5fca3cc19256d45587fb 100644
--- a/tensorflow/java/maven/pom.xml
+++ b/tensorflow/java/maven/pom.xml
@@ -6,7 +6,7 @@
   <modelVersion>4.0.0</modelVersion>
   <groupId>org.tensorflow</groupId>
   <artifactId>parentpom</artifactId>
-  <version>1.5.0-rc1</version>
+  <version>1.5.0</version>
   <packaging>pom</packaging>
 
   <url>https://www.tensorflow.org</url>
diff --git a/tensorflow/java/maven/proto/pom.xml b/tensorflow/java/maven/proto/pom.xml
index 9f0ebcf84c9c8e01662a93034a4407c6b58a6d7e..9cf3217f51f73184a02a58ead1a2735c5a44fd26 100644
--- a/tensorflow/java/maven/proto/pom.xml
+++ b/tensorflow/java/maven/proto/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.5.0-rc1</version>
+    <version>1.5.0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>proto</artifactId>
diff --git a/tensorflow/java/maven/tensorflow/pom.xml b/tensorflow/java/maven/tensorflow/pom.xml
index 88d897362ad6c8f84d93cbc9bcf3c30905b345be..d619f986a9a03ac67f5de6bbe80e686a05ce5d42 100644
--- a/tensorflow/java/maven/tensorflow/pom.xml
+++ b/tensorflow/java/maven/tensorflow/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.5.0-rc1</version>
+    <version>1.5.0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>tensorflow</artifactId>
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index 01b3e92d2d9edc12afc6c98da44a4442796592e9..ca79383d43931760414350634967b5ab00ffbf2f 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -76,6 +76,7 @@ py_library(
         ":layers",
         ":lib",
         ":list_ops",
+        ":manip_ops",
         ":math_ops",
         ":metrics",
         ":nn",
@@ -298,6 +299,7 @@ cc_library(
         ":safe_ptr",
         "//tensorflow/c:tf_status_helper",
         "//tensorflow/c/eager:c_api",
+        "//tensorflow/c/eager:c_api_internal",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
@@ -1394,6 +1396,14 @@ tf_gen_op_wrapper_private_py(
     ],
 )
 
+tf_gen_op_wrapper_private_py(
+    name = "manip_ops_gen",
+    visibility = [
+        "//learning/brain/python/ops:__pkg__",
+        "//tensorflow/python/kernel_tests:__pkg__",
+    ],
+)
+
 tf_gen_op_wrapper_private_py(
     name = "math_ops_gen",
     visibility = [
@@ -1726,6 +1736,8 @@ py_library(
         ":linalg_grad",
         ":linalg_ops",
         ":logging_ops",
+        ":manip_grad",
+        ":manip_ops",
         ":math_grad",
         ":math_ops",
         ":platform",
@@ -1848,6 +1860,29 @@ py_library(
     ],
 )
 
+py_library(
+    name = "manip_grad",
+    srcs = ["ops/manip_grad.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":control_flow_ops",
+        ":framework_for_generated_wrappers",
+        ":manip_ops",
+    ],
+)
+
+py_library(
+    name = "manip_ops",
+    srcs = ["ops/manip_ops.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":dtypes",
+        ":framework_ops",
+        ":manip_ops_gen",
+        "//third_party/py/numpy",
+    ],
+)
+
 py_library(
     name = "logging_ops",
     srcs = ["ops/logging_ops.py"],
@@ -2310,6 +2345,8 @@ py_library(
         ":linalg_ops",
         ":logging_ops",
         ":lookup_ops",
+        ":manip_grad",
+        ":manip_ops",
         ":math_grad",
         ":math_ops",
         ":numerics",
@@ -2668,6 +2705,7 @@ cuda_py_test(
         ":nn_ops_gen",
         "//third_party/py/numpy",
     ],
+    shard_count = 4,
     tags = ["no_windows"],
 )
 
diff --git a/tensorflow/python/__init__.py b/tensorflow/python/__init__.py
index bc9ddec2a54a784027120828e9b15a2bf500414e..ea7604d30f127e51d573a5251420faa22cbb3468 100644
--- a/tensorflow/python/__init__.py
+++ b/tensorflow/python/__init__.py
@@ -84,6 +84,7 @@ from tensorflow.python.feature_column import feature_column_lib as feature_colum
 from tensorflow.python.layers import layers
 from tensorflow.python.ops import bitwise_ops as bitwise
 from tensorflow.python.ops import image_ops as image
+from tensorflow.python.ops import manip_ops as manip
 from tensorflow.python.ops import metrics
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import sets
@@ -241,6 +242,7 @@ _allowed_symbols.extend([
     'linalg',
     'logging',
     'losses',
+    'manip',
     'metrics',
     'newaxis',
     'nn',
diff --git a/tensorflow/python/client/device_lib_test.py b/tensorflow/python/client/device_lib_test.py
index 7bba10efacfbc7fbde402c665b3d55d852e36eae..aaf41626ab0078489026036d2b838f33a893a540 100644
--- a/tensorflow/python/client/device_lib_test.py
+++ b/tensorflow/python/client/device_lib_test.py
@@ -34,7 +34,8 @@ class DeviceLibTest(test_util.TensorFlowTestCase):
     # GPU test
     if test.is_gpu_available():
       self.assertGreater(len(devices), 1)
-      self.assertTrue("GPU" in [d.device_type for d in devices] or "SYCL" in [d.device_type for d in devices])
+      self.assertTrue("GPU" in [d.device_type for d in devices] or
+                      "SYCL" in [d.device_type for d in devices])
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/client/notebook.py b/tensorflow/python/client/notebook.py
index 8babe35b3230e7b46c0c9484ccddae4e5e22a335..4b6a0f71ae65aa28b70dd22ce6cffa82e9bc5973 100644
--- a/tensorflow/python/client/notebook.py
+++ b/tensorflow/python/client/notebook.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Notebook front-end to TensorFlow.
 
 When you run this binary, you'll see something like below, which indicates
@@ -43,10 +42,8 @@ from tensorflow.python.platform import app
 os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "cpp"
 os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION_VERSION"] = "2"
 
-
 FLAGS = None
 
-
 ORIG_ARGV = sys.argv
 # Main notebook process calls itself with argv[1]="kernel" to start kernel
 # subprocesses.
@@ -73,8 +70,8 @@ def main(unused_argv):
       notebookapp.ip = "0.0.0.0"
       notebookapp.password = passwd(FLAGS.password)
     else:
-      print ("\nNo password specified; Notebook server will only be available"
-             " on the local machine.\n")
+      print("\nNo password specified; Notebook server will only be available"
+            " on the local machine.\n")
     notebookapp.initialize(argv=["--notebook-dir", FLAGS.notebook_dir])
 
     if notebookapp.ip == "0.0.0.0":
@@ -125,8 +122,8 @@ if __name__ == "__main__":
   # kernel app.
   if IS_KERNEL:
     # Drop everything except --flagfile.
-    sys.argv = ([sys.argv[0]] +
-                [x for x in sys.argv[1:] if x.startswith("--flagfile")])
+    sys.argv = (
+        [sys.argv[0]] + [x for x in sys.argv[1:] if x.startswith("--flagfile")])
 
   FLAGS, unparsed = parser.parse_known_args()
   app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tensorflow/python/client/session.py b/tensorflow/python/client/session.py
index 1481a4d035cbc63aa655be6c4d441e6f6741e118..6befeb846d84ec76cc8fa7d30941ec5ab1224cc8 100644
--- a/tensorflow/python/client/session.py
+++ b/tensorflow/python/client/session.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """A client interface for TensorFlow."""
 
 from __future__ import absolute_import
@@ -36,6 +35,7 @@ from tensorflow.python.ops import session_ops
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import compat
 from tensorflow.python.util import nest
+from tensorflow.python.util.tf_export import tf_export
 
 
 class SessionInterface(object):
@@ -71,8 +71,9 @@ def _get_indexed_slices_value_from_fetches(fetched_vals):
 
 
 def _get_feeds_for_indexed_slices(feed, feed_val):
-  return list(zip([feed.values, feed.indices] if feed.dense_shape is None else
-                  [feed.values, feed.indices, feed.dense_shape], feed_val))
+  return list(
+      zip([feed.values, feed.indices] if feed.dense_shape is None else
+          [feed.values, feed.indices, feed.dense_shape], feed_val))
 
 
 # List of extensions supported to convert run arguments into actual fetches and
@@ -124,6 +125,7 @@ _REGISTERED_EXPANSIONS = [
      lambda fetch: ([fetch], lambda fetched_vals: fetched_vals[0]),
      lambda feed, feed_val: [(feed, feed_val)],
      lambda feed: [feed])]
+
 # pylint: enable=g-long-lambda
 
 
@@ -132,8 +134,11 @@ def _convert_to_numpy_obj(numpy_dtype, obj):
   return numpy_dtype(obj) if numpy_dtype is not object else str(obj)
 
 
-def register_session_run_conversion_functions(tensor_type, fetch_function,
-    feed_function=None, feed_function_for_partial_run=None):
+def register_session_run_conversion_functions(
+    tensor_type,
+    fetch_function,
+    feed_function=None,
+    feed_function_for_partial_run=None):
   """Register fetch and feed conversion functions for `tf.Session.run()`.
 
   This function registers a triple of conversion functions for fetching and/or
@@ -174,11 +179,11 @@ def register_session_run_conversion_functions(tensor_type, fetch_function,
   """
   for conversion_function in _REGISTERED_EXPANSIONS:
     if issubclass(conversion_function[0], tensor_type):
-      raise ValueError(
-          '%s has already been registered so ignore it.', tensor_type)
+      raise ValueError('%s has already been registered so ignore it.',
+                       tensor_type)
       return
-  _REGISTERED_EXPANSIONS.insert(0,
-    (tensor_type, fetch_function, feed_function, feed_function_for_partial_run))
+  _REGISTERED_EXPANSIONS.insert(0, (tensor_type, fetch_function, feed_function,
+                                    feed_function_for_partial_run))
 
 
 class _FetchMapper(object):
@@ -233,8 +238,8 @@ class _FetchMapper(object):
       An instance of a subclass of `_FetchMapper` that handles the shape.
     """
     if fetch is None:
-      raise TypeError('Fetch argument %r has invalid type %r' %
-                      (fetch, type(fetch)))
+      raise TypeError('Fetch argument %r has invalid type %r' % (fetch,
+                                                                 type(fetch)))
     elif isinstance(fetch, (list, tuple)):
       # NOTE(touts): This is also the code path for namedtuples.
       return _ListFetchMapper(fetch)
@@ -247,8 +252,8 @@ class _FetchMapper(object):
           fetches, contraction_fn = fetch_fn(fetch)
           return _ElementFetchMapper(fetches, contraction_fn)
     # Did not find anything.
-    raise TypeError('Fetch argument %r has invalid type %r' %
-                    (fetch, type(fetch)))
+    raise TypeError('Fetch argument %r has invalid type %r' % (fetch,
+                                                               type(fetch)))
 
 
 class _ElementFetchMapper(_FetchMapper):
@@ -277,8 +282,8 @@ class _ElementFetchMapper(_FetchMapper):
             fetch, allow_tensor=True, allow_operation=True))
       except TypeError as e:
         raise TypeError('Fetch argument %r has invalid type %r, '
-                        'must be a string or Tensor. (%s)'
-                        % (fetch, type(fetch), str(e)))
+                        'must be a string or Tensor. (%s)' %
+                        (fetch, type(fetch), str(e)))
       except ValueError as e:
         raise ValueError('Fetch argument %r cannot be interpreted as a '
                          'Tensor. (%s)' % (fetch, str(e)))
@@ -376,8 +381,9 @@ class _DictFetchMapper(_FetchMapper):
     """
     self._fetch_type = type(fetches)
     self._keys = fetches.keys()
-    self._mappers = [_FetchMapper.for_fetch(fetch)
-                     for fetch in fetches.values()]
+    self._mappers = [
+        _FetchMapper.for_fetch(fetch) for fetch in fetches.values()
+    ]
     self._unique_fetches, self._value_indices = _uniquify_fetches(self._mappers)
 
   def unique_fetches(self):
@@ -401,6 +407,7 @@ class _FetchHandler(object):
   result structure matching the user-provided structure for fetches, but
   containing the corresponding results.
   """
+
   # TODO(touts): Make this class also take care of destructuring the feed
   # dict instead of doing it in the callers.
 
@@ -551,8 +558,11 @@ class _DeviceAttributes(object):
     return self._memory_limit_bytes
 
   def __repr__(self):
-    return '_DeviceAttributes(%s, %s, %d)' % (self.name, self.device_type,
-                                              self.memory_limit_bytes,)
+    return '_DeviceAttributes(%s, %s, %d)' % (
+        self.name,
+        self.device_type,
+        self.memory_limit_bytes,
+    )
 
 
 class BaseSession(SessionInterface):
@@ -601,8 +611,8 @@ class BaseSession(SessionInterface):
 
     if config is not None:
       if not isinstance(config, config_pb2.ConfigProto):
-        raise TypeError('config must be a tf.ConfigProto, but got %s'
-                        % type(config))
+        raise TypeError(
+            'config must be a tf.ConfigProto, but got %s' % type(config))
       self._config = config
       self._add_shapes = config.graph_options.infer_shapes
     else:
@@ -976,8 +986,8 @@ class BaseSession(SessionInterface):
       for tensor_type, _, _, feed_fn in _REGISTERED_EXPANSIONS:
         if isinstance(feed, tensor_type):
           return feed_fn(feed)
-      raise TypeError('Feed argument %r has invalid type %r'
-                      % (feed, type(feed)))
+      raise TypeError('Feed argument %r has invalid type %r' % (feed,
+                                                                type(feed)))
 
     # Check session.
     if self._closed:
@@ -998,8 +1008,8 @@ class BaseSession(SessionInterface):
     for feed in feeds:
       for subfeed in _feed_fn(feed):
         try:
-          subfeed_t = self.graph.as_graph_element(subfeed, allow_tensor=True,
-                                                  allow_operation=False)
+          subfeed_t = self.graph.as_graph_element(
+              subfeed, allow_tensor=True, allow_operation=False)
           if self._created_with_new_api:
             # pylint: disable=protected-access
             feed_list.append(subfeed_t._as_tf_output())
@@ -1007,8 +1017,7 @@ class BaseSession(SessionInterface):
           else:
             feed_list.append(compat.as_bytes(subfeed_t.name))
         except Exception as e:
-          e.message = ('Cannot interpret feed_list key as Tensor: '
-                       + e.message)
+          e.message = ('Cannot interpret feed_list key as Tensor: ' + e.message)
           e.args = (e.message,)
           raise e
 
@@ -1041,12 +1050,13 @@ class BaseSession(SessionInterface):
 
   def _run(self, handle, fetches, feed_dict, options, run_metadata):
     """Perform either run or partial_run, depending the presence of `handle`."""
+
     def _feed_fn(feed, feed_val):
       for tensor_type, _, feed_fn, _ in _REGISTERED_EXPANSIONS:
         if isinstance(feed, tensor_type):
           return feed_fn(feed, feed_val)
-      raise TypeError('Feed argument %r has invalid type %r'
-                      % (feed, type(feed)))
+      raise TypeError('Feed argument %r has invalid type %r' % (feed,
+                                                                type(feed)))
 
     # Check session.
     if self._closed:
@@ -1066,11 +1076,11 @@ class BaseSession(SessionInterface):
       for feed, feed_val in feed_dict.items():
         for subfeed, subfeed_val in _feed_fn(feed, feed_val):
           try:
-            subfeed_t = self.graph.as_graph_element(subfeed, allow_tensor=True,
-                                                    allow_operation=False)
+            subfeed_t = self.graph.as_graph_element(
+                subfeed, allow_tensor=True, allow_operation=False)
           except Exception as e:
-            raise TypeError('Cannot interpret feed_dict key as Tensor: '
-                            + e.args[0])
+            raise TypeError(
+                'Cannot interpret feed_dict key as Tensor: ' + e.args[0])
 
           if isinstance(subfeed_val, ops.Tensor):
             raise TypeError('The value of a feed cannot be a tf.Tensor object. '
@@ -1081,10 +1091,9 @@ class BaseSession(SessionInterface):
           if isinstance(subfeed_val, int) and _convert_to_numpy_obj(
               subfeed_dtype, subfeed_val) != subfeed_val:
             raise TypeError(
-                'Type of feed value ' + str(subfeed_val) + ' with type ' +
-                str(type(subfeed_val)) +
-                ' is not compatible with Tensor type ' +
-                str(subfeed_dtype) +
+                'Type of feed value ' + str(subfeed_val) + ' with type ' + str(
+                    type(subfeed_val)) +
+                ' is not compatible with Tensor type ' + str(subfeed_dtype) +
                 '. Try explicitly setting the type of the feed tensor'
                 ' to a larger type (e.g. int64).')
 
@@ -1098,10 +1107,10 @@ class BaseSession(SessionInterface):
 
           if (not is_tensor_handle_feed and
               not subfeed_t.get_shape().is_compatible_with(np_val.shape)):
-            raise ValueError(
-                'Cannot feed value of shape %r for Tensor %r, '
-                'which has shape %r'
-                % (np_val.shape, subfeed_t.name, str(subfeed_t.get_shape())))
+            raise ValueError('Cannot feed value of shape %r for Tensor %r, '
+                             'which has shape %r' %
+                             (np_val.shape, subfeed_t.name,
+                              str(subfeed_t.get_shape())))
           if not self.graph.is_feedable(subfeed_t):
             raise ValueError('Tensor %s may not be fed.' % subfeed_t)
 
@@ -1130,10 +1139,7 @@ class BaseSession(SessionInterface):
       results = []
     return fetch_handler.build_results(self, results)
 
-  def make_callable(self,
-                    fetches,
-                    feed_list=None,
-                    accept_options=False):
+  def make_callable(self, fetches, feed_list=None, accept_options=False):
     """Returns a Python callable that runs a particular step.
 
     The returned callable will take `len(feed_list)` arguments whose types
@@ -1176,9 +1182,12 @@ class BaseSession(SessionInterface):
       # `Session._run()` so that we can convert the feeds to a list of
       # strings here.
       def _generic_run(*feed_args, **kwargs):
-        feed_dict = {feed: feed_val
-                     for feed, feed_val in zip(feed_list, feed_args)}
+        feed_dict = {
+            feed: feed_val
+            for feed, feed_val in zip(feed_list, feed_args)
+        }
         return self.run(fetches, feed_dict=feed_dict, **kwargs)
+
       return _generic_run
 
     # Ensure any changes to the graph are reflected in the runtime.
@@ -1198,12 +1207,11 @@ class BaseSession(SessionInterface):
       fetch_list = _name_list(fetch_handler.fetches())
       target_list = _name_list(fetch_handler.targets())
 
-    def _callable_template_with_options_and_metadata(
-        fetch_list,
-        target_list,
-        fetch_handler,
-        options=None,
-        run_metadata=None):
+    def _callable_template_with_options_and_metadata(fetch_list,
+                                                     target_list,
+                                                     fetch_handler,
+                                                     options=None,
+                                                     run_metadata=None):
       """Template callable that accepts RunOptions and RunMetadata."""
       options_ptr = tf_session.TF_NewBufferFromString(
           compat.as_bytes(options.SerializeToString())) if options else None
@@ -1215,9 +1223,9 @@ class BaseSession(SessionInterface):
                 self._session, options_ptr, {}, fetch_list, target_list,
                 run_metadata_ptr, status)
           else:
-            results = tf_session.TF_Run(
-                self._session, options_ptr, {}, fetch_list, target_list, status,
-                run_metadata_ptr)
+            results = tf_session.TF_Run(self._session, options_ptr, {},
+                                        fetch_list, target_list, status,
+                                        run_metadata_ptr)
           if fetch_handler:
             results = fetch_handler.build_results(self, results)
           else:
@@ -1233,37 +1241,40 @@ class BaseSession(SessionInterface):
       return results
 
     if accept_options:
-      return functools.partial(
-          _callable_template_with_options_and_metadata, fetch_list,
-          target_list, fetch_handler)
+      return functools.partial(_callable_template_with_options_and_metadata,
+                               fetch_list, target_list, fetch_handler)
     elif isinstance(fetches, ops.Operation):
       # Special case for fetching a single operation, because the
       # function will have no return value.
       assert not fetch_list
       assert len(target_list) == 1
+
       def _single_operation_run():
         with errors.raise_exception_on_not_ok_status() as status:
           if self._created_with_new_api:
-            tf_session.TF_SessionRun_wrapper(
-                self._session, None, {}, [], target_list, None, status)
+            tf_session.TF_SessionRun_wrapper(self._session, None, {}, [],
+                                             target_list, None, status)
           else:
-            tf_session.TF_Run(
-                self._session, None, {}, [], target_list, status, None)
+            tf_session.TF_Run(self._session, None, {}, [], target_list, status,
+                              None)
+
       return _single_operation_run
     elif isinstance(fetches, ops.Tensor):
       # Special case for fetching a single tensor, because the
       # function can return the result of `TF_Run()` directly.
       assert len(fetch_list) == 1
       assert not target_list
+
       def _single_tensor_run():
         with errors.raise_exception_on_not_ok_status() as status:
           if self._created_with_new_api:
             results = tf_session.TF_SessionRun_wrapper(
                 self._session, None, {}, fetch_list, [], None, status)
           else:
-            results = tf_session.TF_Run(
-                self._session, None, {}, fetch_list, [], status, None)
+            results = tf_session.TF_Run(self._session, None, {}, fetch_list, [],
+                                        status, None)
         return results[0]
+
       return _single_tensor_run
     else:
       # In all other cases, we must use `fetch_handler` to build the
@@ -1274,16 +1285,17 @@ class BaseSession(SessionInterface):
             results = tf_session.TF_SessionRun_wrapper(
                 self._session, None, {}, fetch_list, target_list, None, status)
           else:
-            results = tf_session.TF_Run(
-                self._session, None, {}, fetch_list, target_list, status, None)
+            results = tf_session.TF_Run(self._session, None, {}, fetch_list,
+                                        target_list, status, None)
         return fetch_handler.build_results(self, results)
+
       return _fetch_handler_run
 
   # Captures the name of a node in an error status.
   _NODEDEF_NAME_RE = re.compile(r'\[\[Node: ([^ ]*?) =')
 
-  def _do_run(self, handle, target_list, fetch_list, feed_dict,
-              options, run_metadata):
+  def _do_run(self, handle, target_list, fetch_list, feed_dict, options,
+              run_metadata):
     """Runs a step based on the given fetches and feeds.
 
     Args:
@@ -1320,13 +1332,12 @@ class BaseSession(SessionInterface):
       self._extend_graph()
       with errors.raise_exception_on_not_ok_status() as status:
         if self._created_with_new_api:
-          return tf_session.TF_SessionRun_wrapper(
-              session, options, feed_dict, fetch_list, target_list,
-              run_metadata, status)
+          return tf_session.TF_SessionRun_wrapper(session, options, feed_dict,
+                                                  fetch_list, target_list,
+                                                  run_metadata, status)
         else:
-          return tf_session.TF_Run(session, options,
-                                   feed_dict, fetch_list, target_list,
-                                   status, run_metadata)
+          return tf_session.TF_Run(session, options, feed_dict, fetch_list,
+                                   target_list, status, run_metadata)
 
     def _prun_fn(session, handle, feed_dict, fetch_list):
       if target_list:
@@ -1365,20 +1376,20 @@ class BaseSession(SessionInterface):
   def _extend_graph(self):
     # Nothing to do if we're using the new session interface
     # TODO(skyewm): remove this function altogether eventually
-    if self._created_with_new_api: return
+    if self._created_with_new_api:
+      return
 
     # Ensure any changes to the graph are reflected in the runtime.
     with self._extend_lock:
       if self._graph.version > self._current_version:
         # pylint: disable=protected-access
         graph_def, self._current_version = self._graph._as_graph_def(
-            from_version=self._current_version,
-            add_shapes=self._add_shapes)
+            from_version=self._current_version, add_shapes=self._add_shapes)
         # pylint: enable=protected-access
 
         with errors.raise_exception_on_not_ok_status() as status:
-          tf_session.TF_ExtendGraph(
-              self._session, graph_def.SerializeToString(), status)
+          tf_session.TF_ExtendGraph(self._session,
+                                    graph_def.SerializeToString(), status)
         self._opened = True
 
   # The threshold to run garbage collection to delete dead tensors.
@@ -1398,9 +1409,8 @@ class BaseSession(SessionInterface):
       feeds = {}
       fetches = []
       for deleter_key, tensor_handle in enumerate(tensors_to_delete):
-        holder, deleter = session_ops._get_handle_deleter(self.graph,
-                                                          deleter_key,
-                                                          tensor_handle)
+        holder, deleter = session_ops._get_handle_deleter(
+            self.graph, deleter_key, tensor_handle)
         feeds[holder] = tensor_handle
         fetches.append(deleter)
       self.run(fetches, feed_dict=feeds)
@@ -1432,6 +1442,7 @@ class BaseSession(SessionInterface):
       return handles
 
 
+@tf_export('Session')
 class Session(BaseSession):
   """A class for running TensorFlow operations.
 
@@ -1471,7 +1482,8 @@ class Session(BaseSession):
     sess.run(...)
   ```
 
-  The [`ConfigProto`](https://www.tensorflow.org/code/tensorflow/core/protobuf/config.proto)
+  The
+  [`ConfigProto`](https://www.tensorflow.org/code/tensorflow/core/protobuf/config.proto)
   protocol buffer exposes various configuration options for a
   session. For example, to create a session that uses soft constraints
   for device placement, and log the resulting placement decisions,
@@ -1502,7 +1514,8 @@ class Session(BaseSession):
         @{$distributed$Distributed TensorFlow}
         for more examples.
       graph: (Optional.) The `Graph` to be launched (described above).
-      config: (Optional.) A [`ConfigProto`](https://www.tensorflow.org/code/tensorflow/core/protobuf/config.proto)
+      config: (Optional.) A
+        [`ConfigProto`](https://www.tensorflow.org/code/tensorflow/core/protobuf/config.proto)
         protocol buffer with configuration options for the session.
 
     """
@@ -1526,8 +1539,8 @@ class Session(BaseSession):
   def __exit__(self, exec_type, exec_value, exec_tb):
     if exec_type is errors.OpError:
       logging.error('Session closing due to OpError: %s', (exec_value,))
-    self._default_session_context_manager.__exit__(
-        exec_type, exec_value, exec_tb)
+    self._default_session_context_manager.__exit__(exec_type, exec_value,
+                                                   exec_tb)
     self._default_graph_context_manager.__exit__(exec_type, exec_value, exec_tb)
 
     self._default_session_context_manager = None
@@ -1570,6 +1583,7 @@ class Session(BaseSession):
     tf_session.TF_Reset(target, containers, config)
 
 
+@tf_export('InteractiveSession')
 class InteractiveSession(BaseSession):
   """A TensorFlow `Session` for use in interactive contexts, such as a shell.
 
diff --git a/tensorflow/python/client/session_benchmark.py b/tensorflow/python/client/session_benchmark.py
index 721bca91b71aa00479c27fad102d5888d58d35b1..06e9a099267938e53e377e65ed975f7f4b8b966b 100644
--- a/tensorflow/python/client/session_benchmark.py
+++ b/tensorflow/python/client/session_benchmark.py
@@ -22,6 +22,7 @@ import time
 
 import numpy as np
 
+from six.moves import xrange
 from tensorflow.python.client import session
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
diff --git a/tensorflow/python/client/session_test.py b/tensorflow/python/client/session_test.py
index c579fba33951c4624e02de1e20a9aa5bad11cd73..768a5db88aa647609dba1c479a5aca68cd26652a 100644
--- a/tensorflow/python/client/session_test.py
+++ b/tensorflow/python/client/session_test.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Tests for tensorflow.python.client.session.Session."""
 from __future__ import absolute_import
 from __future__ import division
@@ -57,7 +56,6 @@ from tensorflow.python.platform import googletest
 from tensorflow.python.training import server_lib
 from tensorflow.python.util import compat
 
-
 # NOTE(mrry): Dummy shape registration for ops used in the tests, since they
 # don't have C++ op registrations on which to attach C++ shape fns.
 ops.RegisterShape('ConstructionFails')(common_shapes.unknown_shape)
@@ -95,14 +93,18 @@ class SessionTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(arr, copy_val)
       # Test without feed.
       copy_val = copy.eval()
-      self.assertAllEqual(np.asarray([[10.0, 10.0, 10.0], [10.0, 10.0, 10.0]],
-                                     dtype=np.float32), copy_val)
+      self.assertAllEqual(
+          np.asarray(
+              [[10.0, 10.0, 10.0], [10.0, 10.0, 10.0]], dtype=np.float32),
+          copy_val)
 
   def testManyCPUs(self):
     # TODO(keveman): Implement ListDevices and test for the number of
     # devices returned by ListDevices.
     with session.Session(
-        config=config_pb2.ConfigProto(device_count={'CPU': 2})):
+        config=config_pb2.ConfigProto(device_count={
+            'CPU': 2
+        })):
       inp = constant_op.constant(10.0, name='W1')
       self.assertAllEqual(inp.eval(), 10.0)
 
@@ -161,20 +163,23 @@ class SessionTest(test_util.TensorFlowTestCase):
       def exc_predicate(e):
         return (e.op is None and e.node_def is None and
                 e.error_code == error_codes_pb2.INVALID_ARGUMENT)
+
       with self.assertRaisesOpError(exc_predicate):
         # Run with a bogus handle.
         s.partial_run('foo', r1, feed_dict={a: 1, b: 2})
 
   def testOpConstructionErrorPayload(self):
-    if ops._USE_C_API: return  # No shape registration for 'ConstructionFails'
+    if ops._USE_C_API:
+      return  # No shape registration for 'ConstructionFails'
 
     with session.Session():
       failing_op = ops.get_default_graph().create_op(
           'ConstructionFails', [], [], name='f')
 
       def exc_predicate(e):
-        return (e.op == failing_op
-                and e.error_code == error_codes_pb2.INVALID_ARGUMENT)
+        return (e.op == failing_op and
+                e.error_code == error_codes_pb2.INVALID_ARGUMENT)
+
       with self.assertRaisesOpError(exc_predicate):
         failing_op.run()
 
@@ -191,9 +196,9 @@ class SessionTest(test_util.TensorFlowTestCase):
       # pylint: enable=protected-access
 
       def exc_predicate(e):
-        return (e.op == c.op
-                and e.op._original_op == b.op
-                and e.op._original_op._original_op == a.op)
+        return (e.op == c.op and e.op._original_op == b.op and
+                e.op._original_op._original_op == a.op)
+
       with self.assertRaisesOpError(exc_predicate):
         c.eval()
 
@@ -341,8 +346,12 @@ class SessionTest(test_util.TensorFlowTestCase):
       b = control_flow_ops.no_op()  # An op, not a tensor.
       c = constant_op.constant(c_val)
       # List of lists, tuples, namedtuple, and dict
-      res = sess.run([[a, b, c], (a, b, c), ABC(a=a, b=b, c=c),
-                      {'a': a.name, 'c': c, 'b': b}])
+      res = sess.run([[a, b, c], (a, b, c),
+                      ABC(a=a, b=b, c=c), {
+                          'a': a.name,
+                          'c': c,
+                          'b': b
+                      }])
       self.assertTrue(isinstance(res, list))
       self.assertEqual(4, len(res))
       self.assertTrue(isinstance(res[0], list))
@@ -365,8 +374,11 @@ class SessionTest(test_util.TensorFlowTestCase):
       self.assertEqual(b_val, res[3]['b'])
       self.assertEqual(c_val, res[3]['c'])
       # Tuple of lists, tuples, namedtuple, and dict
-      res = sess.run(([a, b, c], (a.name, b, c), ABC(a=a, b=b, c=c),
-                      {'a': a, 'c': c, 'b': b}))
+      res = sess.run(([a, b, c], (a.name, b, c), ABC(a=a, b=b, c=c), {
+          'a': a,
+          'c': c,
+          'b': b
+      }))
       self.assertTrue(isinstance(res, tuple))
       self.assertEqual(4, len(res))
       self.assertTrue(isinstance(res[0], list))
@@ -389,10 +401,16 @@ class SessionTest(test_util.TensorFlowTestCase):
       self.assertEqual(b_val, res[3]['b'])
       self.assertEqual(c_val, res[3]['c'])
       # Namedtuple of lists, tuples, namedtuples, and dict
-      res = sess.run(DEFG(d=[a, b, c],
-                          e=(a, b, c),
-                          f=ABC(a=a.name, b=b, c=c),
-                          g={'a': a, 'c': c, 'b': b}))
+      res = sess.run(
+          DEFG(
+              d=[a, b, c],
+              e=(a, b, c),
+              f=ABC(a=a.name, b=b, c=c),
+              g={
+                  'a': a,
+                  'c': c,
+                  'b': b
+              }))
       self.assertTrue(isinstance(res, DEFG))
       self.assertTrue(isinstance(res.d, list))
       self.assertEqual(3, len(res.d))
@@ -414,10 +432,16 @@ class SessionTest(test_util.TensorFlowTestCase):
       self.assertEqual(b_val, res.g['b'])
       self.assertEqual(c_val, res.g['c'])
       # Dict of lists, tuples, namedtuples, and dict
-      res = sess.run({'d': [a, b, c],
-                      'e': (a, b, c),
-                      'f': ABC(a=a, b=b, c=c),
-                      'g': {'a': a.name, 'c': c, 'b': b}})
+      res = sess.run({
+          'd': [a, b, c],
+          'e': (a, b, c),
+          'f': ABC(a=a, b=b, c=c),
+          'g': {
+              'a': a.name,
+              'c': c,
+              'b': b
+          }
+      })
       self.assertTrue(isinstance(res, dict))
       self.assertEqual(4, len(res))
       self.assertTrue(isinstance(res['d'], list))
@@ -516,8 +540,7 @@ class SessionTest(test_util.TensorFlowTestCase):
       values = np.array([1.0, 2.0]).astype(np.float32)
       shape = np.array([7, 9, 2]).astype(np.int64)
       sp = sparse_tensor.SparseTensor(
-          constant_op.constant(indices),
-          constant_op.constant(values),
+          constant_op.constant(indices), constant_op.constant(values),
           constant_op.constant(shape))
       # Single fetch, use as tuple
       sp_out = s.run(sp)
@@ -587,14 +610,17 @@ class SessionTest(test_util.TensorFlowTestCase):
       sp = sparse_tensor.SparseTensor(
           array_ops.placeholder(dtype=np.int64, shape=(2, 3)),
           array_ops.placeholder(dtype=np.float32, shape=(2,)),
-          array_ops.placeholder(dtype=np.int64, shape=(3,)),)
+          array_ops.placeholder(dtype=np.int64, shape=(3,)),
+      )
       sp_indices = array_ops.identity(sp.indices)
       sp_values = array_ops.identity(sp.values)
       sp_shape = array_ops.identity(sp.dense_shape)
       sp2 = sparse_tensor.SparseTensor(sp_indices, sp_values, sp_shape)
       # Feed with tuple
       indices_out, values_out, shape_out = s.run(
-          [sp_indices, sp_values, sp_shape], {sp: (indices, values, shape)})
+          [sp_indices, sp_values, sp_shape], {
+              sp: (indices, values, shape)
+          })
       self.assertAllEqual(indices_out, indices)
       self.assertAllEqual(values_out, values)
       self.assertAllEqual(shape_out, shape)
@@ -605,20 +631,23 @@ class SessionTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(sp_out.dense_shape, shape)
       # Feed with SparseTensorValue
       indices_out, values_out, shape_out = s.run(
-          [sp_indices, sp_values, sp_shape],
-          {sp: sparse_tensor.SparseTensorValue(indices, values, shape)})
+          [sp_indices, sp_values, sp_shape], {
+              sp: sparse_tensor.SparseTensorValue(indices, values, shape)
+          })
       self.assertAllEqual(indices_out, indices)
       self.assertAllEqual(values_out, values)
       self.assertAllEqual(shape_out, shape)
       # Feed with SparseTensorValue, fetch SparseTensorValue
-      sp2_out = s.run(
-          sp2, {sp: sparse_tensor.SparseTensorValue(indices, values, shape)})
+      sp2_out = s.run(sp2, {
+          sp: sparse_tensor.SparseTensorValue(indices, values, shape)
+      })
       self.assertAllEqual(sp2_out.indices, indices)
       self.assertAllEqual(sp2_out.values, values)
       self.assertAllEqual(sp2_out.dense_shape, shape)
       # Feed SparseTensorValue and fetch sp directly.
-      sp_out = s.run(
-          sp, {sp: sparse_tensor.SparseTensorValue(indices, values, shape)})
+      sp_out = s.run(sp, {
+          sp: sparse_tensor.SparseTensorValue(indices, values, shape)
+      })
       self.assertAllEqual(sp_out.indices, indices)
       self.assertAllEqual(sp_out.values, values)
       self.assertAllEqual(sp_out.dense_shape, shape)
@@ -635,20 +664,24 @@ class SessionTest(test_util.TensorFlowTestCase):
       sp2 = sparse_tensor.SparseTensor(sp_indices, sp_values, sp_shape)
       # Feed with tuple
       indices_out, values_out, shape_out = s.run(
-          [sp_indices, sp_values, sp_shape], {sp: (indices, values, shape)})
+          [sp_indices, sp_values, sp_shape], {
+              sp: (indices, values, shape)
+          })
       self.assertAllEqual(indices_out, indices)
       self.assertAllEqual(values_out, values)
       self.assertAllEqual(shape_out, shape)
       # Feed with SparseTensorValue
       indices_out, values_out, shape_out = s.run(
-          [sp_indices, sp_values, sp_shape],
-          {sp: sparse_tensor.SparseTensorValue(indices, values, shape)})
+          [sp_indices, sp_values, sp_shape], {
+              sp: sparse_tensor.SparseTensorValue(indices, values, shape)
+          })
       self.assertAllEqual(indices_out, indices)
       self.assertAllEqual(values_out, values)
       self.assertAllEqual(shape_out, shape)
       # Feed with SparseTensorValue, fetch SparseTensorValue
-      sp2_out = s.run(
-          sp2, {sp: sparse_tensor.SparseTensorValue(indices, values, shape)})
+      sp2_out = s.run(sp2, {
+          sp: sparse_tensor.SparseTensorValue(indices, values, shape)
+      })
       self.assertAllEqual(sp2_out.indices, indices)
       self.assertAllEqual(sp2_out.values, values)
       self.assertAllEqual(sp2_out.dense_shape, shape)
@@ -666,20 +699,24 @@ class SessionTest(test_util.TensorFlowTestCase):
       sp2 = sparse_tensor.SparseTensor(sp_indices, sp_values, sp_shape)
       # Feed with tuple
       indices_out, values_out, shape_out = s.run(
-          [sp_indices, sp_values, sp_shape], {sp: (indices, values, shape)})
+          [sp_indices, sp_values, sp_shape], {
+              sp: (indices, values, shape)
+          })
       self.assertAllEqual(indices_out, indices)
       self.assertAllEqual(values_out, values)
       self.assertAllEqual(shape_out, shape)
       # Feed with SparseTensorValue
       indices_out, values_out, shape_out = s.run(
-          [sp_indices, sp_values, sp_shape],
-          {sp: sparse_tensor.SparseTensorValue(indices, values, shape)})
+          [sp_indices, sp_values, sp_shape], {
+              sp: sparse_tensor.SparseTensorValue(indices, values, shape)
+          })
       self.assertAllEqual(indices_out, indices)
       self.assertAllEqual(values_out, values)
       self.assertAllEqual(shape_out, shape)
       # Feed with SparseTensorValue, fetch SparseTensorValue
-      sp2_out = s.run(
-          sp2, {sp: sparse_tensor.SparseTensorValue(indices, values, shape)})
+      sp2_out = s.run(sp2, {
+          sp: sparse_tensor.SparseTensorValue(indices, values, shape)
+      })
       self.assertAllEqual(sp2_out.indices, indices)
       self.assertAllEqual(sp2_out.values, values)
       self.assertAllEqual(sp2_out.dense_shape, shape)
@@ -689,9 +726,8 @@ class SessionTest(test_util.TensorFlowTestCase):
       indices = np.array([[3, 2, 0], [4, 5, 1]]).astype(np.int64)
       values = np.array([1.0, 2.0]).astype(np.float32)
       shape = np.array([7, 9, 2]).astype(np.int64)
-      sp = array_ops.sparse_placeholder(dtype=np.float32,
-                                        shape=shape,
-                                        name='placeholder1')
+      sp = array_ops.sparse_placeholder(
+          dtype=np.float32, shape=shape, name='placeholder1')
       self.assertAllEqual(sp.dense_shape.eval(session=s), shape)
       self.assertAllEqual(tensor_util.constant_value(sp.dense_shape), shape)
       sp_indices = array_ops.identity(sp.indices)
@@ -699,7 +735,9 @@ class SessionTest(test_util.TensorFlowTestCase):
       sp_shape = array_ops.identity(sp.dense_shape)
       # Feed with tuple
       indices_out, values_out, shape_out = s.run(
-          [sp_indices, sp_values, sp_shape], {sp: (indices, values)})
+          [sp_indices, sp_values, sp_shape], {
+              sp: (indices, values)
+          })
       self.assertAllEqual(indices_out, indices)
       self.assertAllEqual(values_out, values)
       self.assertAllEqual(shape_out, shape)
@@ -745,33 +783,34 @@ class SessionTest(test_util.TensorFlowTestCase):
       indices = np.array([[3, 2, 0], [4, 5, 1]]).astype(np.int64)
       dense_shape = np.array([7, 9, 2]).astype(np.int64)
       ind = ops.IndexedSlices(
-          array_ops.placeholder(dtype=np.float32,
-                                shape=(2,)),
-          array_ops.placeholder(dtype=np.int64,
-                                shape=(2, 3)),
-          array_ops.placeholder(dtype=np.int64,
-                                shape=(3,)),)
+          array_ops.placeholder(dtype=np.float32, shape=(2,)),
+          array_ops.placeholder(dtype=np.int64, shape=(2, 3)),
+          array_ops.placeholder(dtype=np.int64, shape=(3,)),
+      )
       ind_values = array_ops.identity(ind.values)
       ind_indices = array_ops.identity(ind.indices)
       ind_dense_shape = array_ops.identity(ind.dense_shape)
       ind2 = ops.IndexedSlices(ind_values, ind_indices, ind_dense_shape)
       # Feed with tuple
       values_out, indices_out, dense_shape_out = s.run(
-          [ind_values, ind_indices, ind_dense_shape],
-          {ind: (values, indices, dense_shape)})
+          [ind_values, ind_indices, ind_dense_shape], {
+              ind: (values, indices, dense_shape)
+          })
       self.assertAllEqual(values_out, values)
       self.assertAllEqual(indices_out, indices)
       self.assertAllEqual(dense_shape_out, dense_shape)
       # Feed with IndexedSlicesValue
       values_out, indices_out, dense_shape_out = s.run(
-          [ind_values, ind_indices, ind_dense_shape],
-          {ind: ops.IndexedSlicesValue(values, indices, dense_shape)})
+          [ind_values, ind_indices, ind_dense_shape], {
+              ind: ops.IndexedSlicesValue(values, indices, dense_shape)
+          })
       self.assertAllEqual(values_out, values)
       self.assertAllEqual(indices_out, indices)
       self.assertAllEqual(dense_shape_out, dense_shape)
       # Feed with IndexedSlicesValue, fetch IndexedSlicesValue
-      ind2_out = s.run(ind2, {ind: ops.IndexedSlicesValue(values, indices,
-                                                          dense_shape)})
+      ind2_out = s.run(ind2, {
+          ind: ops.IndexedSlicesValue(values, indices, dense_shape)
+      })
       self.assertAllEqual(ind2_out.values, values)
       self.assertAllEqual(ind2_out.indices, indices)
       self.assertAllEqual(ind2_out.dense_shape, dense_shape)
@@ -816,28 +855,27 @@ class SessionTest(test_util.TensorFlowTestCase):
       indices = np.array([[3, 2, 0], [4, 5, 1]]).astype(np.int64)
       dense_shape = None
       ind = ops.IndexedSlices(
-          array_ops.placeholder(dtype=np.float32,
-                                shape=(2,)),
-          array_ops.placeholder(dtype=np.int64,
-                                shape=(2, 3)),
-          None)
+          array_ops.placeholder(dtype=np.float32, shape=(2,)),
+          array_ops.placeholder(dtype=np.int64, shape=(2, 3)), None)
       ind_values = array_ops.identity(ind.values)
       ind_indices = array_ops.identity(ind.indices)
       ind2 = ops.IndexedSlices(ind_values, ind_indices)
       # Feed with tuple
-      values_out, indices_out = s.run(
-          [ind_values, ind_indices], {ind: (values, indices)})
+      values_out, indices_out = s.run([ind_values, ind_indices], {
+          ind: (values, indices)
+      })
       self.assertAllEqual(values_out, values)
       self.assertAllEqual(indices_out, indices)
       # Feed with IndexedSlicesValue
-      values_out, indices_out = s.run(
-          [ind_values, ind_indices],
-          {ind: ops.IndexedSlicesValue(values, indices, dense_shape)})
+      values_out, indices_out = s.run([ind_values, ind_indices], {
+          ind: ops.IndexedSlicesValue(values, indices, dense_shape)
+      })
       self.assertAllEqual(values_out, values)
       self.assertAllEqual(indices_out, indices)
       # Feed with IndexedSlicesValue, fetch IndexedSlicesValue
-      ind2_out = s.run(ind2, {ind: ops.IndexedSlicesValue(values, indices,
-                                                          dense_shape)})
+      ind2_out = s.run(ind2, {
+          ind: ops.IndexedSlicesValue(values, indices, dense_shape)
+      })
       self.assertAllEqual(ind2_out.values, values)
       self.assertAllEqual(ind2_out.indices, indices)
       self.assertAllEqual(ind2_out.dense_shape, dense_shape)
@@ -986,8 +1024,9 @@ class SessionTest(test_util.TensorFlowTestCase):
     constructed_events = [threading.Event() for _ in range(10)]
     continue_event = threading.Event()
     for i, constructed_event in enumerate(constructed_events):
-      t = self.checkedThread(target=self._testDefaultGraphInThread,
-                             args=(constructed_event, continue_event, i))
+      t = self.checkedThread(
+          target=self._testDefaultGraphInThread,
+          args=(constructed_event, continue_event, i))
       threads.append(t)
     for t in threads:
       t.start()
@@ -1006,6 +1045,7 @@ class SessionTest(test_util.TensorFlowTestCase):
         ev.wait()
         val = c.eval(session=sess)
         self.assertEqual(val, 5.0)
+
       threads = [self.checkedThread(target=run_step) for _ in range(100)]
       for t in threads:
         t.start()
@@ -1038,11 +1078,10 @@ class SessionTest(test_util.TensorFlowTestCase):
 
   def testGraphDef(self):
     with session.Session() as sess:
-      self.assertProtoEquals(
-          'versions { producer: %d min_consumer: %d }' % (
-              versions.GRAPH_DEF_VERSION,
-              versions.GRAPH_DEF_VERSION_MIN_CONSUMER),
-          sess.graph_def)
+      self.assertProtoEquals('versions { producer: %d min_consumer: %d }' %
+                             (versions.GRAPH_DEF_VERSION,
+                              versions.GRAPH_DEF_VERSION_MIN_CONSUMER),
+                             sess.graph_def)
       c = constant_op.constant(5.0, name='c')
       self.assertEquals(len(sess.graph_def.node), 1)
       d = constant_op.constant(6.0, name='d')
@@ -1072,6 +1111,7 @@ class SessionTest(test_util.TensorFlowTestCase):
             lambda e: 'Attempted to use a closed Session.' in str(e)):
           while True:
             sess.run(c)
+
       t = threading.Thread(target=update_thread)
       t.start()
       time.sleep(0.1)
@@ -1177,17 +1217,11 @@ class SessionTest(test_util.TensorFlowTestCase):
 
   def testFeedAndFetch(self):
     with session.Session() as sess:
-      for dtype in [dtypes.float16,
-                    dtypes.float32,
-                    dtypes.float64,
-                    dtypes.int32,
-                    dtypes.uint8,
-                    dtypes.int16,
-                    dtypes.int8,
-                    dtypes.int64,
-                    dtypes.bool,
-                    dtypes.complex64,
-                    dtypes.complex128]:
+      for dtype in [
+          dtypes.float16, dtypes.float32, dtypes.float64, dtypes.int32,
+          dtypes.uint8, dtypes.int16, dtypes.int8, dtypes.int64, dtypes.bool,
+          dtypes.complex64, dtypes.complex128
+      ]:
         for shape in [(32, 4, 128), (37,), (2, 0, 6), (0, 0, 0)]:
           np_dtype = dtype.as_numpy_dtype
 
@@ -1206,13 +1240,19 @@ class SessionTest(test_util.TensorFlowTestCase):
             np_array = np_array.astype(np_dtype)
 
           self.assertAllEqual(np_array,
-                              sess.run(out_t, feed_dict={feed_t: np_array}))
+                              sess.run(out_t, feed_dict={
+                                  feed_t: np_array
+                              }))
           # Check that we can also get the feed back.
           self.assertAllEqual(np_array,
-                              sess.run(feed_t, feed_dict={feed_t: np_array}))
+                              sess.run(feed_t, feed_dict={
+                                  feed_t: np_array
+                              }))
           # Also check that we can get both back.
-          out_v, feed_v = sess.run([out_t, feed_t],
-                                   feed_dict={feed_t: np_array})
+          out_v, feed_v = sess.run(
+              [out_t, feed_t], feed_dict={
+                  feed_t: np_array
+              })
           self.assertAllEqual(np_array, out_v)
           self.assertAllEqual(np_array, feed_v)
 
@@ -1257,9 +1297,11 @@ class SessionTest(test_util.TensorFlowTestCase):
           trace_level=config_pb2.RunOptions.FULL_TRACE)
       run_metadata = config_pb2.RunMetadata()
       self.assertEqual(0, len(run_metadata.step_stats.dev_stats))
-      self.assertAllClose(
-          42.0,
-          tensor_runner(41.0, options=run_options, run_metadata=run_metadata))
+      self.assertAllClose(42.0,
+                          tensor_runner(
+                              41.0,
+                              options=run_options,
+                              run_metadata=run_metadata))
       self.assertGreater(len(run_metadata.step_stats.dev_stats), 0)
 
   def testFeedError(self):
@@ -1296,8 +1338,9 @@ class SessionTest(test_util.TensorFlowTestCase):
         size = 1
         for s in shape:
           size *= s
-        c_list = np.array([compat.as_bytes(str(i)) for i in xrange(size)],
-                          dtype=np.object).reshape(shape) if size > 0 else []
+        c_list = np.array(
+            [compat.as_bytes(str(i)) for i in xrange(size)],
+            dtype=np.object).reshape(shape) if size > 0 else []
         c = constant_op.constant(c_list)
         self.assertAllEqual(c.eval(), c_list)
 
@@ -1307,13 +1350,16 @@ class SessionTest(test_util.TensorFlowTestCase):
         size = 1
         for s in shape:
           size *= s
-        c_list = np.array([compat.as_bytes(str(i)) for i in xrange(size)],
-                          dtype=np.object).reshape(shape)
+        c_list = np.array(
+            [compat.as_bytes(str(i)) for i in xrange(size)],
+            dtype=np.object).reshape(shape)
         feed_t = array_ops.placeholder(dtype=dtypes.string, shape=shape)
         c = array_ops.identity(feed_t)
         self.assertAllEqual(sess.run(c, feed_dict={feed_t: c_list}), c_list)
-        self.assertAllEqual(sess.run(feed_t, feed_dict={feed_t: c_list}),
-                            c_list)
+        self.assertAllEqual(
+            sess.run(feed_t, feed_dict={
+                feed_t: c_list
+            }), c_list)
         c_v, feed_v = sess.run([c, feed_t], feed_dict={feed_t: c_list})
         self.assertAllEqual(c_v, c_list)
         self.assertAllEqual(feed_v, c_list)
@@ -1329,8 +1375,10 @@ class SessionTest(test_util.TensorFlowTestCase):
 
   def testStringFeedWithUnicode(self):
     with session.Session():
-      c_list = [u'\n\x01\x00', u'\n\x00\x01',
-                u'\u26a3 unicode', u'\U0001f60e deal with it']
+      c_list = [
+          u'\n\x01\x00', u'\n\x00\x01', u'\u26a3 unicode',
+          u'\U0001f60e deal with it'
+      ]
       feed_t = array_ops.placeholder(dtype=dtypes.string, shape=[len(c_list)])
       c = array_ops.identity(feed_t)
 
@@ -1423,9 +1471,10 @@ class SessionTest(test_util.TensorFlowTestCase):
         sess.run(constant_op.constant(1.0), run_metadata=run_metadata)
         self.assertTrue(not run_metadata.HasField('step_stats'))
 
-        sess.run(constant_op.constant(1.0),
-                 options=run_options,
-                 run_metadata=run_metadata)
+        sess.run(
+            constant_op.constant(1.0),
+            options=run_options,
+            run_metadata=run_metadata)
 
         self.assertTrue(run_metadata.HasField('step_stats'))
         self.assertEquals(len(run_metadata.step_stats.dev_stats), 1)
@@ -1439,23 +1488,26 @@ class SessionTest(test_util.TensorFlowTestCase):
       with session.Session() as sess:
         # all combinations are valid
         sess.run(constant_op.constant(1.0), options=None, run_metadata=None)
-        sess.run(constant_op.constant(1.0), options=None,
-                 run_metadata=run_metadata)
+        sess.run(
+            constant_op.constant(1.0), options=None, run_metadata=run_metadata)
         self.assertTrue(not run_metadata.HasField('step_stats'))
 
-        sess.run(constant_op.constant(1.0), options=run_options,
-                 run_metadata=None)
+        sess.run(
+            constant_op.constant(1.0), options=run_options, run_metadata=None)
         self.assertTrue(not run_metadata.HasField('step_stats'))
 
-        sess.run(constant_op.constant(1.0), options=run_options,
-                 run_metadata=run_metadata)
+        sess.run(
+            constant_op.constant(1.0),
+            options=run_options,
+            run_metadata=run_metadata)
 
         self.assertTrue(run_metadata.HasField('step_stats'))
         self.assertEquals(len(run_metadata.step_stats.dev_stats), 1)
 
   def testFeedShapeCompatibility(self):
     # TODO(nolivia): C API doesn't yet handle marking nodes as not feedable.
-    if ops._USE_C_API: return
+    if ops._USE_C_API:
+      return
 
     with session.Session() as sess:
       some_tensor = constant_op.constant([2.0, 2.0, 2.0, 2.0])
@@ -1499,8 +1551,11 @@ class SessionTest(test_util.TensorFlowTestCase):
         d = math_ops.multiply(c, c)
       for step in xrange(120):
         run_metadata = config_pb2.RunMetadata()
-        sess.run(d, feed_dict={a: 1.0},
-                 options=run_options, run_metadata=run_metadata)
+        sess.run(
+            d,
+            feed_dict={a: 1.0},
+            options=run_options,
+            run_metadata=run_metadata)
         if step == 99:
           self.assertTrue(run_metadata.HasField('cost_graph'))
         else:
@@ -1569,8 +1624,7 @@ class SessionTest(test_util.TensorFlowTestCase):
 
   def testTimeoutWithShortOperations(self):
     num_epochs = 5
-    q = data_flow_ops.FIFOQueue(
-        capacity=50, dtypes=[dtypes.int32], shapes=[()])
+    q = data_flow_ops.FIFOQueue(capacity=50, dtypes=[dtypes.int32], shapes=[()])
     enqueue_op = q.enqueue_many(constant_op.constant([1, 2]))
 
     # Use a 10-second timeout, which should be longer than any
@@ -1582,7 +1636,9 @@ class SessionTest(test_util.TensorFlowTestCase):
       self.assertEqual(sess.run(q.size()), num_epochs * 2)
 
   def testRegisterFetchAndFeedConversionFunctions(self):
+
     class SquaredTensor(object):
+
       def __init__(self, tensor):
         self.sq = math_ops.square(tensor)
 
@@ -1591,24 +1647,27 @@ class SessionTest(test_util.TensorFlowTestCase):
     feed_fn2 = lambda feed: [feed.sq]
 
     session.register_session_run_conversion_functions(SquaredTensor, fetch_fn,
-        feed_fn1, feed_fn2)
+                                                      feed_fn1, feed_fn2)
     with self.assertRaises(ValueError):
-      session.register_session_run_conversion_functions(SquaredTensor,
-          fetch_fn, feed_fn1, feed_fn2)
+      session.register_session_run_conversion_functions(SquaredTensor, fetch_fn,
+                                                        feed_fn1, feed_fn2)
     with self.test_session() as sess:
       np1 = np.array([1.0, 1.5, 2.0, 2.5])
       np2 = np.array([3.0, 3.5, 4.0, 4.5])
       squared_tensor = SquaredTensor(np2)
       squared_eval = sess.run(squared_tensor)
       self.assertAllClose(np2 * np2, squared_eval)
-      squared_eval = sess.run(squared_tensor, feed_dict={
-        squared_tensor : np1 * np1})
+      squared_eval = sess.run(
+          squared_tensor, feed_dict={
+              squared_tensor: np1 * np1
+          })
       self.assertAllClose(np1 * np1, squared_eval)
       partial_run = sess.partial_run_setup([squared_tensor], [])
       squared_eval = sess.partial_run(partial_run, squared_tensor)
       self.assertAllClose(np2 * np2, squared_eval)
 
   def testDefaultLogDevicePlacement(self):
+
     class CaptureStderr(str):
       """Class to capture stderr from C++ shared library."""
 
@@ -1719,6 +1778,7 @@ class SessionTest(test_util.TensorFlowTestCase):
 
   def runTestAddFunctionToSession(self, target=''):
     """Add a function to a session after the graph has already been run."""
+
     @function.Defun(dtypes.float32)
     def foo(x):
       return x + 1
@@ -1753,6 +1813,7 @@ class SessionTest(test_util.TensorFlowTestCase):
           TypeError, 'Type of feed value 1 with type <(\w+) \'int\'> is not'):
         sess.run(a, feed_dict={a: 1})
 
+
 class GraphMutationTest(test_util.TensorFlowTestCase):
 
   def setUp(self):
@@ -1803,8 +1864,7 @@ class GraphMutationTest(test_util.TensorFlowTestCase):
 
     with session.Session(graph=g) as sess:
       self.assertAllEqual(1.0, sess.run(b))
-      b.op._set_attr('DstT',
-                     attr_value_pb2.AttrValue(type=types_pb2.DT_FLOAT))
+      b.op._set_attr('DstT', attr_value_pb2.AttrValue(type=types_pb2.DT_FLOAT))
       with self.assertRaisesRegexp(
           errors.FailedPreconditionError,
           'Cast.*was changed by setting attribute after it was run'):
diff --git a/tensorflow/python/data/kernel_tests/BUILD b/tensorflow/python/data/kernel_tests/BUILD
index 5fb389cf92818c7a464cf4a4479d86377185d5cf..8b8adefa65a5c54d40bc28d8f50953513cfd3605 100644
--- a/tensorflow/python/data/kernel_tests/BUILD
+++ b/tensorflow/python/data/kernel_tests/BUILD
@@ -59,7 +59,7 @@ tf_py_test(
 
 tf_py_test(
     name = "dataset_from_generator_op_test",
-    size = "small",
+    size = "medium",
     srcs = ["dataset_from_generator_op_test.py"],
     additional_deps = [
         "//third_party/py/numpy",
@@ -357,6 +357,9 @@ tf_py_test(
         "//tensorflow/python:session",
         "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python/data/ops:iterator_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:string_ops",
+        "//tensorflow/python:lookup_ops",
     ],
     grpc_enabled = True,
     tags = [
diff --git a/tensorflow/python/data/kernel_tests/iterator_ops_cluster_test.py b/tensorflow/python/data/kernel_tests/iterator_ops_cluster_test.py
index 45dfa13720b09c7bba979b72a339c13dcd2d827b..2c65c49ebdc1b58f8439f1362a1c55eb3b3ed2d9 100644
--- a/tensorflow/python/data/kernel_tests/iterator_ops_cluster_test.py
+++ b/tensorflow/python/data/kernel_tests/iterator_ops_cluster_test.py
@@ -21,6 +21,7 @@ from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import iterator_ops
+from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import function
@@ -28,6 +29,8 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import functional_ops
+from tensorflow.python.ops import lookup_ops
+from tensorflow.python.ops import string_ops
 from tensorflow.python.platform import test
 
 
@@ -103,6 +106,40 @@ class IteratorClusterTest(test.TestCase):
                                    "/job:worker/replica:0/task:1/cpu:0",
                                    workers[0].target)
 
+  def testCaptureHashTableInSharedIterator(self):
+    worker, _ = test_util.create_local_cluster(1, 1)
+
+    # NOTE(mrry): We must use the V2 variants of `HashTable`
+    # etc. because these produce a `tf.resource`-typed output that is
+    # compatible with the in-graph function implementation.
+    default_val = -1
+    keys = constant_op.constant(["brain", "salad", "surgery"])
+    values = constant_op.constant([0, 1, 2], dtypes.int64)
+    table = lookup_ops.HashTable(
+        lookup_ops.KeyValueTensorInitializer(keys, values),
+        default_val,
+        shared_name="shared_table")
+
+    input_sentences = dataset_ops.Dataset.from_tensor_slices(
+        ["brain brain tank salad surgery", "surgery brain"])
+
+    iterator = (
+        input_sentences.map(lambda x: string_ops.string_split([x]).values).map(
+            table.lookup)
+        .make_initializable_iterator(shared_name="shared_iterator"))
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with session.Session(worker[0].target) as sess:
+      sess.run(table.init)
+      sess.run(init_op)
+      self.assertAllEqual([0, 0, -1, 1, 2], sess.run(get_next))
+
+    with session.Session(worker[0].target) as sess:
+      self.assertAllEqual([2, 0], sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/ops/dataset_ops.py b/tensorflow/python/data/ops/dataset_ops.py
index c1ba67e4744c6282f0fd3d9a388aabc1ed51267b..c4b7e4919bbbdb4c2096f124b54c264fa62e3fab 100644
--- a/tensorflow/python/data/ops/dataset_ops.py
+++ b/tensorflow/python/data/ops/dataset_ops.py
@@ -41,8 +41,10 @@ from tensorflow.python.ops import gen_io_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import script_ops
 from tensorflow.python.util import deprecation
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("data.Dataset")
 class Dataset(object):
   """Represents a potentially large set of elements.
 
@@ -556,6 +558,8 @@ class Dataset(object):
         - /path/to/dir/b.py
         - /path/to/dir/c.py
 
+    NOTE: The order of the file names returned can be non-deterministic.
+
     Args:
       file_pattern: A string or scalar string `tf.Tensor`, representing
         the filename pattern that will be matched.
@@ -899,10 +903,11 @@ class Dataset(object):
 
     Args:
       transformation_func: A function that takes one `Dataset` argument and
-        returns a `Dataset`.
+          returns a `Dataset`.
 
     Returns:
-      Dataset: The `Dataset` returned by applying `transformation_func` to this dataset.
+      Dataset: The `Dataset` returned by applying `transformation_func` to this
+          dataset.
     """
     dataset = transformation_func(self)
     if not isinstance(dataset, Dataset):
@@ -1454,6 +1459,19 @@ def _padding_value_to_tensor(value, output_type):
   return value
 
 
+def _default_padding(input_dataset):
+
+  def make_zero(t):
+    if t.base_dtype == dtypes.string:
+      return ""
+    elif t.base_dtype == dtypes.variant:
+      raise TypeError("Unable to create padding for field of type 'variant'")
+    else:
+      return np.zeros_like(t.as_numpy_dtype())
+
+  return nest.map_structure(make_zero, input_dataset.output_types)
+
+
 class PaddedBatchDataset(Dataset):
   """A `Dataset` that batches and pads contiguous elements from its input."""
 
@@ -1469,23 +1487,13 @@ class PaddedBatchDataset(Dataset):
         batch_size, dtype=dtypes.int64, name="batch_size")
     padding_values = (
         padding_values
-        if padding_values is not None else self._default_padding(input_dataset))
+        if padding_values is not None else _default_padding(input_dataset))
     self._padded_shapes = nest.map_structure_up_to(
         input_dataset.output_shapes, _partial_shape_to_tensor, padded_shapes)
     self._padding_values = nest.map_structure_up_to(
         input_dataset.output_shapes, _padding_value_to_tensor, padding_values,
         input_dataset.output_types)
 
-  def _default_padding(self, input_dataset):
-
-    def make_zero(t):
-      if t.base_dtype == dtypes.string:
-        return ""
-      else:
-        return np.zeros_like(t.as_numpy_dtype())
-
-    return nest.map_structure(make_zero, input_dataset.output_types)
-
   def _as_variant_tensor(self):
     return gen_dataset_ops.padded_batch_dataset(
         self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
diff --git a/tensorflow/python/data/ops/iterator_ops.py b/tensorflow/python/data/ops/iterator_ops.py
index 0cbdb3ab19d8f1b966a867dfcf709c1a4a49b871..e573fe01928b77dea55a782e4e86a00873346f07 100644
--- a/tensorflow/python/data/ops/iterator_ops.py
+++ b/tensorflow/python/data/ops/iterator_ops.py
@@ -25,6 +25,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import gen_dataset_ops
+from tensorflow.python.util.tf_export import tf_export
 
 
 # NOTE(mrry): It is legitimate to call `Iterator.get_next()` multiple
@@ -47,6 +48,7 @@ GET_NEXT_CALL_WARNING_MESSAGE = (
     "`next_element` inside the loop.")
 
 
+@tf_export("data.Iterator")
 class Iterator(object):
   """Represents the state of iterating through a `Dataset`."""
 
@@ -165,8 +167,10 @@ class Iterator(object):
     iterator_resource = gen_dataset_ops.iterator(
         container="",
         shared_name=shared_name,
-        output_types=nest.flatten(output_types),
-        output_shapes=nest.flatten(output_shapes))
+        output_types=nest.flatten(
+            sparse.as_dense_types(output_types, output_classes)),
+        output_shapes=nest.flatten(
+            sparse.as_dense_shapes(output_shapes, output_classes)))
     return Iterator(iterator_resource, None, output_types, output_shapes,
                     output_classes)
 
@@ -232,8 +236,10 @@ class Iterator(object):
     string_handle = ops.convert_to_tensor(string_handle, dtype=dtypes.string)
     iterator_resource = gen_dataset_ops.iterator_from_string_handle(
         string_handle,
-        output_types=nest.flatten(output_types),
-        output_shapes=nest.flatten(output_shapes))
+        output_types=nest.flatten(
+            sparse.as_dense_types(output_types, output_classes)),
+        output_shapes=nest.flatten(
+            sparse.as_dense_shapes(output_shapes, output_classes)))
     return Iterator(iterator_resource, None, output_types, output_shapes,
                     output_classes)
 
diff --git a/tensorflow/python/data/ops/readers.py b/tensorflow/python/data/ops/readers.py
index 830dc5cec4a54469d001f0ba57d1adc7bc5efd11..fa7601741b11f018e9b53ed3b77a7561be50d3f4 100644
--- a/tensorflow/python/data/ops/readers.py
+++ b/tensorflow/python/data/ops/readers.py
@@ -23,12 +23,14 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import gen_dataset_ops
+from tensorflow.python.util.tf_export import tf_export
 
 
 # TODO(b/64974358): Increase default buffer size to 256 MB.
 _DEFAULT_READER_BUFFER_SIZE_BYTES = 256 * 1024  # 256 KB
 
 
+@tf_export("data.TextLineDataset")
 class TextLineDataset(Dataset):
   """A `Dataset` comprising lines from one or more text files."""
 
@@ -71,6 +73,7 @@ class TextLineDataset(Dataset):
     return dtypes.string
 
 
+@tf_export("data.TFRecordDataset")
 class TFRecordDataset(Dataset):
   """A `Dataset` comprising records from one or more TFRecord files."""
 
@@ -115,6 +118,7 @@ class TFRecordDataset(Dataset):
     return dtypes.string
 
 
+@tf_export("data.FixedLengthRecordDataset")
 class FixedLengthRecordDataset(Dataset):
   """A `Dataset` of fixed-length records from one or more binary files."""
 
diff --git a/tensorflow/python/data/util/nest.py b/tensorflow/python/data/util/nest.py
index 2455395635c4c8fa5d157a38d4e7a118f554fd9f..e90ce3fb40af68fb68d6ee8bac6892848d8c5a79 100644
--- a/tensorflow/python/data/util/nest.py
+++ b/tensorflow/python/data/util/nest.py
@@ -266,7 +266,7 @@ def map_structure(func, *structure, **check_types_dict):
   and the return value will contain the results in the same structure.
 
   Args:
-    func: A callable that acceps as many arguments are there are structures.
+    func: A callable that accepts as many arguments are there are structures.
     *structure: scalar, or tuple or list of constructed scalars and/or other
       tuples/lists, or scalars.  Note: numpy arrays are considered scalars.
     **check_types_dict: only valid keyword argument is `check_types`. If set to
@@ -383,8 +383,8 @@ def assert_shallow_structure(shallow_tree, input_tree, check_types=True):
             "structure has keys %s, while shallow structure has keys %s." %
             (list(_six.iterkeys(input_tree)),
              list(_six.iterkeys(shallow_tree))))
-      input_tree = list(_six.iteritems(input_tree))
-      shallow_tree = list(_six.iteritems(shallow_tree))
+      input_tree = list(sorted(_six.iteritems(input_tree)))
+      shallow_tree = list(sorted(_six.iteritems(shallow_tree)))
 
     for shallow_branch, input_branch in zip(shallow_tree, input_tree):
       assert_shallow_structure(shallow_branch, input_branch,
@@ -479,8 +479,8 @@ def map_structure_up_to(shallow_tree, func, *inputs):
   The `inputs`, can be thought of as having the same structure as
   `shallow_tree`, but with leaf nodes that are themselves tree structures.
 
-  This function therefore will return something with the same base structure as
-  `shallow_tree`.
+  This function, therefore, will return something with the same base structure
+  as `shallow_tree`.
 
   Examples:
 
diff --git a/tensorflow/python/data/util/nest_test.py b/tensorflow/python/data/util/nest_test.py
index 90dd7dfe7775b2f10611e5579784fbda63fc9669..ff380815a4a32192de621888199e66355f9b4635 100644
--- a/tensorflow/python/data/util/nest_test.py
+++ b/tensorflow/python/data/util/nest_test.py
@@ -277,6 +277,10 @@ class NestTest(test.TestCase):
     with self.assertRaisesRegexp(ValueError, expected_message):
       nest.assert_shallow_structure(inp_ab2, inp_ab1)
 
+    inp_ab = collections.OrderedDict([("a", 1), ("b", (2, 3))])
+    inp_ba = collections.OrderedDict([("b", (2, 3)), ("a", 1)])
+    nest.assert_shallow_structure(inp_ab, inp_ba)
+
   def testFlattenUpTo(self):
     input_tree = (((2, 2), (3, 3)), ((4, 9), (5, 5)))
     shallow_tree = ((True, True), (False, True))
diff --git a/tensorflow/python/data/util/sparse.py b/tensorflow/python/data/util/sparse.py
index 5ebcb4ea81b23b60dc46bae78bfa792f4a8ab6d8..5e6d22470978d97c5e73640e86d3f8b82cbc1b60 100644
--- a/tensorflow/python/data/util/sparse.py
+++ b/tensorflow/python/data/util/sparse.py
@@ -141,7 +141,7 @@ def serialize_sparse_tensors(tensors):
     tensors: a tensor structure to serialize.
 
   Returns:
-    `tensors` with any sparse tensors replaced by the their serialized version.
+    `tensors` with any sparse tensors replaced by their serialized version.
   """
 
   ret = nest.pack_sequence_as(tensors, [
diff --git a/tensorflow/python/debug/cli/tensor_format.py b/tensorflow/python/debug/cli/tensor_format.py
index d4aea76d652e7606939f3d8a89ff0378da0774d2..e0759a8bc1ab271906fc4ec75b55529f8a0d2b74 100644
--- a/tensorflow/python/debug/cli/tensor_format.py
+++ b/tensorflow/python/debug/cli/tensor_format.py
@@ -535,7 +535,7 @@ def numeric_summary(tensor):
   if not isinstance(tensor, np.ndarray) or not np.size(tensor):
     return debugger_cli_common.RichTextLines([
         "No numeric summary available due to empty tensor."])
-  elif (np.issubdtype(tensor.dtype, np.float) or
+  elif (np.issubdtype(tensor.dtype, np.floating) or
         np.issubdtype(tensor.dtype, np.complex) or
         np.issubdtype(tensor.dtype, np.integer)):
     counts = [
diff --git a/tensorflow/python/debug/examples/debug_fibonacci.py b/tensorflow/python/debug/examples/debug_fibonacci.py
index 704dbda357d1208d0663da41eb7aef4b299dedb8..3821b393ec6847db71b7c4b7396b1ed448ae9538 100644
--- a/tensorflow/python/debug/examples/debug_fibonacci.py
+++ b/tensorflow/python/debug/examples/debug_fibonacci.py
@@ -44,6 +44,10 @@ def main(_):
   sess.run(tf.global_variables_initializer())
 
   # Wrap the TensorFlow Session object for debugging.
+  if FLAGS.debug and FLAGS.tensorboard_debug_address:
+    raise ValueError(
+        "The --debug and --tensorboard_debug_address flags are mutually "
+        "exclusive.")
   if FLAGS.debug:
     sess = tf_debug.LocalCLIDebugWrapperSession(sess)
 
@@ -52,6 +56,9 @@ def main(_):
 
     sess.add_tensor_filter("has_inf_or_nan", tf_debug.has_inf_or_nan)
     sess.add_tensor_filter("has_negative", has_negative)
+  elif FLAGS.tensorboard_debug_address:
+    sess = tf_debug.TensorBoardDebugWrapperSession(
+        sess, FLAGS.tensorboard_debug_address)
 
   print("Fibonacci number at position %d:\n%s" %
         (FLAGS.length, sess.run(n1)))
@@ -82,7 +89,15 @@ if __name__ == "__main__":
       "--debug",
       dest="debug",
       action="store_true",
-      help="Use TensorFlow Debugger (tfdbg).")
+      help="Use TensorFlow Debugger (tfdbg). Mutually exclusive with the "
+      "--tensorboard_debug_address flag.")
+  parser.add_argument(
+      "--tensorboard_debug_address",
+      type=str,
+      default=None,
+      help="Connect to the TensorBoard Debugger Plugin backend specified by "
+      "the gRPC address (e.g., localhost:1234). Mutually exclusive with the "
+      "--debug flag.")
 
   FLAGS, unparsed = parser.parse_known_args()
   tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tensorflow/python/debug/examples/debug_mnist.py b/tensorflow/python/debug/examples/debug_mnist.py
index 0a6dbf311d8e7a0377363d74b57ef2b1d7d00e1d..ab1c90371cd18bbaf278b72248bcc7e9e9c34b06 100644
--- a/tensorflow/python/debug/examples/debug_mnist.py
+++ b/tensorflow/python/debug/examples/debug_mnist.py
@@ -120,8 +120,15 @@ def main(_):
 
   sess.run(tf.global_variables_initializer())
 
+  if FLAGS.debug and FLAGS.tensorboard_debug_address:
+    raise ValueError(
+        "The --debug and --tensorboard_debug_address flags are mutually "
+        "exclusive.")
   if FLAGS.debug:
     sess = tf_debug.LocalCLIDebugWrapperSession(sess, ui_type=FLAGS.ui_type)
+  elif FLAGS.tensorboard_debug_address:
+    sess = tf_debug.TensorBoardDebugWrapperSession(
+        sess, FLAGS.tensorboard_debug_address)
 
   # Add this point, sess is a debug wrapper around the actual Session if
   # FLAGS.debug is true. In that case, calling run() will launch the CLI.
@@ -173,6 +180,14 @@ if __name__ == "__main__":
       nargs="?",
       const=True,
       default=False,
-      help="Use debugger to track down bad values during training")
+      help="Use debugger to track down bad values during training. "
+      "Mutually exclusive with the --tensorboard_debug_address flag.")
+  parser.add_argument(
+      "--tensorboard_debug_address",
+      type=str,
+      default=None,
+      help="Connect to the TensorBoard Debugger Plugin backend specified by "
+      "the gRPC address (e.g., localhost:1234). Mutually exclusive with the "
+      "--debug flag.")
   FLAGS, unparsed = parser.parse_known_args()
   tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tensorflow/python/debug/examples/debug_tflearn_iris.py b/tensorflow/python/debug/examples/debug_tflearn_iris.py
index 92314d8dd9f64f48ffe0bc921f99a4661c4c0e93..4f4666ee4fa51ef085d31ee8396dffaf9e38f49e 100644
--- a/tensorflow/python/debug/examples/debug_tflearn_iris.py
+++ b/tensorflow/python/debug/examples/debug_tflearn_iris.py
@@ -110,10 +110,16 @@ def main(_):
       model_dir=model_dir)
 
   hooks = None
+  if FLAGS.debug and FLAGS.tensorboard_debug_address:
+    raise ValueError(
+        "The --debug and --tensorboard_debug_address flags are mutually "
+        "exclusive.")
   if FLAGS.debug:
     debug_hook = tf_debug.LocalCLIDebugHook(ui_type=FLAGS.ui_type,
                                             dump_root=FLAGS.dump_root)
-    hooks = [debug_hook]
+  elif FLAGS.tensorboard_debug_address:
+    debug_hook = tf_debug.TensorBoardDebugHook(FLAGS.tensorboard_debug_address)
+  hooks = [debug_hook]
 
   if not FLAGS.use_experiment:
     # Fit model.
@@ -185,11 +191,19 @@ if __name__ == "__main__":
       nargs="?",
       const=True,
       default=False,
-      help="Use debugger to track down bad values during training")
+      help="Use debugger to track down bad values during training. "
+      "Mutually exclusive with the --tensorboard_debug_address flag.")
   parser.add_argument(
       "--dump_root",
       type=str,
       default="",
       help="Optional custom root directory for temporary debug dump data")
+  parser.add_argument(
+      "--tensorboard_debug_address",
+      type=str,
+      default=None,
+      help="Connect to the TensorBoard Debugger Plugin backend specified by "
+      "the gRPC address (e.g., localhost:1234). Mutually exclusive with the "
+      "--debug flag.")
   FLAGS, unparsed = parser.parse_known_args()
   tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tensorflow/python/debug/lib/debug_data.py b/tensorflow/python/debug/lib/debug_data.py
index c4b13a1045dac4966b0e841155a2932216881d34..8d355aa27f6fa10a1889420a9087800be12a81ce 100644
--- a/tensorflow/python/debug/lib/debug_data.py
+++ b/tensorflow/python/debug/lib/debug_data.py
@@ -222,7 +222,7 @@ def has_inf_or_nan(datum, tensor):
     # Also return False for data types that cannot be represented as numpy
     # arrays.
     return False
-  elif (np.issubdtype(tensor.dtype, np.float) or
+  elif (np.issubdtype(tensor.dtype, np.floating) or
         np.issubdtype(tensor.dtype, np.complex) or
         np.issubdtype(tensor.dtype, np.integer)):
     return np.any(np.isnan(tensor)) or np.any(np.isinf(tensor))
diff --git a/tensorflow/python/debug/lib/debug_gradients_test.py b/tensorflow/python/debug/lib/debug_gradients_test.py
index 6fd89e018aa3b2a21dad4b56a4aa1a5b01a1d69d..c1e9869d978e4f5ddfd3cd5f1abd7f5c97b7ca88 100644
--- a/tensorflow/python/debug/lib/debug_gradients_test.py
+++ b/tensorflow/python/debug/lib/debug_gradients_test.py
@@ -22,6 +22,7 @@ import shutil
 import tempfile
 
 from tensorflow.core.protobuf import config_pb2
+from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.python.client import session
 from tensorflow.python.debug.lib import debug_data
 from tensorflow.python.debug.lib import debug_gradients
@@ -38,8 +39,12 @@ from tensorflow.python.training import gradient_descent
 class IdentifyGradientTest(test_util.TensorFlowTestCase):
 
   def setUp(self):
-    self.sess = session.Session()
-    with self.sess:
+    rewriter_config = rewriter_config_pb2.RewriterConfig(
+        dependency_optimization=rewriter_config_pb2.RewriterConfig.OFF)
+    graph_options = config_pb2.GraphOptions(rewrite_options=rewriter_config)
+    config = config_pb2.ConfigProto(graph_options=graph_options)
+    self.sess = session.Session(config=config)
+    with self.sess.as_default():
       self.u = variables.Variable(2.0, name="u")
       self.v = variables.Variable(3.0, name="v")
       self.w = math_ops.multiply(self.u.value(), self.v.value(), name="w")
diff --git a/tensorflow/python/debug/lib/session_debug_grpc_test.py b/tensorflow/python/debug/lib/session_debug_grpc_test.py
index 367b3535450ac4bd17d4c5dba0eaf149aa4b68b3..b623ee31c5dc59894373ec7952e53acd0f6e1126 100644
--- a/tensorflow/python/debug/lib/session_debug_grpc_test.py
+++ b/tensorflow/python/debug/lib/session_debug_grpc_test.py
@@ -54,7 +54,8 @@ from tensorflow.python.training import monitored_session
 def no_rewrite_session_config():
   rewriter_config = rewriter_config_pb2.RewriterConfig(
       disable_model_pruning=True,
-      arithmetic_optimization=rewriter_config_pb2.RewriterConfig.OFF)
+      arithmetic_optimization=rewriter_config_pb2.RewriterConfig.OFF,
+      dependency_optimization=rewriter_config_pb2.RewriterConfig.OFF)
   graph_options = config_pb2.GraphOptions(rewrite_options=rewriter_config)
   return config_pb2.ConfigProto(graph_options=graph_options)
 
diff --git a/tensorflow/python/debug/wrappers/dumping_wrapper_test.py b/tensorflow/python/debug/wrappers/dumping_wrapper_test.py
index acea9433e22203d56f4ceb6cd92b681e35876a09..254201c39371e2034b08fad927e98418c8086ea5 100644
--- a/tensorflow/python/debug/wrappers/dumping_wrapper_test.py
+++ b/tensorflow/python/debug/wrappers/dumping_wrapper_test.py
@@ -389,6 +389,11 @@ class DumpingDebugWrapperSessionTest(test_util.TensorFlowTestCase):
         r"mode\."):
       sess.invoke_node_stepper(node_stepper)
 
+  def testDumpingWrapperWithEmptyFetchWorks(self):
+    sess = dumping_wrapper.DumpingDebugWrapperSession(
+        self.sess, session_root=self.session_root, log_usage=False)
+    sess.run([])
+
 
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/python/debug/wrappers/framework.py b/tensorflow/python/debug/wrappers/framework.py
index 909150eb6aa21b45af39f7cbfd6248c701ae1fb5..c530204bbf6959f56a72c6e67add91f1e575f067 100644
--- a/tensorflow/python/debug/wrappers/framework.py
+++ b/tensorflow/python/debug/wrappers/framework.py
@@ -121,7 +121,9 @@ from tensorflow.python.debug.lib import debug_utils
 from tensorflow.python.debug.lib import stepper
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
+from tensorflow.python.platform import tf_logging
 from tensorflow.python.training import monitored_session
+from tensorflow.python.util import nest
 
 
 # Helper function.
@@ -439,7 +441,12 @@ class BaseDebugWrapperSession(session.SessionInterface):
             "callable_runner and fetches/feed_dict are mutually exclusive, but "
             "are used simultaneously.")
 
-    if self._is_disabled_thread():
+    empty_fetches = not nest.flatten(fetches)
+    if empty_fetches:
+      tf_logging.info(
+          "Due to empty fetches, tfdbg Session wrapper is letting a "
+          "Session.run pass through without any debugging actions.")
+    if self._is_disabled_thread() or empty_fetches:
       if callable_runner:
         return callable_runner(*callable_runner_args)
       else:
diff --git a/tensorflow/python/debug/wrappers/hooks.py b/tensorflow/python/debug/wrappers/hooks.py
index 989ad801e53615f7bd26b8b4fb850b8a56cd193c..0204254ccab109f4844f077df78902872d1156d5 100644
--- a/tensorflow/python/debug/wrappers/hooks.py
+++ b/tensorflow/python/debug/wrappers/hooks.py
@@ -35,10 +35,7 @@ class LocalCLIDebugHook(session_run_hook.SessionRunHook):
   `tf.contrib.learn`'s `Estimator`s and `Experiment`s.
   """
 
-  def __init__(self,
-               ui_type="curses",
-               dump_root=None,
-               thread_name_filter=None):
+  def __init__(self, ui_type="curses", dump_root=None, thread_name_filter=None):
     """Create a local debugger command-line interface (CLI) hook.
 
     Args:
@@ -62,7 +59,8 @@ class LocalCLIDebugHook(session_run_hook.SessionRunHook):
     """Add a tensor filter.
 
     See doc of `LocalCLIDebugWrapperSession.add_tensor_filter()` for details.
-    Override default behavior to accommodate the possibility of this method being
+    Override default behavior to accommodate the possibility of this method
+    being
     called prior to the initialization of the underlying
     `LocalCLIDebugWrapperSession` object.
 
@@ -137,9 +135,7 @@ class LocalCLIDebugHook(session_run_hook.SessionRunHook):
       # pylint: enable=protected-access
 
       with stepper.NodeStepper(
-          run_context.session,
-          run_context.original_args.
-          fetches,
+          run_context.session, run_context.original_args.fetches,
           run_context.original_args.feed_dict) as node_stepper:
         self._session_wrapper.invoke_node_stepper(
             node_stepper, restore_variable_values_on_exit=True)
@@ -149,8 +145,8 @@ class LocalCLIDebugHook(session_run_hook.SessionRunHook):
   def after_run(self, run_context, run_values):
     # Adapt run_context and run_values to OnRunEndRequest and invoke superclass
     # on_run_end()
-    on_run_end_request = framework.OnRunEndRequest(
-        self._performed_action, run_values.run_metadata)
+    on_run_end_request = framework.OnRunEndRequest(self._performed_action,
+                                                   run_values.run_metadata)
     self._session_wrapper.on_run_end(on_run_end_request)
 
 
@@ -260,8 +256,8 @@ class GrpcDebugHook(session_run_hook.SessionRunHook):
     self._thread_name_filter = thread_name_filter
     self._grpc_debug_server_addresses = (
         grpc_debug_server_addresses
-        if isinstance(grpc_debug_server_addresses, list)
-        else [grpc_debug_server_addresses])
+        if isinstance(grpc_debug_server_addresses, list) else
+        [grpc_debug_server_addresses])
 
     self._watch_fn = watch_fn
     self._log_usage = log_usage
@@ -334,6 +330,7 @@ class TensorBoardDebugHook(GrpcDebugHook):
       log_usage: Whether the usage of this class is to be logged (if
         applicable).
     """
+
     def _gated_grpc_watch_fn(fetches, feeds):
       del fetches, feeds  # Unused.
       return framework.WatchOptions(
diff --git a/tensorflow/python/debug/wrappers/local_cli_wrapper_test.py b/tensorflow/python/debug/wrappers/local_cli_wrapper_test.py
index 770a496aa9d2f4bb8bee0f51526ba8c3d4278b81..490812c96d83791cdc20c56f16c968f1a1851af8 100644
--- a/tensorflow/python/debug/wrappers/local_cli_wrapper_test.py
+++ b/tensorflow/python/debug/wrappers/local_cli_wrapper_test.py
@@ -664,6 +664,20 @@ class LocalCLIDebugWrapperSessionTest(test_util.TensorFlowTestCase):
         [["run"], ["run"]], monitored_sess)
     self.assertFalse(wrapped_monitored_sess.should_stop())
 
+  def testRunsWithEmptyFetchWorks(self):
+    wrapped_sess = LocalCLIDebuggerWrapperSessionForTest(
+        [["run"]], self.sess, dump_root="")
+
+    run_output = wrapped_sess.run([])
+    self.assertEqual([], run_output)
+
+  def testRunsWithEmptyNestedFetchWorks(self):
+    wrapped_sess = LocalCLIDebuggerWrapperSessionForTest(
+        [["run"]], self.sess, dump_root="")
+
+    run_output = wrapped_sess.run({"foo": {"baz": []}, "bar": ()})
+    self.assertEqual({"foo": {"baz": []}, "bar": ()}, run_output)
+
 
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/python/eager/BUILD b/tensorflow/python/eager/BUILD
index 9e3382d4f301529cd2b476bc76efe7dfd2be9298..ab81d40148476735492890f608315b19eaa0a33f 100644
--- a/tensorflow/python/eager/BUILD
+++ b/tensorflow/python/eager/BUILD
@@ -206,29 +206,6 @@ cc_library(
     ],
 )
 
-cc_library(
-    name = "python_eager_op_gen_main",
-    srcs = [
-        "python_eager_op_gen_main.cc",
-    ],
-    visibility = ["//visibility:public"],
-    deps = [
-        ":python_eager_op_gen",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:op_gen_lib",
-        "//tensorflow/core:protos_all_cc",
-    ],
-)
-
-tf_cc_binary(
-    name = "python_eager_op_gen_demo",
-    deps = [
-        ":python_eager_op_gen_main",
-        "//tensorflow/core:ops",
-    ],
-)
-
 py_library(
     name = "custom_gradient",
     srcs = ["custom_gradient.py"],
diff --git a/tensorflow/python/eager/backprop.py b/tensorflow/python/eager/backprop.py
index a2a3e230bbb4232fe916c658a6b0ac8d6d33658d..d79d1fc0a6400a894293f3254d5cac5a10661e13 100644
--- a/tensorflow/python/eager/backprop.py
+++ b/tensorflow/python/eager/backprop.py
@@ -734,7 +734,7 @@ def _num_elements(grad):
   raise ValueError("`grad` not a Tensor or IndexedSlices.")
 
 
-_last_shape_dtype = [None, None]
+_last_zero_shape_dtype = [None, None]
 _last_zero = [None]
 
 
@@ -748,13 +748,15 @@ def _zeros(shape, dtype):
     # TODO(apassos): need to save enough information about variant tensors to do
     # a zeros
     return None
-  if [shape, dtype] != _last_shape_dtype:
-    _last_shape_dtype[:] = [shape, dtype]
+  if [shape, dtype] != _last_zero_shape_dtype:
+    _last_zero_shape_dtype[:] = [shape, dtype]
     _last_zero[0] = _fast_fill(0, shape, dtype)
   return _last_zero[0]
 
 
 def _ones(shape, dtype):
+  if shape == ():  # pylint: disable=g-explicit-bool-comparison
+    return constant_op.constant(1, dtype=dtype)
   return _fast_fill(1, shape, dtype)
 
 
diff --git a/tensorflow/python/eager/execution_callbacks.py b/tensorflow/python/eager/execution_callbacks.py
index 2f1654dda499583fe4766cbe2e330399defc96fd..988442c971f7bf978f1848278fd4955d79428fc5 100644
--- a/tensorflow/python/eager/execution_callbacks.py
+++ b/tensorflow/python/eager/execution_callbacks.py
@@ -153,7 +153,7 @@ def inf_nan_callback(op_type,
       continue
 
     numpy_dtype = output.dtype.as_numpy_dtype
-    if (np.issubdtype(numpy_dtype, np.float) or
+    if (np.issubdtype(numpy_dtype, np.floating) or
         np.issubdtype(numpy_dtype, np.complex) or
         np.issubdtype(numpy_dtype, np.integer)):
       try:
diff --git a/tensorflow/python/eager/function.py b/tensorflow/python/eager/function.py
index 81b1f6f12a1899ddccb711a81122905bfd363748..f5d0759bdcb6ed3955a53188fa3147eb12659a13 100644
--- a/tensorflow/python/eager/function.py
+++ b/tensorflow/python/eager/function.py
@@ -292,6 +292,22 @@ def _map_sequence_obj_to_idx(sequence):
   return {id(x): i for i, x in enumerate(sequence)}
 
 
+def _flatten(sequence):
+  """A wrapper around `nest.flatten` that also unpacks `IndexedSlices`."""
+  # TODO(akshayka): Support `SparseTensor` in a similar fashion.
+  flat_sequence = nest.flatten(sequence)
+  outputs = []
+  for item in flat_sequence:
+    if isinstance(item, ops.IndexedSlices):
+      if item.dense_shape is not None:
+        outputs.extend([item.values, item.indices, item.dense_shape])
+      else:
+        outputs.extend([item.values, item.indices])
+    else:
+      outputs.append(item)
+  return outputs
+
+
 class GraphModeFunction(object):
   """Callable object representing a graph-mode function.
 
@@ -333,14 +349,14 @@ class GraphModeFunction(object):
     self._input_placeholders = input_placeholders
     self._extra_inputs = list(extra_inputs)
     self._graph = graph
-    self._has_backprop = False
+    self._backward_function = None
     self._func_name = name
     self._function_def = defined_function
     self._num_outputs = len(defined_function.signature.output_arg)
     self._ops = operations
     self._func_outputs = func_outputs
     self._returns = [func_outputs] if isinstance(
-        func_outputs, (ops.Tensor, type(None))) else list(func_outputs)
+        func_outputs, (ops.Tensor, type(None))) else _flatten(func_outputs)
     self._output_shapes = output_shapes
     self._variables = variables if variables is not None else []
 
@@ -348,9 +364,8 @@ class GraphModeFunction(object):
   def variables(self):
     return self._variables
 
-  def _compute_backprop(self):
-    """Computes the backprop function object for this function."""
-    self._has_backprop = True
+  def _construct_backprop_function(self):
+    """Constructs the backprop function object for this function."""
     with self._graph.as_default(), context.graph_mode():
       c = _CapturingContext()
       with c:
@@ -361,13 +376,16 @@ class GraphModeFunction(object):
             filtered_outputs,
             self._input_placeholders,
             grad_ys=self._out_grad_placeholders)
-        shapes = tuple(x.shape for x in in_gradients if x is not None)
+
+    backward_outputs = tuple(
+        grad for grad in _flatten(in_gradients) if grad is not None)
+    output_shapes = tuple(grad.shape for grad in backward_outputs)
+
     captures = list(sorted(c.captured_tensors, key=lambda x: x.name))
     forward_name = _forward_name(self._func_name)
     self._forward_fdef = _EagerDefinedFunction(
         forward_name, self._graph, self._ops, self._input_placeholders,
         filtered_outputs + captures)
-    backward_outputs = tuple(x for x in in_gradients if x is not None)
     all_inputs = self._out_grad_placeholders + captures
     # Excluding input ops from the body as we do not intend to execute these
     # operations when the function is executed.
@@ -381,7 +399,7 @@ class GraphModeFunction(object):
     bname = _backward_name(self._func_name)
     self._backward_function = GraphModeFunction(
         bname, all_inputs, [], self._graph, function_def_ops,
-        backward_outputs, in_gradients, shapes)
+        backward_outputs, in_gradients, output_shapes)
 
   def _backprop_call(self, args):
     """Calls the wrapped function and records the result on a tape."""
@@ -426,9 +444,24 @@ class GraphModeFunction(object):
 
   @property
   def output_shapes(self):
+    """The function's output shapes."""
     # TODO(ebrevdo): Should we only keep the output shapes associated
     # with len(self._returns) outputs?
-    return nest.pack_sequence_as(self._func_outputs, self._output_shapes)
+    outputs_list = nest.flatten(self._func_outputs)
+    j = 0
+    for i, o in enumerate(outputs_list):
+      if o is not None:
+        if isinstance(o, ops.IndexedSlices):
+          # Extract the shape of the `IndexedSlices` object's `values` field.
+          outputs_list[i] = self._output_shapes[j]  # the `values` shape
+          if o.dense_shape is not None:
+            j += 3  # skip over shapes for `values`, `indices`, `dense_shape`
+          else:
+            j += 2  # skip over shapes for `values`, `indices`
+        else:
+          outputs_list[i] = self._output_shapes[j]
+          j += 1
+    return nest.pack_sequence_as(self._func_outputs, outputs_list)
 
   @property
   def output_dtypes(self):
@@ -457,12 +490,11 @@ class GraphModeFunction(object):
       if v._trainable:  # pylint: disable=protected-access
         tape.watch_variable(v)
 
-    tensor_inputs = [x for x in nest.flatten(args)
-                     if isinstance(x, ops.Tensor)]
+    tensor_inputs = [x for x in nest.flatten(args) if isinstance(x, ops.Tensor)]
     if tape.should_record(tensor_inputs) or tape.should_record(
         self._extra_inputs):
-      if not self._has_backprop:
-        self._compute_backprop()
+      if self._backward_function is None:
+        self._construct_backprop_function()
       return self._backprop_call(tensor_inputs)
 
     ctx = context.context()
@@ -503,13 +535,30 @@ class GraphModeFunction(object):
     """
     if self._func_outputs is None:
       return None
+    # Use `nest.flatten` instead of `_flatten` in order to preserve any
+    # IndexedSlices in `self._func_outputs`.
     outputs_list = nest.flatten(self._func_outputs)
     j = 0
     for i, o in enumerate(outputs_list):
       if o is not None:
-        outputs_list[i] = result[j]
-        j += 1
-    return nest.pack_sequence_as(self._func_outputs, outputs_list)
+        if isinstance(o, ops.IndexedSlices):
+          # Repack Tensors for IndexedSlices.
+          if o.dense_shape is not None:
+            outputs_list[i] = ops.IndexedSlices(
+                values=result[j],
+                indices=result[j + 1],
+                dense_shape=result[j + 2])
+            j += 3
+          else:
+            outputs_list[i] = ops.IndexedSlices(
+                values=result[j],
+                indices=result[j + 1])
+            j += 2
+        else:
+          outputs_list[i] = result[j]
+          j += 1
+    ret = nest.pack_sequence_as(self._func_outputs, outputs_list)
+    return ret
 
 
 def _get_defun_inputs(args):
@@ -555,7 +604,7 @@ def _defun_internal(name, func, args, kwds):
 
         # Returning a closed-over tensor as an output does not trigger a
         # call to convert_to_tensor, so we manually capture all such tensors.
-        outputs_list = nest.flatten(func_outputs)
+        outputs_list = _flatten(func_outputs)
         func_def_outputs = [
             _convert_to_graph_tensor(x) for x in outputs_list if x is not None
         ]
@@ -600,6 +649,18 @@ def _cache_key(x):
   """Cache key for tfe functions."""
   if isinstance(x, ops.Tensor):
     return _TensorDtype(x.dtype, x._shape_tuple())  # pylint: disable=protected-access
+  if isinstance(x, ops.IndexedSlices):
+    if x.dense_shape is not None:
+      return tuple([
+          _TensorDtype(x.values.dtype, x.values._shape_tuple()),  # pylint: disable=protected-access
+          _TensorDtype(x.indices.dtype, x.indices._shape_tuple()),  # pylint: disable=protected-access
+          _TensorDtype(x.dense_shape.dtype, x.dense_shape._shape_tuple())  # pylint: disable=protected-access
+      ])
+    else:
+      return tuple([
+          _TensorDtype(x.values.dtype, x.values._shape_tuple()),  # pylint: disable=protected-access
+          _TensorDtype(x.indices.dtype, x.indices._shape_tuple())  # pylint: disable=protected-access
+      ])
   if isinstance(x, np.ndarray):
     return ("array", x.shape, tuple(x.reshape(-1)))
   if isinstance(x, (list, tuple)):
diff --git a/tensorflow/python/eager/function_test.py b/tensorflow/python/eager/function_test.py
index 0babc29f17b21ee663cdd5bd170875247353e70b..3e8e67ac7e242887e1c4f7d89a2e2bc395db22fe 100644
--- a/tensorflow/python/eager/function_test.py
+++ b/tensorflow/python/eager/function_test.py
@@ -374,6 +374,78 @@ class FunctionTest(test.TestCase):
 
     self.assertAllEqual(f(constant_op.constant(1.0)), 2.0)
 
+  def testGradientOfGatherWithDefun(self):
+
+    v = resource_variable_ops.ResourceVariable([0.0, 1.0, 2.0])
+
+    def sum_gather():
+      return math_ops.reduce_sum(array_ops.gather(v, [1, 2]))
+
+    grad_fn = backprop.implicit_grad(sum_gather)
+    gradient = grad_fn()
+    defun_grad_fn = backprop.implicit_grad(function.defun(sum_gather))
+    defun_gradient = defun_grad_fn()
+    self.assertEqual(len(gradient), len(defun_gradient))
+
+    gradient = gradient[0][0]
+    defun_gradient = defun_gradient[0][0]
+    self.assertAllEqual(gradient.values, defun_gradient.values)
+    self.assertAllEqual(gradient.indices, defun_gradient.indices)
+    self.assertAllEqual(gradient.dense_shape, defun_gradient.dense_shape)
+
+  def testReturningIndexedSlicesWithDefun(self):
+
+    def validate(indexed_slice):
+      def f():
+        return indexed_slice
+
+      output = function.defun(f)()
+      self.assertTrue(isinstance(output, ops.IndexedSlices))
+      self.assertAllEqual(indexed_slice.values, output.values)
+      self.assertAllEqual(indexed_slice.indices, output.indices)
+      self.assertAllEqual(indexed_slice.dense_shape, output.dense_shape)
+
+      self.assertEqual(
+          function.make_defun_op(f).output_shapes, indexed_slice.values.shape)
+
+    arg = ops.IndexedSlices(
+        values=constant_op.constant([1, 2]),
+        indices=constant_op.constant([0, 1]),
+        dense_shape=constant_op.constant([2]))
+    validate(arg)
+
+    arg = ops.IndexedSlices(
+        values=constant_op.constant([1, 2]),
+        indices=constant_op.constant([0, 1]),
+        dense_shape=None)
+    validate(arg)
+
+  def testIndexedSliceAsArgumentWithDefun(self):
+
+    @function.defun
+    def f(indexed_slice):
+      return indexed_slice
+
+    def validate(arg):
+      output = f(arg)
+      self.assertTrue(isinstance(output, ops.IndexedSlices))
+      self.assertAllEqual(arg.values, output.values)
+      self.assertAllEqual(arg.indices, output.indices)
+      self.assertAllEqual(arg.dense_shape, output.dense_shape)
+
+    indexed_slice = ops.IndexedSlices(
+        values=constant_op.constant([1]),
+        indices=constant_op.constant([0]),
+        dense_shape=constant_op.constant([1]))
+    validate(indexed_slice)
+
+    # Test that `f` works even when `dense_shape` is None.
+    indexed_slice = ops.IndexedSlices(
+        values=constant_op.constant([1]),
+        indices=constant_op.constant([0]),
+        dense_shape=None)
+    validate(indexed_slice)
+
   def testFunctionOnDevice(self):
     if not context.context().num_gpus():
       self.skipTest('No GPUs found')
@@ -504,6 +576,24 @@ class FunctionTest(test.TestCase):
     self.assertAllEqual(ret[0][2], 10)
     self.assertAllEqual(ret[1], 15)
 
+  def testVariableNamesRespectNameScopesWithDefun(self):
+    @function.defun
+    def create_variable():
+      with ops.name_scope('foo'):
+        v = resource_variable_ops.ResourceVariable(0.0, name='bar')
+      self.assertEqual(v.name, 'foo/bar:0')
+    create_variable()
+
+  def testVariableNamesRespectNameScopesWithDefunInGraph(self):
+    with context.graph_mode():
+      @function.defun
+      def create_variable():
+        with ops.name_scope('foo'):
+          v = resource_variable_ops.ResourceVariable([1.0, 2.0], name='bar')
+        self.assertEqual(v.name, 'foo/bar:0')
+      with ops.get_default_graph().as_default():
+        create_variable()
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/eager/gen_op.bzl b/tensorflow/python/eager/gen_op.bzl
deleted file mode 100644
index 8bc1d6c10a60b89a026cb34dbf6fd98d29e909c2..0000000000000000000000000000000000000000
--- a/tensorflow/python/eager/gen_op.bzl
+++ /dev/null
@@ -1,65 +0,0 @@
-"""For eager-mode Python."""
-
-load("//tensorflow:tensorflow.bzl",
-     "clean_dep",
-     "tf_binary_additional_srcs",
-     "tf_copts",
-     "tf_cc_binary")
-
-def tfe_gen_op_wrapper_py(name,
-                          out=None,
-                          visibility=None,
-                          deps=[],
-                          generated_target_name=None,
-                          # ApiDefs will be loaded in the order specified in this list.
-                          api_def_srcs=[]):
-  """Generate an eager-mode Python op wrapper for an op library."""
-  # Construct a cc_binary containing the specified ops.
-  tool_name = "gen_" + name + "_py_wrappers_cc"
-  if not deps:
-    deps = [str(Label("//tensorflow/core:" + name + "_op_lib"))]
-  tf_cc_binary(
-      name=tool_name,
-      linkopts=["-lm"],
-      copts=tf_copts(),
-      linkstatic=1,
-      deps=([
-          clean_dep("//tensorflow/python/eager:python_eager_op_gen_main")
-      ] + deps),
-      visibility=[clean_dep("//visibility:public")],)
-
-  # Invoke the previous cc_binary to generate a python file.
-  if not out:
-    out = "gen_" + name + ".py"
-
-  if not api_def_srcs:
-    api_def_args_str = ","
-  else:
-    api_def_args = []
-    for api_def_src in api_def_srcs:
-      # Add directory of the first ApiDef source to args.
-      # We are assuming all ApiDefs in a single api_def_src are in the
-      # same directory.
-      api_def_args.append(
-          "$$(dirname $$(echo $(locations " + api_def_src +
-          ") | cut -d\" \" -f1))")
-    api_def_args_str = ",".join(api_def_args)
-
-  native.genrule(
-      name=name + "_pygenrule",
-      outs=[out],
-      srcs=api_def_srcs,
-      tools=[tool_name] + tf_binary_additional_srcs(),
-      cmd=("$(location " + tool_name + ") " + api_def_args_str + " > $@"))
-
-  # Make a py_library out of the generated python file.
-  if not generated_target_name:
-    generated_target_name = name
-  native.py_library(
-      name=generated_target_name,
-      srcs=[out],
-      srcs_version="PY2AND3",
-      visibility=visibility,
-      deps=[
-          clean_dep("//tensorflow/python/eager:framework_for_generated_wrappers"),
-      ],)
diff --git a/tensorflow/python/eager/python_eager_op_gen_main.cc b/tensorflow/python/eager/python_eager_op_gen_main.cc
deleted file mode 100644
index 05351bd8b115ae07482b82166974e86758bc7712..0000000000000000000000000000000000000000
--- a/tensorflow/python/eager/python_eager_op_gen_main.cc
+++ /dev/null
@@ -1,72 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include "tensorflow/python/eager/python_eager_op_gen.h"
-
-#include <memory>
-#include <string>
-#include <vector>
-
-#include "tensorflow/core/framework/op.h"
-#include "tensorflow/core/framework/op_def.pb.h"
-#include "tensorflow/core/framework/op_gen_lib.h"
-#include "tensorflow/core/lib/io/path.h"
-#include "tensorflow/core/lib/strings/str_util.h"
-#include "tensorflow/core/platform/env.h"
-#include "tensorflow/core/platform/init_main.h"
-
-namespace tensorflow {
-namespace {
-
-void PrintAllPythonOps(const std::vector<string>& hidden_ops,
-                       const std::vector<string>& api_def_dirs) {
-  OpList ops;
-  OpRegistry::Global()->Export(false, &ops);
-
-  ApiDefMap api_def_map(ops);
-  if (!api_def_dirs.empty()) {
-    Env* env = Env::Default();
-
-    for (const auto& api_def_dir : api_def_dirs) {
-      std::vector<string> api_files;
-      TF_CHECK_OK(env->GetMatchingPaths(io::JoinPath(api_def_dir, "*.pbtxt"),
-                                        &api_files));
-      TF_CHECK_OK(api_def_map.LoadFileList(env, api_files));
-    }
-    api_def_map.UpdateDocs();
-  }
-
-  PrintEagerPythonOps(ops, api_def_map, hidden_ops, true /* require_shapes */);
-}
-
-}  // namespace
-}  // namespace tensorflow
-
-int main(int argc, char* argv[]) {
-  tensorflow::port::InitMain(argv[0], &argc, &argv);
-
-  // Usage:
-  //   python_eager_op_gen_main api_def_dir1,api_def_dir2,...
-  if (argc == 1) {
-    tensorflow::PrintAllPythonOps({}, {});
-  } else if (argc == 2) {
-    const std::vector<tensorflow::string> api_def_dirs =
-        tensorflow::str_util::Split(argv[1], ",",
-                                    tensorflow::str_util::SkipEmpty());
-    tensorflow::PrintAllPythonOps({}, api_def_dirs);
-  } else {
-    return -1;
-  }
-  return 0;
-}
diff --git a/tensorflow/python/eager/pywrap_tfe_src.cc b/tensorflow/python/eager/pywrap_tfe_src.cc
index 6162644036998bfaa97ac4a37680b661d844ff7a..d927f3abedb88deddabd4c4d931d12053005a3ff 100644
--- a/tensorflow/python/eager/pywrap_tfe_src.cc
+++ b/tensorflow/python/eager/pywrap_tfe_src.cc
@@ -86,30 +86,6 @@ bool ParseBoolValue(const string& key, PyObject* py_value, TF_Status* status,
   return true;
 }
 
-const char* ParseProtoValue(const string& key, const char* proto_name,
-                            PyObject* py_value, size_t* size,
-                            TF_Status* status) {
-  char* output = nullptr;
-  Py_ssize_t py_size;
-  if (PyBytes_Check(py_value) &&
-      PyBytes_AsStringAndSize(py_value, &output, &py_size) >= 0) {
-    *size = static_cast<size_t>(py_size);
-    return output;
-  }
-#if PY_MAJOR_VERSION >= 3
-  if (PyUnicode_Check(py_value) &&
-      (output = PyUnicode_AsUTF8AndSize(py_value, &py_size)) != nullptr) {
-    *size = static_cast<size_t>(py_size);
-    return output;
-  }
-#endif
-  TF_SetStatus(status, TF_INVALID_ARGUMENT,
-               tensorflow::strings::StrCat("Expecting a string (serialized ",
-                                           proto_name, ") value for attr ", key)
-                   .c_str());
-  return nullptr;
-}
-
 bool SetOpAttrList(TFE_Op* op, const char* key, PyObject* py_list,
                    TF_AttrType type, TF_Status* status) {
   if (!PySequence_Check(py_list)) {
@@ -329,8 +305,9 @@ void SetOpAttrs(TFE_Context* ctx, TFE_Op* op, PyObject* attrs, int start_index,
 tensorflow::mutex exception_class_mutex(tensorflow::LINKER_INITIALIZED);
 PyObject* exception_class GUARDED_BY(exception_class_mutex) = nullptr;
 
-static tensorflow::mutex _uid_mutex(tensorflow::LINKER_INITIALIZED);
-static tensorflow::int64 _uid GUARDED_BY(_uid_mutex) = 0;
+tensorflow::mutex _uid_mutex(tensorflow::LINKER_INITIALIZED);
+tensorflow::int64 _uid GUARDED_BY(_uid_mutex) = 0;
+
 }  // namespace
 
 void TFE_Py_Execute(TFE_Context* ctx, const char* device_name,
@@ -551,6 +528,34 @@ tensorflow::gtl::CompactPointerSet<TFE_Py_Tape*>* GetTapeSet() {
   return tape_set;
 }
 
+// A safe copy of the current tapeset. Does not get affected by other python
+// threads changing the set of active tapes.
+class SafeTapeSet {
+ public:
+  SafeTapeSet() : tape_set_(*GetTapeSet()) {
+    for (auto* tape : tape_set_) {
+      Py_INCREF(tape);
+    }
+  }
+
+  ~SafeTapeSet() {
+    for (auto* tape : tape_set_) {
+      Py_DECREF(tape);
+    }
+  }
+
+  tensorflow::gtl::CompactPointerSet<TFE_Py_Tape*>::const_iterator begin() {
+    return tape_set_.begin();
+  }
+
+  tensorflow::gtl::CompactPointerSet<TFE_Py_Tape*>::const_iterator end() {
+    return tape_set_.end();
+  }
+
+ private:
+  tensorflow::gtl::CompactPointerSet<TFE_Py_Tape*> tape_set_;
+};
+
 // xcode 7 doesn't define thread_local, so for compatibility we implement our
 // own. TODO(apassos) remove once we can deprecate xcode 7.
 #ifndef __APPLE__
@@ -741,10 +746,7 @@ void TFE_Py_TapeSetWatchVariable(PyObject* variable) {
   if (*ThreadTapeIsStopped()) {
     return;
   }
-  // Note: making a copy because watching a variable can trigger a change to the
-  // set of tapes by allowing python's garbage collector to run.
-  auto tape_set = *GetTapeSet();
-  for (TFE_Py_Tape* tape : tape_set) {
+  for (TFE_Py_Tape* tape : SafeTapeSet()) {
     tape->tape->WatchVariable(variable);
   }
 }
@@ -766,6 +768,9 @@ void TFE_Py_TapeSetRecordOperation(PyObject* op_type, PyObject* output_tensors,
     return;
   }
   std::vector<tensorflow::int64> input_ids = MakeTensorIDList(input_tensors);
+  if (PyErr_Occurred()) {
+    return;
+  }
   std::vector<tensorflow::eager::TapeTensor> output_info;
   PyObject* seq = PySequence_Fast(output_tensors,
                                   "expected a sequence of integer tensor ids");
@@ -797,8 +802,7 @@ void TFE_Py_TapeSetRecordOperation(PyObject* op_type, PyObject* output_tensors,
     return;
   }
 
-  auto set = *GetTapeSet();
-  for (TFE_Py_Tape* tape : set) {
+  for (TFE_Py_Tape* tape : SafeTapeSet()) {
     Py_INCREF(backward_function);
     tape->tape->RecordOperation(
         op_type_str, output_info, input_ids, backward_function,
@@ -807,10 +811,7 @@ void TFE_Py_TapeSetRecordOperation(PyObject* op_type, PyObject* output_tensors,
 }
 
 void TFE_Py_TapeSetDeleteTrace(tensorflow::int64 tensor_id) {
-  // Note: making a copy because deleting the trace can trigger a change to the
-  // set of tapes by allowing python's garbage collector to run.
-  auto tape_set = *GetTapeSet();
-  for (TFE_Py_Tape* tape : tape_set) {
+  for (TFE_Py_Tape* tape : SafeTapeSet()) {
     tape->tape->DeleteTrace(tensor_id);
   }
 }
diff --git a/tensorflow/python/estimator/BUILD b/tensorflow/python/estimator/BUILD
index 41f55b12af893e3207ad1ffa45098d12b1c4fff6..c519fd557a9319d6ef5522b26198e5b4202917fc 100644
--- a/tensorflow/python/estimator/BUILD
+++ b/tensorflow/python/estimator/BUILD
@@ -604,6 +604,7 @@ py_library(
         ":metric_keys",
         ":model_fn",
         ":prediction_keys",
+        ":util",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:check_ops",
         "//tensorflow/python:control_flow_ops",
diff --git a/tensorflow/python/estimator/canned/baseline.py b/tensorflow/python/estimator/canned/baseline.py
index 96e4ecd29fbcd4f4335077e9f81c5704ae2b9bec..138152ac1c6b2d7e399218208dd7bdf2d8136f5e 100644
--- a/tensorflow/python/estimator/canned/baseline.py
+++ b/tensorflow/python/estimator/canned/baseline.py
@@ -57,6 +57,7 @@ from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops.losses import losses
 from tensorflow.python.training import training_util
 
 # The default learning rate of 0.3 is a historical artifact of the initial
@@ -220,7 +221,8 @@ class BaselineClassifier(estimator.Estimator):
                weight_column=None,
                label_vocabulary=None,
                optimizer='Ftrl',
-               config=None):
+               config=None,
+               loss_reduction=losses.Reduction.SUM):
     """Initializes a BaselineClassifier instance.
 
     Args:
@@ -240,6 +242,8 @@ class BaselineClassifier(estimator.Estimator):
         optimizer to use for training. If not specified, will use
         `FtrlOptimizer` with a default learning rate of 0.3.
       config: `RunConfig` object to configure the runtime settings.
+      loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how
+        to reduce training loss over batch. Defaults to `SUM`.
     Returns:
       A `BaselineClassifier` estimator.
 
@@ -249,11 +253,13 @@ class BaselineClassifier(estimator.Estimator):
     if n_classes == 2:
       head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(  # pylint: disable=protected-access
           weight_column=weight_column,
-          label_vocabulary=label_vocabulary)
+          label_vocabulary=label_vocabulary,
+          loss_reduction=loss_reduction)
     else:
       head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(  # pylint: disable=protected-access
           n_classes, weight_column=weight_column,
-          label_vocabulary=label_vocabulary)
+          label_vocabulary=label_vocabulary,
+          loss_reduction=loss_reduction)
     def _model_fn(features, labels, mode, config):
       return _baseline_model_fn(
           features=features,
@@ -311,7 +317,8 @@ class BaselineRegressor(estimator.Estimator):
                label_dimension=1,
                weight_column=None,
                optimizer='Ftrl',
-               config=None):
+               config=None,
+               loss_reduction=losses.Reduction.SUM):
     """Initializes a BaselineRegressor instance.
 
     Args:
@@ -328,13 +335,16 @@ class BaselineRegressor(estimator.Estimator):
         optimizer to use for training. If not specified, will use
         `FtrlOptimizer` with a default learning rate of 0.3.
       config: `RunConfig` object to configure the runtime settings.
+      loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how
+        to reduce training loss over batch. Defaults to `SUM`.
     Returns:
       A `BaselineRegressor` estimator.
     """
 
     head = head_lib._regression_head_with_mean_squared_error_loss(  # pylint: disable=protected-access
         label_dimension=label_dimension,
-        weight_column=weight_column)
+        weight_column=weight_column,
+        loss_reduction=loss_reduction)
     def _model_fn(features, labels, mode, config):
       return _baseline_model_fn(
           features=features,
diff --git a/tensorflow/python/estimator/canned/dnn.py b/tensorflow/python/estimator/canned/dnn.py
index ba96d738ae6c0bf6c880b66d4dc72680be5c7e91..0f274a23c03426fc431c15ac0a14617a4a65bb79 100644
--- a/tensorflow/python/estimator/canned/dnn.py
+++ b/tensorflow/python/estimator/canned/dnn.py
@@ -22,7 +22,6 @@ import six
 
 from tensorflow.python.estimator import estimator
 from tensorflow.python.estimator import model_fn
-from tensorflow.python.estimator import warm_starting_util
 from tensorflow.python.estimator.canned import head as head_lib
 from tensorflow.python.estimator.canned import optimizers
 from tensorflow.python.feature_column import feature_column as feature_column_lib
@@ -340,8 +339,8 @@ class DNNClassifier(estimator.Estimator):
           loss_reduction=loss_reduction)
 
     def _model_fn(features, labels, mode, config):
-      """Call the defined shared _dnn_model_fn and possibly warm-start."""
-      estimator_spec = _dnn_model_fn(
+      """Call the defined shared _dnn_model_fn."""
+      return _dnn_model_fn(
           features=features,
           labels=labels,
           mode=mode,
@@ -353,17 +352,10 @@ class DNNClassifier(estimator.Estimator):
           dropout=dropout,
           input_layer_partitioner=input_layer_partitioner,
           config=config)
-      # pylint: disable=protected-access
-      warm_start_settings = warm_starting_util._get_default_warm_start_settings(
-          warm_start_from)
-      if warm_start_settings:
-        warm_starting_util._warm_start(warm_start_settings)
-      # pylint: enable=protected-access
-
-      return estimator_spec
 
     super(DNNClassifier, self).__init__(
-        model_fn=_model_fn, model_dir=model_dir, config=config)
+        model_fn=_model_fn, model_dir=model_dir, config=config,
+        warm_start_from=warm_start_from)
 
 
 class DNNRegressor(estimator.Estimator):
@@ -490,8 +482,8 @@ class DNNRegressor(estimator.Estimator):
     """
 
     def _model_fn(features, labels, mode, config):
-      """Call the defined shared _dnn_model_fn and possibly warm-start."""
-      estimator_spec = _dnn_model_fn(
+      """Call the defined shared _dnn_model_fn."""
+      return _dnn_model_fn(
           features=features,
           labels=labels,
           mode=mode,
@@ -506,14 +498,7 @@ class DNNRegressor(estimator.Estimator):
           dropout=dropout,
           input_layer_partitioner=input_layer_partitioner,
           config=config)
-      # pylint: disable=protected-access
-      warm_start_settings = warm_starting_util._get_default_warm_start_settings(
-          warm_start_from)
-      if warm_start_settings:
-        warm_starting_util._warm_start(warm_start_settings)
-      # pylint: enable=protected-access
-
-      return estimator_spec
 
     super(DNNRegressor, self).__init__(
-        model_fn=_model_fn, model_dir=model_dir, config=config)
+        model_fn=_model_fn, model_dir=model_dir, config=config,
+        warm_start_from=warm_start_from)
diff --git a/tensorflow/python/estimator/canned/dnn_linear_combined.py b/tensorflow/python/estimator/canned/dnn_linear_combined.py
index d29c892662adcb7df647243f572263ae24b014ee..1a0f4c5c3931a6b41026470f30e7bdd381e5b37a 100644
--- a/tensorflow/python/estimator/canned/dnn_linear_combined.py
+++ b/tensorflow/python/estimator/canned/dnn_linear_combined.py
@@ -23,7 +23,6 @@ import math
 import six
 
 from tensorflow.python.estimator import estimator
-from tensorflow.python.estimator import warm_starting_util
 from tensorflow.python.estimator.canned import dnn
 from tensorflow.python.estimator.canned import head as head_lib
 from tensorflow.python.estimator.canned import linear
@@ -385,8 +384,8 @@ class DNNLinearCombinedClassifier(estimator.Estimator):
           loss_reduction=loss_reduction)
 
     def _model_fn(features, labels, mode, config):
-      """Call the _dnn_linear_combined_model_fn and possibly warm-start."""
-      estimator_spec = _dnn_linear_combined_model_fn(
+      """Call the _dnn_linear_combined_model_fn."""
+      return _dnn_linear_combined_model_fn(
           features=features,
           labels=labels,
           mode=mode,
@@ -400,17 +399,10 @@ class DNNLinearCombinedClassifier(estimator.Estimator):
           dnn_dropout=dnn_dropout,
           input_layer_partitioner=input_layer_partitioner,
           config=config)
-      # pylint: disable=protected-access
-      warm_start_settings = warm_starting_util._get_default_warm_start_settings(
-          warm_start_from)
-      if warm_start_settings:
-        warm_starting_util._warm_start(warm_start_settings)
-      # pylint: enable=protected-access
-
-      return estimator_spec
 
     super(DNNLinearCombinedClassifier, self).__init__(
-        model_fn=_model_fn, model_dir=model_dir, config=config)
+        model_fn=_model_fn, model_dir=model_dir, config=config,
+        warm_start_from=warm_start_from)
 
 
 class DNNLinearCombinedRegressor(estimator.Estimator):
@@ -554,8 +546,8 @@ class DNNLinearCombinedRegressor(estimator.Estimator):
                        'must be defined.')
 
     def _model_fn(features, labels, mode, config):
-      """Call the _dnn_linear_combined_model_fn and possibly warm-start."""
-      estimator_spec = _dnn_linear_combined_model_fn(
+      """Call the _dnn_linear_combined_model_fn."""
+      return _dnn_linear_combined_model_fn(
           features=features,
           labels=labels,
           mode=mode,
@@ -572,14 +564,7 @@ class DNNLinearCombinedRegressor(estimator.Estimator):
           dnn_dropout=dnn_dropout,
           input_layer_partitioner=input_layer_partitioner,
           config=config)
-      # pylint: disable=protected-access
-      warm_start_settings = warm_starting_util._get_default_warm_start_settings(
-          warm_start_from)
-      if warm_start_settings:
-        warm_starting_util._warm_start(warm_start_settings)
-      # pylint: enable=protected-access
-
-      return estimator_spec
 
     super(DNNLinearCombinedRegressor, self).__init__(
-        model_fn=_model_fn, model_dir=model_dir, config=config)
+        model_fn=_model_fn, model_dir=model_dir, config=config,
+        warm_start_from=warm_start_from)
diff --git a/tensorflow/python/estimator/canned/head.py b/tensorflow/python/estimator/canned/head.py
index 94a5d3a342dd7bad49d5fb4b91166c67a2705ff3..cb9e3fc6ca116ac0f48a37cea92fa4119754f324 100644
--- a/tensorflow/python/estimator/canned/head.py
+++ b/tensorflow/python/estimator/canned/head.py
@@ -24,6 +24,7 @@ import collections
 import six
 
 from tensorflow.python.estimator import model_fn
+from tensorflow.python.estimator import util
 from tensorflow.python.estimator.canned import metric_keys
 from tensorflow.python.estimator.canned import prediction_keys
 from tensorflow.python.estimator.export import export_output
@@ -371,6 +372,64 @@ def _check_logits_final_dim(logits, expected_logits_dimension):
         return array_ops.identity(logits, name=scope)
 
 
+def _validate_loss_fn_args(loss_fn):
+  """Validates loss_fn arguments.
+
+  Required arguments: labels, logits.
+  Optional arguments: features.
+
+  Args:
+    loss_fn: The loss function.
+  Raises:
+    ValueError: If the signature is unexpected.
+  """
+  loss_fn_args = util.fn_args(loss_fn)
+  for required_arg in ['labels', 'logits']:
+    if required_arg not in loss_fn_args:
+      raise ValueError(
+          'loss_fn must contain argument: {}. '
+          'Given arguments: {}'.format(required_arg, loss_fn_args))
+  invalid_args = list(set(loss_fn_args) - set(['labels', 'logits', 'features']))
+  if invalid_args:
+    raise ValueError('loss_fn has unexpected args: {}'.format(invalid_args))
+
+
+def _call_loss_fn(loss_fn, labels, logits, features, expected_loss_dim=1):
+  """Calls loss_fn and checks the returned shape.
+
+  Args:
+    loss_fn: The loss function.
+    labels: Processed labels Tensor.
+    logits: Logits Tensor of shape [D0, D1, ... DN, logits_dimension].
+    features: Features dict.
+    expected_loss_dim: The expected last dimension of loss Tensor.
+  Returns:
+    Loss Tensor with shape [D0, D1, ... DN, expected_loss_dim].
+  """
+  loss_fn_args = util.fn_args(loss_fn)
+  kwargs = {}
+  if 'features' in loss_fn_args:
+    kwargs['features'] = features
+  with ops.name_scope(
+      None, 'call_loss_fn',
+      values=[labels, logits] + list(six.itervalues(features))):
+    unweighted_loss = loss_fn(labels=labels, logits=logits, **kwargs)
+    logits_shape = array_ops.shape(logits, name='logits_shape')
+    expected_loss_shape = array_ops.concat(
+        [logits_shape[:-1], [expected_loss_dim]], axis=0,
+        name='expected_loss_shape')
+    loss_shape = array_ops.shape(unweighted_loss, name='loss_shape')
+    check_loss_shape_op = control_flow_ops.Assert(
+        math_ops.reduce_all(math_ops.equal(loss_shape, expected_loss_shape)),
+        data=[
+            'loss_fn must return Tensor of shape '
+            '[D0, D1, ... DN, {}]. '.format(expected_loss_dim),
+            'logits_shape: ', logits_shape, 'loss_shape: ', loss_shape],
+        name='check_loss_shape')
+    with ops.control_dependencies([check_loss_shape_op]):
+      return array_ops.identity(unweighted_loss)
+
+
 def _indicator_labels_mean(labels, weights=None, name=None):
   with ops.name_scope(name, 'labels_mean', (labels, weights)) as scope:
     labels = math_ops.to_float(labels, name='labels')
@@ -467,6 +526,7 @@ def _multi_class_head_with_softmax_cross_entropy_loss(
     weight_column=None,
     label_vocabulary=None,
     loss_reduction=losses.Reduction.SUM,
+    loss_fn=None,
     name=None):
   """Creates a '_Head' for multi class classification.
 
@@ -485,6 +545,12 @@ def _multi_class_head_with_softmax_cross_entropy_loss(
   labels have shape `[batch_size, 1]`, the loss is the weighted sum over
   `batch_size`.
 
+  Also supports custom `loss_fn`. `loss_fn` takes `(labels, logits)` or
+  `(labels, logits, features)` as arguments and returns unreduced loss with
+  shape `[D0, D1, ... DN, 1]`. `loss_fn` must support integer `labels` with
+  shape `[D0, D1, ... DN, 1]`. Namely, the head applies `label_vocabulary` to
+  the input labels before passing them to `loss_fn`.
+
   Args:
     n_classes: Number of classes, must be greater than 2 (for 2 classes, use
       `_BinaryLogisticHeadWithSigmoidCrossEntropyLoss`).
@@ -499,6 +565,7 @@ def _multi_class_head_with_softmax_cross_entropy_loss(
       `label_vocabulary` is not provided but labels are strings.
     loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how to
       reduce training loss over batch. Defaults to `SUM`.
+    loss_fn: Optional loss function.
     name: name of the head. If provided, summary and metrics keys will be
       suffixed by `"/" + name`. Also used as `name_scope` when creating ops.
 
@@ -517,11 +584,14 @@ def _multi_class_head_with_softmax_cross_entropy_loss(
   if (loss_reduction not in losses.Reduction.all() or
       loss_reduction == losses.Reduction.NONE):
     raise ValueError('Invalid loss_reduction: {}'.format(loss_reduction))
+  if loss_fn:
+    _validate_loss_fn_args(loss_fn)
   return _MultiClassHeadWithSoftmaxCrossEntropyLoss(
       n_classes=n_classes,
       weight_column=weight_column,
       label_vocabulary=label_vocabulary,
       loss_reduction=loss_reduction,
+      loss_fn=loss_fn,
       name=name)
 
 
@@ -533,6 +603,7 @@ class _MultiClassHeadWithSoftmaxCrossEntropyLoss(_Head):
                weight_column=None,
                label_vocabulary=None,
                loss_reduction=losses.Reduction.SUM,
+               loss_fn=None,
                name=None):
     if (n_classes is None) or (n_classes <= 2):
       raise ValueError('n_classes must be > 2: %s.' % n_classes)
@@ -540,6 +611,7 @@ class _MultiClassHeadWithSoftmaxCrossEntropyLoss(_Head):
     self._weight_column = weight_column
     self._label_vocabulary = label_vocabulary
     self._loss_reduction = loss_reduction
+    self._loss_fn = loss_fn
     self._name = name
 
   @property
@@ -602,10 +674,15 @@ class _MultiClassHeadWithSoftmaxCrossEntropyLoss(_Head):
     labels = _check_dense_labels_match_logits_and_reshape(
         labels=labels, logits=logits, expected_labels_dimension=1)
     label_ids = self._label_ids(labels)
-    unweighted_loss = losses.sparse_softmax_cross_entropy(
-        labels=label_ids, logits=logits, reduction=losses.Reduction.NONE)
-    # Restore the squeezed dim, so unweighted_loss matches the weights shape.
-    unweighted_loss = array_ops.expand_dims(unweighted_loss, axis=-1)
+    if self._loss_fn:
+      unweighted_loss = _call_loss_fn(
+          loss_fn=self._loss_fn, labels=label_ids, logits=logits,
+          features=features, expected_loss_dim=1)
+    else:
+      unweighted_loss = losses.sparse_softmax_cross_entropy(
+          labels=label_ids, logits=logits, reduction=losses.Reduction.NONE)
+      # Restore the squeezed dim, so unweighted_loss matches the weights shape.
+      unweighted_loss = array_ops.expand_dims(unweighted_loss, axis=-1)
     weights = _get_weights_and_check_match_logits(
         features=features, weight_column=self._weight_column, logits=logits)
     training_loss = losses.compute_weighted_loss(
@@ -734,8 +811,12 @@ class _MultiClassHeadWithSoftmaxCrossEntropyLoss(_Head):
 
 
 def _binary_logistic_head_with_sigmoid_cross_entropy_loss(
-    weight_column=None, thresholds=None, label_vocabulary=None,
-    loss_reduction=losses.Reduction.SUM, name=None):
+    weight_column=None,
+    thresholds=None,
+    label_vocabulary=None,
+    loss_reduction=losses.Reduction.SUM,
+    loss_fn=None,
+    name=None):
   """Creates a `_Head` for single label binary classification.
 
   This head uses `sigmoid_cross_entropy_with_logits` loss.
@@ -755,6 +836,12 @@ def _binary_logistic_head_with_sigmoid_cross_entropy_loss(
   labels have shape `[batch_size, 1]`, the loss is the weighted sum over
   `batch_size`.
 
+  Also supports custom `loss_fn`. `loss_fn` takes `(labels, logits)` or
+  `(labels, logits, features)` as arguments and returns unreduced loss with
+  shape `[D0, D1, ... DN, 1]`. `loss_fn` must support float `labels` with
+  shape `[D0, D1, ... DN, 1]`. Namely, the head applies `label_vocabulary` to
+  the input labels before passing them to `loss_fn`.
+
   Args:
     weight_column: A string or a `_NumericColumn` created by
       `tf.feature_column.numeric_column` defining feature column representing
@@ -772,6 +859,7 @@ def _binary_logistic_head_with_sigmoid_cross_entropy_loss(
       is not provided but labels are strings.
     loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how to
       reduce training loss over batch. Defaults to `SUM`.
+    loss_fn: Optional loss function.
     name: name of the head. If provided, summary and metrics keys will be
       suffixed by `"/" + name`. Also used as `name_scope` when creating ops.
 
@@ -795,11 +883,14 @@ def _binary_logistic_head_with_sigmoid_cross_entropy_loss(
   if (loss_reduction not in losses.Reduction.all() or
       loss_reduction == losses.Reduction.NONE):
     raise ValueError('Invalid loss_reduction: {}'.format(loss_reduction))
+  if loss_fn:
+    _validate_loss_fn_args(loss_fn)
   return _BinaryLogisticHeadWithSigmoidCrossEntropyLoss(
       weight_column=weight_column,
       thresholds=thresholds,
       label_vocabulary=label_vocabulary,
       loss_reduction=loss_reduction,
+      loss_fn=loss_fn,
       name=name)
 
 
@@ -811,11 +902,13 @@ class _BinaryLogisticHeadWithSigmoidCrossEntropyLoss(_Head):
                thresholds=None,
                label_vocabulary=None,
                loss_reduction=losses.Reduction.SUM,
+               loss_fn=None,
                name=None):
     self._weight_column = weight_column
     self._thresholds = thresholds
     self._label_vocabulary = label_vocabulary
     self._loss_reduction = loss_reduction
+    self._loss_fn = loss_fn
     self._name = name
 
   @property
@@ -916,8 +1009,13 @@ class _BinaryLogisticHeadWithSigmoidCrossEntropyLoss(_Head):
           name='class_id_lookup').lookup(labels)
     labels = math_ops.to_float(labels)
     labels = _assert_range(labels, 2)
-    unweighted_loss = nn.sigmoid_cross_entropy_with_logits(
-        labels=labels, logits=logits)
+    if self._loss_fn:
+      unweighted_loss = _call_loss_fn(
+          loss_fn=self._loss_fn, labels=labels, logits=logits,
+          features=features, expected_loss_dim=1)
+    else:
+      unweighted_loss = nn.sigmoid_cross_entropy_with_logits(
+          labels=labels, logits=logits)
     weights = _get_weights_and_check_match_logits(
         features=features, weight_column=self._weight_column, logits=logits)
     training_loss = losses.compute_weighted_loss(
@@ -1057,6 +1155,7 @@ def _regression_head_with_mean_squared_error_loss(
     weight_column=None,
     label_dimension=1,
     loss_reduction=losses.Reduction.SUM,
+    loss_fn=None,
     name=None):
   """Creates a `_Head` for regression using the `mean_squared_error` loss.
 
@@ -1075,6 +1174,10 @@ def _regression_head_with_mean_squared_error_loss(
   `[D0, D1, ... DN]`, `[D0, D1, ... DN, 1]` or
   `[D0, D1, ... DN, label_dimension]`.
 
+  Also supports custom `loss_fn`. `loss_fn` takes `(labels, logits)` or
+  `(labels, logits, features)` as arguments and returns unreduced loss with
+  shape `[D0, D1, ... DN, label_dimension]`.
+
   Args:
     weight_column: A string or a `_NumericColumn` created by
       `tf.feature_column.numeric_column` defining feature column representing
@@ -1085,6 +1188,7 @@ def _regression_head_with_mean_squared_error_loss(
       `[batch_size, label_dimension]`).
     loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how to
       reduce training loss over batch. Defaults to `SUM`.
+    loss_fn: Optional loss function.
     name: name of the head. If provided, summary and metrics keys will be
       suffixed by `"/" + name`. Also used as `name_scope` when creating ops.
 
@@ -1097,10 +1201,13 @@ def _regression_head_with_mean_squared_error_loss(
   if (loss_reduction not in losses.Reduction.all() or
       loss_reduction == losses.Reduction.NONE):
     raise ValueError('Invalid loss_reduction: {}'.format(loss_reduction))
+  if loss_fn:
+    _validate_loss_fn_args(loss_fn)
   return _RegressionHeadWithMeanSquaredErrorLoss(
       weight_column=weight_column,
       label_dimension=label_dimension,
       loss_reduction=loss_reduction,
+      loss_fn=loss_fn,
       name=name)
 
 
@@ -1112,6 +1219,7 @@ class _RegressionHeadWithMeanSquaredErrorLoss(_Head):
       label_dimension,
       weight_column=None,
       loss_reduction=losses.Reduction.SUM,
+      loss_fn=None,
       name=None):
     """`Head` for regression."""
     if label_dimension < 1:
@@ -1119,6 +1227,7 @@ class _RegressionHeadWithMeanSquaredErrorLoss(_Head):
     self._logits_dimension = label_dimension
     self._weight_column = weight_column
     self._loss_reduction = loss_reduction
+    self._loss_fn = loss_fn
     self._name = name
 
   @property
@@ -1137,8 +1246,13 @@ class _RegressionHeadWithMeanSquaredErrorLoss(_Head):
         labels=labels, logits=logits,
         expected_labels_dimension=self._logits_dimension)
     labels = math_ops.to_float(labels)
-    unweighted_loss = losses.mean_squared_error(
-        labels=labels, predictions=logits, reduction=losses.Reduction.NONE)
+    if self._loss_fn:
+      unweighted_loss = _call_loss_fn(
+          loss_fn=self._loss_fn, labels=labels, logits=logits,
+          features=features, expected_loss_dim=self._logits_dimension)
+    else:
+      unweighted_loss = losses.mean_squared_error(
+          labels=labels, predictions=logits, reduction=losses.Reduction.NONE)
     weights = _get_weights_and_check_match_logits(
         features=features, weight_column=self._weight_column, logits=logits,
         allow_per_logit_weights=True)
diff --git a/tensorflow/python/estimator/canned/head_test.py b/tensorflow/python/estimator/canned/head_test.py
index 4e871e8f375f346bfd1b0be2cade97c34871f31c..3a03770af498981a054c3df9155e83a60c7f0350 100644
--- a/tensorflow/python/estimator/canned/head_test.py
+++ b/tensorflow/python/estimator/canned/head_test.py
@@ -111,6 +111,41 @@ class MultiClassHeadWithSoftmaxCrossEntropyLoss(test.TestCase):
       head_lib._multi_class_head_with_softmax_cross_entropy_loss(
           n_classes=3, loss_reduction=losses.Reduction.NONE)
 
+  def test_loss_fn_arg_labels_missing(self):
+    def _loss_fn(logits):
+      del logits  # Unused
+    with self.assertRaisesRegexp(
+        ValueError,
+        r'loss_fn must contain argument: labels\. '
+        r'Given arguments: \(\'logits\',\)'):
+      head_lib._multi_class_head_with_softmax_cross_entropy_loss(
+          n_classes=3, loss_fn=_loss_fn)
+
+  def test_loss_fn_arg_logits_missing(self):
+    def _loss_fn(labels):
+      del labels  # unused
+    with self.assertRaisesRegexp(
+        ValueError,
+        r'loss_fn must contain argument: logits\. '
+        r'Given arguments: \(\'labels\',\)'):
+      head_lib._multi_class_head_with_softmax_cross_entropy_loss(
+          n_classes=3, loss_fn=_loss_fn)
+
+  def test_loss_fn_arg_features_ok(self):
+    def _loss_fn(labels, logits, features):
+      del labels, logits, features  # Unused
+    head_lib._multi_class_head_with_softmax_cross_entropy_loss(
+        n_classes=3, loss_fn=_loss_fn)
+
+  def test_loss_fn_arg_invalid(self):
+    def _loss_fn(labels, logits, name=None):
+      del labels, logits, name  # Unused
+    with self.assertRaisesRegexp(
+        ValueError,
+        r'loss_fn has unexpected args: \[\'name\'\]'):
+      head_lib._multi_class_head_with_softmax_cross_entropy_loss(
+          n_classes=3, loss_fn=_loss_fn)
+
   def test_invalid_logits_shape(self):
     n_classes = 3
     head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(n_classes)
@@ -406,6 +441,56 @@ class MultiClassHeadWithSoftmaxCrossEntropyLoss(test.TestCase):
       self.assertAllClose(
           expected_training_loss, training_loss.eval(), rtol=1e-2, atol=1e-2)
 
+  def test_eval_create_loss_loss_fn(self):
+    """Tests head.create_loss for eval mode and custom loss_fn."""
+    loss = np.array([[1.], [2.]], dtype=np.float32)
+    logits_input = np.array([[-10., 10., 0.], [-15., 10., 0]], dtype=np.float32)
+    labels_input = np.array([[1], [2]], dtype=np.int64)
+    def _loss_fn(labels, logits):
+      check_labels = control_flow_ops.Assert(
+          math_ops.reduce_all(math_ops.equal(labels, labels_input)),
+          data=[labels])
+      check_logits = control_flow_ops.Assert(
+          math_ops.reduce_all(math_ops.equal(logits, logits_input)),
+          data=[logits])
+      with ops.control_dependencies([check_labels, check_logits]):
+        return constant_op.constant(loss)
+    head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(
+        n_classes=3, loss_fn=_loss_fn)
+
+    actual_training_loss = head.create_loss(
+        features={'x': np.array(((42,),), dtype=np.int32)},
+        mode=model_fn.ModeKeys.EVAL,
+        logits=logits_input,
+        labels=labels_input)[0]
+    with self.test_session():
+      _initialize_variables(self, monitored_session.Scaffold())
+      self.assertAllClose(np.sum(loss), actual_training_loss.eval())
+
+  def test_eval_create_loss_loss_fn_wrong_shape(self):
+    """Tests custom loss_fn that returns Tensor of unexpected shape."""
+    loss = np.array([1., 2.], dtype=np.float32)
+    def _loss_fn(labels, logits):
+      del labels, logits  # Unused
+      return constant_op.constant(loss)
+    head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(
+        n_classes=3, loss_fn=_loss_fn)
+
+    logits = np.array([[-10., 10., 0.], [-15., 10., 0.]], dtype=np.float32)
+    labels = np.array([[1], [2]], dtype=np.int64)
+    actual_training_loss = head.create_loss(
+        features={'x': np.array(((42,),), dtype=np.int32)},
+        mode=model_fn.ModeKeys.EVAL,
+        logits=logits,
+        labels=labels)[0]
+    with self.test_session():
+      _initialize_variables(self, monitored_session.Scaffold())
+      with self.assertRaisesRegexp(
+          errors.InvalidArgumentError,
+          r'\[loss_fn must return Tensor of shape \[D0, D1, ... DN, 1\]\. \] '
+          r'\[logits_shape: \] \[2 3\] \[loss_shape: \] \[2\]'):
+        actual_training_loss.eval()
+
   def test_eval_labels_none(self):
     """Tests that error is raised when labels is None."""
     head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(
@@ -1204,6 +1289,41 @@ class BinaryLogisticHeadWithSigmoidCrossEntropyLossTest(test.TestCase):
       head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
           loss_reduction=losses.Reduction.NONE)
 
+  def test_loss_fn_arg_labels_missing(self):
+    def _loss_fn(logits):
+      del logits  # Unused
+    with self.assertRaisesRegexp(
+        ValueError,
+        r'loss_fn must contain argument: labels\. '
+        r'Given arguments: \(\'logits\',\)'):
+      head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
+          loss_fn=_loss_fn)
+
+  def test_loss_fn_arg_logits_missing(self):
+    def _loss_fn(labels):
+      del labels  # unused
+    with self.assertRaisesRegexp(
+        ValueError,
+        r'loss_fn must contain argument: logits\. '
+        r'Given arguments: \(\'labels\',\)'):
+      head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
+          loss_fn=_loss_fn)
+
+  def test_loss_fn_arg_features_ok(self):
+    def _loss_fn(labels, logits, features):
+      del labels, logits, features  # Unused
+      head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
+          loss_fn=_loss_fn)
+
+  def test_loss_fn_arg_invalid(self):
+    def _loss_fn(labels, logits, name=None):
+      del labels, logits, name  # Unused
+    with self.assertRaisesRegexp(
+        ValueError,
+        r'loss_fn has unexpected args: \[\'name\'\]'):
+      head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
+          loss_fn=_loss_fn)
+
   def test_invalid_logits_shape(self):
     head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss()
     self.assertEqual(1, head.logits_dimension)
@@ -1699,6 +1819,56 @@ class BinaryLogisticHeadWithSigmoidCrossEntropyLossTest(test.TestCase):
       self.assertAllClose(expected_unreduced_loss, unreduced_loss.eval())
       self.assertAllClose(expected_weights, actual_weights)
 
+  def test_eval_create_loss_loss_fn(self):
+    """Tests head.create_loss for eval mode and custom loss_fn."""
+    loss = np.array([[1.], [2.]], dtype=np.float32)
+    logits_input = np.array([[-10.], [10.]], dtype=np.float32)
+    labels_input = np.array([[1], [0]], dtype=np.int64)
+    def _loss_fn(labels, logits):
+      check_labels = control_flow_ops.Assert(
+          math_ops.reduce_all(math_ops.equal(labels, labels_input)),
+          data=[labels])
+      check_logits = control_flow_ops.Assert(
+          math_ops.reduce_all(math_ops.equal(logits, logits_input)),
+          data=[logits])
+      with ops.control_dependencies([check_labels, check_logits]):
+        return constant_op.constant(loss)
+    head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
+        loss_fn=_loss_fn)
+
+    actual_training_loss = head.create_loss(
+        features={'x': np.array(((42,),), dtype=np.int32)},
+        mode=model_fn.ModeKeys.EVAL,
+        logits=logits_input,
+        labels=labels_input)[0]
+    with self.test_session():
+      _initialize_variables(self, monitored_session.Scaffold())
+      self.assertAllClose(np.sum(loss), actual_training_loss.eval())
+
+  def test_eval_create_loss_loss_fn_wrong_shape(self):
+    """Tests custom loss_fn that returns Tensor of unexpected shape."""
+    loss = np.array([1., 2.], dtype=np.float32)
+    def _loss_fn(labels, logits):
+      del labels, logits  # Unused
+      return constant_op.constant(loss)
+    head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
+        loss_fn=_loss_fn)
+
+    logits = np.array([[-10.], [10.]], dtype=np.float32)
+    labels = np.array([[1], [0]], dtype=np.int64)
+    actual_training_loss = head.create_loss(
+        features={'x': np.array(((42,),), dtype=np.int32)},
+        mode=model_fn.ModeKeys.EVAL,
+        logits=logits,
+        labels=labels)[0]
+    with self.test_session():
+      _initialize_variables(self, monitored_session.Scaffold())
+      with self.assertRaisesRegexp(
+          errors.InvalidArgumentError,
+          r'\[loss_fn must return Tensor of shape \[D0, D1, ... DN, 1\]\. \] '
+          r'\[logits_shape: \] \[2 1\] \[loss_shape: \] \[2\]'):
+        actual_training_loss.eval()
+
   def test_train_labels_none(self):
     """Tests that error is raised when labels is None."""
     head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss()
@@ -2355,6 +2525,37 @@ class RegressionHeadWithMeanSquaredErrorLossTest(test.TestCase):
       head_lib._regression_head_with_mean_squared_error_loss(
           loss_reduction=losses.Reduction.NONE)
 
+  def test_loss_fn_arg_labels_missing(self):
+    def _loss_fn(logits):
+      del logits  # Unused
+    with self.assertRaisesRegexp(
+        ValueError,
+        r'loss_fn must contain argument: labels\. '
+        r'Given arguments: \(\'logits\',\)'):
+      head_lib._regression_head_with_mean_squared_error_loss(loss_fn=_loss_fn)
+
+  def test_loss_fn_arg_logits_missing(self):
+    def _loss_fn(labels):
+      del labels  # unused
+    with self.assertRaisesRegexp(
+        ValueError,
+        r'loss_fn must contain argument: logits\. '
+        r'Given arguments: \(\'labels\',\)'):
+      head_lib._regression_head_with_mean_squared_error_loss(loss_fn=_loss_fn)
+
+  def test_loss_fn_arg_features_ok(self):
+    def _loss_fn(labels, logits, features):
+      del labels, logits, features  # Unused
+      head_lib._regression_head_with_mean_squared_error_loss(loss_fn=_loss_fn)
+
+  def test_loss_fn_arg_invalid(self):
+    def _loss_fn(labels, logits, name=None):
+      del labels, logits, name  # Unused
+    with self.assertRaisesRegexp(
+        ValueError,
+        r'loss_fn has unexpected args: \[\'name\'\]'):
+      head_lib._regression_head_with_mean_squared_error_loss(loss_fn=_loss_fn)
+
   def test_invalid_logits(self):
     head = head_lib._regression_head_with_mean_squared_error_loss(
         label_dimension=3)
@@ -2530,6 +2731,56 @@ class RegressionHeadWithMeanSquaredErrorLossTest(test.TestCase):
       # loss = [(43-45)^2, (44-41)] = [4, 9]
       self.assertAllClose(13., training_loss.eval())
 
+  def test_eval_create_loss_loss_fn(self):
+    """Tests head.create_loss for eval mode and custom loss_fn."""
+    loss = np.array([[0., 1.], [2., 3.]], dtype=np.float32)
+    logits_input = np.array([[-1., 1.], [-2., 2.]], dtype=np.float32)
+    labels_input = np.array([[1., 0.], [2., -1.]], dtype=np.float32)
+    def _loss_fn(labels, logits):
+      check_labels = control_flow_ops.Assert(
+          math_ops.reduce_all(math_ops.equal(labels, labels_input)),
+          data=[labels])
+      check_logits = control_flow_ops.Assert(
+          math_ops.reduce_all(math_ops.equal(logits, logits_input)),
+          data=[logits])
+      with ops.control_dependencies([check_labels, check_logits]):
+        return constant_op.constant(loss)
+    head = head_lib._regression_head_with_mean_squared_error_loss(
+        label_dimension=2, loss_fn=_loss_fn)
+
+    actual_training_loss = head.create_loss(
+        features={'x': np.array(((42,),), dtype=np.int32)},
+        mode=model_fn.ModeKeys.EVAL,
+        logits=logits_input,
+        labels=labels_input)[0]
+    with self.test_session():
+      _initialize_variables(self, monitored_session.Scaffold())
+      self.assertAllClose(np.sum(loss), actual_training_loss.eval())
+
+  def test_eval_create_loss_loss_fn_wrong_shape(self):
+    """Tests custom loss_fn that returns Tensor of unexpected shape."""
+    loss = np.array([[1.], [2.]], dtype=np.float32)
+    def _loss_fn(labels, logits):
+      del labels, logits  # Unused
+      return constant_op.constant(loss)
+    head = head_lib._regression_head_with_mean_squared_error_loss(
+        label_dimension=2, loss_fn=_loss_fn)
+
+    logits = np.array([[-1., 1.], [-2., 2.]], dtype=np.float32)
+    labels = np.array([[1., 0.], [2., -1.]], dtype=np.float32)
+    actual_training_loss = head.create_loss(
+        features={'x': np.array(((42,),), dtype=np.int32)},
+        mode=model_fn.ModeKeys.EVAL,
+        logits=logits,
+        labels=labels)[0]
+    with self.test_session():
+      _initialize_variables(self, monitored_session.Scaffold())
+      with self.assertRaisesRegexp(
+          errors.InvalidArgumentError,
+          r'\[loss_fn must return Tensor of shape \[D0, D1, ... DN, 2\]\. \] '
+          r'\[logits_shape: \] \[2 2\] \[loss_shape: \] \[2 1\]'):
+        actual_training_loss.eval()
+
   def test_eval_labels_none(self):
     """Tests that error is raised when labels is None."""
     head = head_lib._regression_head_with_mean_squared_error_loss()
diff --git a/tensorflow/python/estimator/canned/linear.py b/tensorflow/python/estimator/canned/linear.py
index 7a80dfacc2b2c478d1417e173ec6212569e93da8..a5b1172e729240a2ea02fa1d4330420786c2686c 100644
--- a/tensorflow/python/estimator/canned/linear.py
+++ b/tensorflow/python/estimator/canned/linear.py
@@ -23,7 +23,6 @@ import math
 import six
 
 from tensorflow.python.estimator import estimator
-from tensorflow.python.estimator import warm_starting_util
 from tensorflow.python.estimator.canned import head as head_lib
 from tensorflow.python.estimator.canned import optimizers
 from tensorflow.python.feature_column import feature_column as feature_column_lib
@@ -305,8 +304,8 @@ class LinearClassifier(estimator.Estimator):
           loss_reduction=loss_reduction)
 
     def _model_fn(features, labels, mode, config):
-      """Call the defined shared _linear_model_fn and possibly warm-start."""
-      estimator_spec = _linear_model_fn(
+      """Call the defined shared _linear_model_fn."""
+      return _linear_model_fn(
           features=features,
           labels=labels,
           mode=mode,
@@ -315,19 +314,12 @@ class LinearClassifier(estimator.Estimator):
           optimizer=optimizer,
           partitioner=partitioner,
           config=config)
-      # pylint: disable=protected-access
-      warm_start_settings = warm_starting_util._get_default_warm_start_settings(
-          warm_start_from)
-      if warm_start_settings:
-        warm_starting_util._warm_start(warm_start_settings)
-      # pylint: enable=protected-access
-
-      return estimator_spec
 
     super(LinearClassifier, self).__init__(
         model_fn=_model_fn,
         model_dir=model_dir,
-        config=config)
+        config=config,
+        warm_start_from=warm_start_from)
 
 
 class LinearRegressor(estimator.Estimator):
@@ -432,8 +424,8 @@ class LinearRegressor(estimator.Estimator):
         loss_reduction=loss_reduction)
 
     def _model_fn(features, labels, mode, config):
-      """Call the defined shared _linear_model_fn and possibly warm-start."""
-      estimator_spec = _linear_model_fn(
+      """Call the defined shared _linear_model_fn."""
+      return _linear_model_fn(
           features=features,
           labels=labels,
           mode=mode,
@@ -442,16 +434,9 @@ class LinearRegressor(estimator.Estimator):
           optimizer=optimizer,
           partitioner=partitioner,
           config=config)
-      # pylint: disable=protected-access
-      warm_start_settings = warm_starting_util._get_default_warm_start_settings(
-          warm_start_from)
-      if warm_start_settings:
-        warm_starting_util._warm_start(warm_start_settings)
-      # pylint: enable=protected-access
-
-      return estimator_spec
 
     super(LinearRegressor, self).__init__(
         model_fn=_model_fn,
         model_dir=model_dir,
-        config=config)
+        config=config,
+        warm_start_from=warm_start_from)
diff --git a/tensorflow/python/estimator/estimator.py b/tensorflow/python/estimator/estimator.py
index 5da34a85dbc5bddb9de24216a8f00940459aa5d1..78d74b63d3ebea29e2dc3ab8f655efcc7ab8e130 100644
--- a/tensorflow/python/estimator/estimator.py
+++ b/tensorflow/python/estimator/estimator.py
@@ -35,6 +35,7 @@ from tensorflow.python.eager import context
 from tensorflow.python.estimator import model_fn as model_fn_lib
 from tensorflow.python.estimator import run_config
 from tensorflow.python.estimator import util
+from tensorflow.python.estimator import warm_starting_util
 from tensorflow.python.estimator.export.export import build_all_signature_defs
 from tensorflow.python.estimator.export.export import get_temp_export_dir
 from tensorflow.python.estimator.export.export import get_timestamped_export_dir
@@ -97,9 +98,22 @@ class Estimator(object):
   @end_compatibility
   """
 
-  def __init__(self, model_fn, model_dir=None, config=None, params=None):
+  def __init__(self, model_fn, model_dir=None, config=None, params=None,
+               warm_start_from=None):
     """Constructs an `Estimator` instance.
 
+    See @{$estimators} for more information. To warm-start an `Estimator`:
+
+    ```python
+    estimator = tf.estimator.DNNClassifier(
+        feature_columns=[categorical_feature_a_emb, categorical_feature_b_emb],
+        hidden_units=[1024, 512, 256],
+        warm_start_from="/path/to/checkpoint/dir")
+    ```
+
+    For more details on warm-start configuration, see
+    @{tf.estimator.WarmStartSettings$WarmStartSettings}.
+
     Args:
       model_fn: Model function. Follows the signature:
 
@@ -136,6 +150,12 @@ class Estimator(object):
       config: Configuration object.
       params: `dict` of hyper parameters that will be passed into `model_fn`.
               Keys are names of parameters, values are basic python types.
+      warm_start_from: Optional string filepath to a checkpoint to warm-start
+                       from, or a `tf.estimator.WarmStartSettings` object to
+                       fully configure warm-starting.  If the string filepath is
+                       provided instead of a `WarmStartSettings`, then all
+                       variables are warm-started, and it is assumed that
+                       vocabularies and Tensor names are unchanged.
 
     Raises:
       RuntimeError: If eager execution is enabled.
@@ -193,6 +213,11 @@ class Estimator(object):
     self._model_fn = model_fn
     self._params = copy.deepcopy(params or {})
 
+    # pylint: disable=protected-access
+    self._warm_start_settings = (
+        warm_starting_util._get_default_warm_start_settings(warm_start_from))
+    # pylint: enable=protected-access
+
   @property
   def model_dir(self):
     return self._model_dir
@@ -453,13 +478,16 @@ class Estimator(object):
       estimator_spec = self._call_model_fn(
           features, None, model_fn_lib.ModeKeys.PREDICT, self.config)
       predictions = self._extract_keys(estimator_spec.predictions, predict_keys)
+      all_hooks = list(input_hooks)
+      all_hooks.extend(hooks)
+      all_hooks.extend(list(estimator_spec.prediction_hooks or []))
       with training.MonitoredSession(
           session_creator=training.ChiefSessionCreator(
               checkpoint_filename_with_path=checkpoint_path,
               master=self._config.master,
               scaffold=estimator_spec.scaffold,
               config=self._session_config),
-          hooks=input_hooks + hooks) as mon_sess:
+          hooks=all_hooks) as mon_sess:
         while not mon_sess.should_stop():
           preds_evaluated = mon_sess.run(predictions)
           if not isinstance(predictions, dict):
@@ -782,6 +810,13 @@ class Estimator(object):
       worker_hooks.extend(input_hooks)
       estimator_spec = self._call_model_fn(
           features, labels, model_fn_lib.ModeKeys.TRAIN, self.config)
+
+      if self._warm_start_settings:
+        logging.info('Warm-starting with WarmStartSettings: %s' %
+                     (self._warm_start_settings,))
+        # pylint: disable=protected-access
+        warm_starting_util._warm_start(self._warm_start_settings)
+        # pylint: enable=protected-access
       # Check if the user created a loss summary, and add one if they didn't.
       # We assume here that the summary is called 'loss'. If it is not, we will
       # make another one with the name 'loss' to ensure it shows up in the right
diff --git a/tensorflow/python/estimator/estimator_test.py b/tensorflow/python/estimator/estimator_test.py
index ed1676a92de19203be8bc61fc6efeb559a2fb8aa..39a5b998ebdcccfbeddf0fc96dab44dc91a289fa 100644
--- a/tensorflow/python/estimator/estimator_test.py
+++ b/tensorflow/python/estimator/estimator_test.py
@@ -52,6 +52,7 @@ from tensorflow.python.ops import metrics as metrics_lib
 from tensorflow.python.ops import parsing_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import string_ops
+from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.ops.losses import losses
 from tensorflow.python.platform import gfile
@@ -629,6 +630,33 @@ class EstimatorTrainTest(test.TestCase):
     self.assertEqual(
         10, estimator._load_global_step_from_checkpoint_dir(est.model_dir))
 
+  def test_warm_starts(self):
+    def _make_model_fn(x):
+      def _variable_creating_model_fn(features, labels, mode):
+        _, _ = features, labels
+        variable_scope.get_variable('x', initializer=x)
+        global_step = training.get_global_step()
+        return model_fn_lib.EstimatorSpec(
+            mode,
+            loss=constant_op.constant(1.),
+            train_op=state_ops.assign_add(global_step, 1))
+      return _variable_creating_model_fn
+
+    est = estimator.Estimator(model_fn=_make_model_fn(42.))
+    est.train(dummy_input_fn, steps=10)
+
+    warm_started_est = estimator.Estimator(
+        model_fn=_make_model_fn(36.),
+        warm_start_from=est.model_dir)
+    warm_started_est.train(dummy_input_fn, steps=5)
+    # warm_start is called after the model_fn, so x should have the value
+    # from the checkpoint.
+    self.assertEqual(42., warm_started_est.get_variable_value('x'))
+    # global_step should not be warm-started.
+    self.assertEqual(
+        5, estimator._load_global_step_from_checkpoint_dir(
+            warm_started_est.model_dir))
+
   def test_max_step(self):
     est = estimator.Estimator(model_fn=model_fn_global_step_incrementer)
     est.train(dummy_input_fn, max_steps=5)
@@ -1327,6 +1355,25 @@ class EstimatorPredictTest(test.TestCase):
     est.train(dummy_input_fn, steps=1)
     self.assertEqual(10., next(est.predict(dummy_input_fn)))
 
+  def test_predictionhooks_are_used(self):
+    hook = test.mock.MagicMock(
+        wraps=training.SessionRunHook(), spec=training.SessionRunHook)
+
+    def _model_fn_hooks(features, labels, mode):
+      _, _ = features, labels
+      return model_fn_lib.EstimatorSpec(
+          mode=mode,
+          loss=constant_op.constant(0.),
+          train_op=state_ops.assign_add(training.get_global_step(), 1),
+          predictions=constant_op.constant([[10.]]),
+          prediction_hooks=[hook])
+
+    est = estimator.Estimator(model_fn=_model_fn_hooks)
+    est.train(dummy_input_fn, steps=1)
+    self.assertFalse(hook.begin.called)
+    next(est.predict(dummy_input_fn))
+    self.assertTrue(hook.begin.called)
+
   def test_warn_if_no_queue_runner(self):
 
     def _model_fn(features, labels, mode):
diff --git a/tensorflow/python/estimator/export/export.py b/tensorflow/python/estimator/export/export.py
index 51075731ddc52a55799958c3bfa6140f77404541..83251c79fc561e16ebddb638668b92b3c69b8af4 100644
--- a/tensorflow/python/estimator/export/export.py
+++ b/tensorflow/python/estimator/export/export.py
@@ -36,12 +36,14 @@ from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.saved_model import signature_constants
 from tensorflow.python.saved_model import signature_def_utils
 from tensorflow.python.util import compat
+from tensorflow.python.util.tf_export import tf_export
 
 
 _SINGLE_FEATURE_DEFAULT_NAME = 'feature'
 _SINGLE_RECEIVER_DEFAULT_NAME = 'input'
 
 
+@tf_export('estimator.export.ServingInputReceiver')
 class ServingInputReceiver(collections.namedtuple(
     'ServingInputReceiver',
     ['features', 'receiver_tensors', 'receiver_tensors_alternatives'])):
@@ -118,6 +120,7 @@ class ServingInputReceiver(collections.namedtuple(
         receiver_tensors_alternatives=receiver_tensors_alternatives)
 
 
+@tf_export('estimator.export.build_parsing_serving_input_receiver_fn')
 def build_parsing_serving_input_receiver_fn(feature_spec,
                                             default_batch_size=None):
   """Build a serving_input_receiver_fn expecting fed tf.Examples.
@@ -146,6 +149,7 @@ def build_parsing_serving_input_receiver_fn(feature_spec,
   return serving_input_receiver_fn
 
 
+@tf_export('estimator.export.build_raw_serving_input_receiver_fn')
 def build_raw_serving_input_receiver_fn(features, default_batch_size=None):
   """Build a serving_input_receiver_fn expecting feature Tensors.
 
diff --git a/tensorflow/python/estimator/export/export_output.py b/tensorflow/python/estimator/export/export_output.py
index 863af6d41d985043542b03375372fe564c283b82..87b964be37197dac99b8ce4398cbdaf3b4989c7f 100644
--- a/tensorflow/python/estimator/export/export_output.py
+++ b/tensorflow/python/estimator/export/export_output.py
@@ -26,8 +26,10 @@ import six
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.saved_model import signature_def_utils
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('estimator.export.ExportOutput')
 class ExportOutput(object):
   """Represents an output of a model that can be served.
 
@@ -50,6 +52,7 @@ class ExportOutput(object):
     pass
 
 
+@tf_export('estimator.export.ClassificationOutput')
 class ClassificationOutput(ExportOutput):
   """Represents the output of a classification head.
 
@@ -118,6 +121,7 @@ class ClassificationOutput(ExportOutput):
         examples, self.classes, self.scores)
 
 
+@tf_export('estimator.export.RegressionOutput')
 class RegressionOutput(ExportOutput):
   """Represents the output of a regression head."""
 
@@ -153,6 +157,7 @@ class RegressionOutput(ExportOutput):
 _SINGLE_OUTPUT_DEFAULT_NAME = 'output'
 
 
+@tf_export('estimator.export.PredictOutput')
 class PredictOutput(ExportOutput):
   """Represents the output of a generic prediction head.
 
diff --git a/tensorflow/python/estimator/inputs/numpy_io.py b/tensorflow/python/estimator/inputs/numpy_io.py
index c4c2e30e8771c5cb1e492fed751c71583dcf477b..a6f471291008e3c27dea1aeea5865e334f76e5c8 100644
--- a/tensorflow/python/estimator/inputs/numpy_io.py
+++ b/tensorflow/python/estimator/inputs/numpy_io.py
@@ -24,6 +24,7 @@ import numpy as np
 from six import string_types
 
 from tensorflow.python.estimator.inputs.queues import feeding_functions
+from tensorflow.python.util.tf_export import tf_export
 
 # Key name to pack the target into dict of `features`. See
 # `_get_unique_target_key` for details.
@@ -86,6 +87,7 @@ def _validate_and_convert_features(x):
   return ordered_dict_data
 
 
+@tf_export('estimator.inputs.numpy_input_fn')
 def numpy_input_fn(x,
                    y=None,
                    batch_size=128,
diff --git a/tensorflow/python/estimator/inputs/pandas_io.py b/tensorflow/python/estimator/inputs/pandas_io.py
index 90d6145377d8f931b94793f8a912f77f1620f16e..bd06843021f47f81fc0c22d0fcee43530dc10098 100644
--- a/tensorflow/python/estimator/inputs/pandas_io.py
+++ b/tensorflow/python/estimator/inputs/pandas_io.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 
 import numpy as np
 from tensorflow.python.estimator.inputs.queues import feeding_functions
+from tensorflow.python.util.tf_export import tf_export
 
 try:
   # pylint: disable=g-import-not-at-top
@@ -34,6 +35,7 @@ except ImportError:
   HAS_PANDAS = False
 
 
+@tf_export('estimator.inputs.pandas_input_fn')
 def pandas_input_fn(x,
                     y=None,
                     batch_size=128,
diff --git a/tensorflow/python/estimator/inputs/queues/feeding_functions.py b/tensorflow/python/estimator/inputs/queues/feeding_functions.py
index 75c0e61d47b37110b14aa57f6a185cab822a70bb..8e5d8141a1a15d8cb28aefc0f24c02495337245d 100644
--- a/tensorflow/python/estimator/inputs/queues/feeding_functions.py
+++ b/tensorflow/python/estimator/inputs/queues/feeding_functions.py
@@ -47,10 +47,9 @@ except ImportError:
 
 
 def _fill_array(arr, seq, fillvalue=0):
-  """
-  Recursively fills padded arr with elements from seq.
-  If length of seq is less than arr padded length, fillvalue used.
+  """Recursively fills padded arr with elements from seq.
 
+  If length of seq is less than arr padded length, fillvalue used.
   Args:
     arr: Padded tensor of shape [batch_size, ..., max_padded_dim_len].
     seq: Non-padded list of data sampels of shape
@@ -84,28 +83,30 @@ def _pad_if_needed(batch_key_item, fillvalue=0):
   Raises:
     ValueError if data samples have different shapes (except last padded dim).
   """
-  shapes = [seq.shape[:-1] if len(seq.shape) > 0 else -1
-            for seq in batch_key_item]
+  shapes = [
+      seq.shape[:-1] if len(seq.shape) > 0 else -1 for seq in batch_key_item
+  ]
   if not all(shapes[0] == x for x in shapes):
     raise ValueError("Array shapes must match.")
 
-  last_length = [seq.shape[-1] if len(seq.shape) > 0 else 0
-                 for seq in batch_key_item]
+  last_length = [
+      seq.shape[-1] if len(seq.shape) > 0 else 0 for seq in batch_key_item
+  ]
   if all([x == last_length[0] for x in last_length]):
     return batch_key_item
 
   batch_size = len(batch_key_item)
   max_sequence_length = max(last_length)
   result_batch = np.zeros(
-    shape=[batch_size] + list(shapes[0]) + [max_sequence_length],
-    dtype=batch_key_item[0].dtype)
+      shape=[batch_size] + list(shapes[0]) + [max_sequence_length],
+      dtype=batch_key_item[0].dtype)
   _fill_array(result_batch, batch_key_item, fillvalue)
   return result_batch
 
 
-def _get_integer_indices_for_next_batch(
-    batch_indices_start, batch_size, epoch_end, array_length,
-    current_epoch, total_epochs):
+def _get_integer_indices_for_next_batch(batch_indices_start, batch_size,
+                                        epoch_end, array_length, current_epoch,
+                                        total_epochs):
   """Returns the integer indices for next batch.
 
   If total epochs is not None and current epoch is the final epoch, the end
@@ -135,8 +136,9 @@ def _get_integer_indices_for_next_batch(
                                  "Already emitted %s epochs." % current_epoch)
 
   batch_indices_end = batch_indices_start + batch_size
-  batch_indices = [j % array_length for j in
-                   range(batch_indices_start, batch_indices_end)]
+  batch_indices = [
+      j % array_length for j in range(batch_indices_start, batch_indices_end)
+  ]
   epoch_end_indices = [i for i, x in enumerate(batch_indices) if x == epoch_end]
   current_epoch += len(epoch_end_indices)
 
@@ -320,16 +322,20 @@ class _GeneratorFeedFn(object):
           raise KeyError("key mismatch between dicts emitted by GenFun "
                          "Expected {} keys; got {}".format(
                              self._keys, data_row.keys()))
-        list_dict.setdefault(self._col_placeholders[index],
-                             list()).append(data_row[key])
+        list_dict.setdefault(self._col_placeholders[index], list()).append(
+            data_row[key])
         list_dict_size += 1
 
     if self._pad_value is not None:
-      feed_dict = {key: np.asarray(_pad_if_needed(item, self._pad_value))
-                   for key, item in list(list_dict.items())}
+      feed_dict = {
+          key: np.asarray(_pad_if_needed(item, self._pad_value))
+          for key, item in list(list_dict.items())
+      }
     else:
-      feed_dict = {key: np.asarray(item)
-                   for key, item in list(list_dict.items())}
+      feed_dict = {
+          key: np.asarray(item)
+          for key, item in list(list_dict.items())
+      }
     return feed_dict
 
 
@@ -382,9 +388,8 @@ def _enqueue_data(data,
       queue_shapes = [(), data.shape[1:]]
       get_feed_fn = _ArrayFeedFn
     elif isinstance(data, collections.OrderedDict):
-      types = [dtypes.int64] + [
-          dtypes.as_dtype(col.dtype) for col in data.values()
-      ]
+      types = [dtypes.int64
+              ] + [dtypes.as_dtype(col.dtype) for col in data.values()]
       queue_shapes = [()] + [col.shape[1:] for col in data.values()]
       get_feed_fn = _OrderedDictNumpyFeedFn
     elif isinstance(data, tp.FunctionType):
@@ -447,11 +452,11 @@ def _enqueue_data(data,
           seed=seed)
     elif pad_data:
       min_after_dequeue = 0  # just for the summary text
-      queue_shapes = list(map(
-        lambda x: tuple(list(x[:-1]) + [None]) if len(x) > 0 else x,
-        queue_shapes))
+      queue_shapes = list(
+          map(lambda x: tuple(list(x[:-1]) + [None]) if len(x) > 0 else x,
+              queue_shapes))
       queue = data_flow_ops.PaddingFIFOQueue(
-        capacity, dtypes=types, shapes=queue_shapes)
+          capacity, dtypes=types, shapes=queue_shapes)
     else:
       min_after_dequeue = 0  # just for the summary text
       queue = data_flow_ops.FIFOQueue(
@@ -470,31 +475,35 @@ def _enqueue_data(data,
 
       if not pad_data:
         feed_fns.append(
-          get_feed_fn(
-              placeholders,
-              data,
-              enqueue_size,
-              random_start=shuffle,
-              seed=seed_i,
-              num_epochs=num_epochs))
+            get_feed_fn(
+                placeholders,
+                data,
+                enqueue_size,
+                random_start=shuffle,
+                seed=seed_i,
+                num_epochs=num_epochs))
       else:
         feed_fns.append(
-          get_feed_fn(
-              placeholders,
-              data,
-              enqueue_size,
-              random_start=shuffle,
-              seed=seed_i,
-              num_epochs=num_epochs,
-              pad_value=pad_value))
+            get_feed_fn(
+                placeholders,
+                data,
+                enqueue_size,
+                random_start=shuffle,
+                seed=seed_i,
+                num_epochs=num_epochs,
+                pad_value=pad_value))
 
     runner = fqr._FeedingQueueRunner(  # pylint: disable=protected-access
-        queue=queue, enqueue_ops=enqueue_ops, feed_fns=feed_fns)
+        queue=queue,
+        enqueue_ops=enqueue_ops,
+        feed_fns=feed_fns)
     queue_runner.add_queue_runner(runner)
 
-    full = (math_ops.cast(
-        math_ops.maximum(0, queue.size() - min_after_dequeue),
-        dtypes.float32) * (1. / (capacity - min_after_dequeue)))
+    full = (
+        math_ops.cast(
+            math_ops.maximum(0,
+                             queue.size() - min_after_dequeue), dtypes.float32)
+        * (1. / (capacity - min_after_dequeue)))
     # Note that name contains a '/' at the end so we intentionally do not place
     # a '/' after %s below.
     summary_name = ("queue/%sfraction_over_%d_of_%d_full" %
diff --git a/tensorflow/python/estimator/model_fn.py b/tensorflow/python/estimator/model_fn.py
index da202408c3680b397994620e221fa4937d7c65e4..b08f83fc569b1bb1ea6e5c93c57be7b5bb96f0a5 100644
--- a/tensorflow/python/estimator/model_fn.py
+++ b/tensorflow/python/estimator/model_fn.py
@@ -56,7 +56,7 @@ class EstimatorSpec(
     collections.namedtuple('EstimatorSpec', [
         'mode', 'predictions', 'loss', 'train_op', 'eval_metric_ops',
         'export_outputs', 'training_chief_hooks', 'training_hooks', 'scaffold',
-        'evaluation_hooks'
+        'evaluation_hooks', 'prediction_hooks'
     ])):
   """Ops and objects returned from a `model_fn` and passed to an `Estimator`.
 
@@ -73,7 +73,8 @@ class EstimatorSpec(
               training_chief_hooks=None,
               training_hooks=None,
               scaffold=None,
-              evaluation_hooks=None):
+              evaluation_hooks=None,
+              prediction_hooks=None):
     """Creates a validated `EstimatorSpec` instance.
 
     Depending on the value of `mode`, different arguments are required. Namely
@@ -154,6 +155,8 @@ class EstimatorSpec(
         initialization, saver, and more to be used in training.
       evaluation_hooks: Iterable of `tf.train.SessionRunHook` objects to
         run during evaluation.
+      prediction_hooks: Iterable of `tf.train.SessionRunHook` objects to
+        run during predictions.
 
     Returns:
       A validated `EstimatorSpec` object.
@@ -282,7 +285,10 @@ class EstimatorSpec(
     training_chief_hooks = tuple(training_chief_hooks or [])
     training_hooks = tuple(training_hooks or [])
     evaluation_hooks = tuple(evaluation_hooks or [])
-    for hook in training_hooks + training_chief_hooks + evaluation_hooks:
+    prediction_hooks = tuple(prediction_hooks or [])
+
+    for hook in (training_hooks + training_chief_hooks + evaluation_hooks +
+                 prediction_hooks):
       if not isinstance(hook, session_run_hook.SessionRunHook):
         raise TypeError(
             'All hooks must be SessionRunHook instances, given: {}'.format(
@@ -305,7 +311,8 @@ class EstimatorSpec(
         training_chief_hooks=training_chief_hooks,
         training_hooks=training_hooks,
         scaffold=scaffold,
-        evaluation_hooks=evaluation_hooks)
+        evaluation_hooks=evaluation_hooks,
+        prediction_hooks=prediction_hooks)
 
   def _replace(self, **kwds):
     """Return a new EstimatorSpec replacing specified fields with new values."""
diff --git a/tensorflow/python/estimator/model_fn_test.py b/tensorflow/python/estimator/model_fn_test.py
index d67c4b716161816d941eef94a4b9aeb0643de55e..b7eeeb437cb4a624cdee552be3032364b18a8290 100644
--- a/tensorflow/python/estimator/model_fn_test.py
+++ b/tensorflow/python/estimator/model_fn_test.py
@@ -72,7 +72,8 @@ class EstimatorSpecTrainTest(test.TestCase):
           training_chief_hooks=[_FakeHook()],
           training_hooks=[_FakeHook()],
           scaffold=monitored_session.Scaffold(),
-          evaluation_hooks=[_FakeHook()])
+          evaluation_hooks=[_FakeHook()],
+          prediction_hooks=[_FakeHook()])
 
   def testLossNumber(self):
     """Tests that error is raised when loss is a number (not Tensor)."""
@@ -465,7 +466,17 @@ class EstimatorSpecInferTest(test.TestCase):
           training_chief_hooks=[_FakeHook()],
           training_hooks=[_FakeHook()],
           scaffold=monitored_session.Scaffold(),
-          evaluation_hooks=[_FakeHook()])
+          evaluation_hooks=[_FakeHook()],
+          prediction_hooks=[_FakeHook()])
+
+  def testPredictionHookInvalid(self):
+    with ops.Graph().as_default(), self.test_session():
+      with self.assertRaisesRegexp(
+          TypeError, 'All hooks must be SessionRunHook instances'):
+        model_fn.EstimatorSpec(
+            mode=model_fn.ModeKeys.PREDICT,
+            predictions=constant_op.constant(1.),
+            prediction_hooks=[_InvalidHook()])
 
   def testPredictionsMissing(self):
     with ops.Graph().as_default(), self.test_session():
diff --git a/tensorflow/python/estimator/training.py b/tensorflow/python/estimator/training.py
index 52fb1d39ae2e9c84e4269785a72be4f9a495b73c..2e84c5014f6e17b34b38d2dfe5711b5b654553bb 100644
--- a/tensorflow/python/estimator/training.py
+++ b/tensorflow/python/estimator/training.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Classes and functions related to train_and_evaluate."""
 
 from __future__ import absolute_import
@@ -37,7 +36,6 @@ from tensorflow.python.training import server_lib
 from tensorflow.python.training import session_run_hook
 from tensorflow.python.util import compat
 
-
 _MAX_DELAY_SECS = 60
 _DELAY_SECS_PER_WORKER = 5
 _TF_CONFIG_ENV = 'TF_CONFIG'
@@ -50,8 +48,7 @@ _TRAINER_JOBS = (run_config_lib.TaskType.CHIEF, run_config_lib.TaskType.MASTER,
 def _validate_input_fn(input_fn):
   """Validates the `input_fn`."""
   if not callable(input_fn):
-    raise TypeError(
-        '`input_fn` must be callable, given: {}'.format(input_fn))
+    raise TypeError('`input_fn` must be callable, given: {}'.format(input_fn))
 
 
 def _validate_hooks(hooks):
@@ -125,10 +122,7 @@ class TrainSpec(
   duration. Optional hooks run at various stages of training.
   """
 
-  def __new__(cls,
-              input_fn,
-              max_steps=None,
-              hooks=None):
+  def __new__(cls, input_fn, max_steps=None, hooks=None):
     """Creates a validated `TrainSpec` instance.
 
     Args:
@@ -161,16 +155,13 @@ class TrainSpec(
     hooks = _validate_hooks(hooks)
 
     return super(TrainSpec, cls).__new__(
-        cls,
-        input_fn=input_fn,
-        max_steps=max_steps,
-        hooks=hooks)
+        cls, input_fn=input_fn, max_steps=max_steps, hooks=hooks)
 
 
 class EvalSpec(
     collections.namedtuple('EvalSpec', [
-        'input_fn', 'steps', 'name', 'hooks', 'exporters',
-        'start_delay_secs', 'throttle_secs'
+        'input_fn', 'steps', 'name', 'hooks', 'exporters', 'start_delay_secs',
+        'throttle_secs'
     ])):
   """Configuration for the "eval" part for the `train_and_evaluate` call.
 
@@ -417,8 +408,8 @@ def train_and_evaluate(estimator, train_spec, eval_spec):
   Raises:
     ValueError: if environment variable `TF_CONFIG` is incorrectly set.
   """
-  executor = _TrainingExecutor(estimator=estimator, train_spec=train_spec,
-                               eval_spec=eval_spec)
+  executor = _TrainingExecutor(
+      estimator=estimator, train_spec=train_spec, eval_spec=eval_spec)
 
   config = estimator.config
   if (config.task_type == run_config_lib.TaskType.EVALUATOR and
@@ -561,9 +552,8 @@ class _TrainingExecutor(object):
           self._timer.update_last_triggered_step(global_step_value)
           self._evaluator.evaluate_and_export()
         else:
-          logging.info(
-              'Skip the current checkpoint eval due to throttle secs '
-              '({} secs).'.format(self._eval_throttle_secs))
+          logging.info('Skip the current checkpoint eval due to throttle secs '
+                       '({} secs).'.format(self._eval_throttle_secs))
 
     # Final export signal: For any eval result with global_step >= train
     # max_steps, the evaluator will send the final export signal. There is a
@@ -576,8 +566,8 @@ class _TrainingExecutor(object):
     #
     # But here, throttle_secs will skip the next intermediate checkpoint and,
     # so, the double final export chance is very small.
-    evaluator = _TrainingExecutor._Evaluator(
-        self._estimator, self._eval_spec, self._train_spec.max_steps)
+    evaluator = _TrainingExecutor._Evaluator(self._estimator, self._eval_spec,
+                                             self._train_spec.max_steps)
 
     # When the underlying `Estimator` object saves a new checkpoint, we would
     # like this callback to be called so that evaluation and export can trigger.
@@ -617,8 +607,7 @@ class _TrainingExecutor(object):
       raise ValueError('eval_spec.throttle_secs should be positive, given: {}.'
                        'It is used do determine how long each training '
                        'iteration should go when train and evaluate '
-                       'locally.'.format(
-                           self._eval_spec.throttle_secs))
+                       'locally.'.format(self._eval_spec.throttle_secs))
 
     stop_hook = _StopAtSecsHook(self._eval_spec.throttle_secs)
     train_hooks = (
@@ -663,8 +652,9 @@ class _TrainingExecutor(object):
 
     if not config.master:
       jobs = config.cluster_spec.jobs
-      if (len(jobs) == 1 and len(config.cluster_spec.job_tasks(jobs[0])) == 1
-          and config.task_type in _TRAINER_JOBS):
+      if (len(jobs) == 1 and
+          len(config.cluster_spec.job_tasks(jobs[0])) == 1 and
+          config.task_type in _TRAINER_JOBS):
         # For distributed training, config.master is empty if and only if it has
         # a single node in the cluster spec. In this case, we should not start
         # the server.
@@ -679,9 +669,9 @@ class _TrainingExecutor(object):
     logging.info('Start Tensorflow server.')
 
     if config.session_config is None:
-      session_config=config_pb2.ConfigProto(log_device_placement=False)
+      session_config = config_pb2.ConfigProto(log_device_placement=False)
     else:
-      session_config=config_pb2.ConfigProto(
+      session_config = config_pb2.ConfigProto(
           log_device_placement=False,
           gpu_options=config.session_config.gpu_options)
 
@@ -744,8 +734,7 @@ class _TrainingExecutor(object):
             global_step >= self._train_spec.max_steps):
           logging.info(
               'Exiting evaluation, global_step=%s >= train max_steps=%s',
-              global_step,
-              self._train_spec.max_steps)
+              global_step, self._train_spec.max_steps)
           return
 
       latest_eval_result, should_early_stop = self._execute_evaluator_once(
@@ -781,10 +770,9 @@ class _TrainingExecutor(object):
 
     # Throttle if necessary.
     elapsed_time = time.time() - start
-    difference = throttle_secs  - elapsed_time
+    difference = throttle_secs - elapsed_time
     if difference > 0:
-      logging.info('Waiting %f secs before starting next eval run.',
-                   difference)
+      logging.info('Waiting %f secs before starting next eval run.', difference)
       time.sleep(difference)
 
     return (eval_result, should_early_stop)
@@ -929,8 +917,8 @@ class _EvalResult(
       if checkpoint_path:
         raise ValueError(
             'checkpoint must be `None` if status is not {}; got status {}, '
-            'checkpoint_path {}'.format(
-                _EvalStatus.EVALUATED, status, checkpoint_path))
+            'checkpoint_path {}'.format(_EvalStatus.EVALUATED, status,
+                                        checkpoint_path))
       return super(_EvalResult, cls).__new__(cls, status, metrics,
                                              checkpoint_path)
 
diff --git a/tensorflow/python/estimator/warm_starting_util.py b/tensorflow/python/estimator/warm_starting_util.py
index c748b318b730f4a4ff855c5e4558da88ada9581b..48110ef57fcba897cc495323973b2f6761c3add4 100644
--- a/tensorflow/python/estimator/warm_starting_util.py
+++ b/tensorflow/python/estimator/warm_starting_util.py
@@ -402,10 +402,10 @@ def _warm_start_var_with_vocab(var,
 
 
 def _warm_start(warm_start_settings):
-  """Warmstarts a model using the given settings.
+  """Warm-starts a model using the given settings.
 
-  Currently, this is intended for use only in canned Estimators.  Once made
-  public, it can be used in any model_fn.
+  If you are using a tf.estimator.Estimator, this will automatically be called
+  during training.
 
   Args:
     warm_start_settings: An object of `WarmStartSettings`.
@@ -415,8 +415,8 @@ def _warm_start(warm_start_settings):
       a stronger check for variable configuration than relying on users to
       examine the logs.
   """
-  logging.info("Warm-starting from: ",
-               warm_start_settings.ckpt_to_initialize_from)
+  logging.info("Warm-starting from: %s",
+               (warm_start_settings.ckpt_to_initialize_from,))
   # We have to deal with partitioned variables, since get_collection flattens
   # out the list.
   grouped_variables = {}
diff --git a/tensorflow/python/feature_column/feature_column.py b/tensorflow/python/feature_column/feature_column.py
index 7feb209cc49c4be70387c44168dbdeea6d108d66..5947d8f6e2348b12dae8f8ee05c26ecd9e342fcd 100644
--- a/tensorflow/python/feature_column/feature_column.py
+++ b/tensorflow/python/feature_column/feature_column.py
@@ -157,6 +157,7 @@ from tensorflow.python.platform import gfile
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import checkpoint_utils
 from tensorflow.python.util import nest
+from tensorflow.python.util.tf_export import tf_export
 
 
 def _internal_input_layer(features,
@@ -209,6 +210,7 @@ def _internal_input_layer(features,
     return array_ops.concat(output_tensors, 1)
 
 
+@tf_export('feature_column.input_layer')
 def input_layer(features,
                 feature_columns,
                 weight_collections=None,
@@ -329,6 +331,7 @@ class InputLayer(object):
     return self._input_layer_template.weights
 
 
+@tf_export('feature_column.linear_model')
 def linear_model(features,
                  feature_columns,
                  units=1,
@@ -498,6 +501,7 @@ def _transform_features(features, feature_columns):
   return outputs
 
 
+@tf_export('feature_column.make_parse_example_spec')
 def make_parse_example_spec(feature_columns):
   """Creates parsing spec dictionary from input feature_columns.
 
@@ -557,6 +561,7 @@ def make_parse_example_spec(feature_columns):
   return result
 
 
+@tf_export('feature_column.embedding_column')
 def embedding_column(
     categorical_column, dimension, combiner='mean', initializer=None,
     ckpt_to_load_from=None, tensor_name_in_ckpt=None, max_norm=None,
@@ -807,6 +812,7 @@ def shared_embedding_columns(
   return result
 
 
+@tf_export('feature_column.numeric_column')
 def numeric_column(key,
                    shape=(1,),
                    default_value=None,
@@ -881,6 +887,7 @@ def numeric_column(key,
       normalizer_fn=normalizer_fn)
 
 
+@tf_export('feature_column.bucketized_column')
 def bucketized_column(source_column, boundaries):
   """Represents discretized dense input.
 
@@ -970,6 +977,7 @@ def _assert_string_or_int(dtype, prefix):
         '{} dtype must be string or integer. dtype: {}.'.format(prefix, dtype))
 
 
+@tf_export('feature_column.categorical_column_with_hash_bucket')
 def categorical_column_with_hash_bucket(key,
                                         hash_bucket_size,
                                         dtype=dtypes.string):
@@ -1026,6 +1034,7 @@ def categorical_column_with_hash_bucket(key,
   return _HashedCategoricalColumn(key, hash_bucket_size, dtype)
 
 
+@tf_export('feature_column.categorical_column_with_vocabulary_file')
 def categorical_column_with_vocabulary_file(key,
                                             vocabulary_file,
                                             vocabulary_size=None,
@@ -1145,6 +1154,7 @@ def categorical_column_with_vocabulary_file(key,
       dtype=dtype)
 
 
+@tf_export('feature_column.categorical_column_with_vocabulary_list')
 def categorical_column_with_vocabulary_list(
     key, vocabulary_list, dtype=None, default_value=-1, num_oov_buckets=0):
   """A `_CategoricalColumn` with in-memory vocabulary.
@@ -1255,6 +1265,7 @@ def categorical_column_with_vocabulary_list(
       default_value=default_value, num_oov_buckets=num_oov_buckets)
 
 
+@tf_export('feature_column.categorical_column_with_identity')
 def categorical_column_with_identity(key, num_buckets, default_value=None):
   """A `_CategoricalColumn` that returns identity values.
 
@@ -1322,6 +1333,7 @@ def categorical_column_with_identity(key, num_buckets, default_value=None):
       key=key, num_buckets=num_buckets, default_value=default_value)
 
 
+@tf_export('feature_column.indicator_column')
 def indicator_column(categorical_column):
   """Represents multi-hot representation of given categorical column.
 
@@ -1350,6 +1362,7 @@ def indicator_column(categorical_column):
   return _IndicatorColumn(categorical_column)
 
 
+@tf_export('feature_column.weighted_categorical_column')
 def weighted_categorical_column(
     categorical_column, weight_feature_key, dtype=dtypes.float32):
   """Applies weight values to a `_CategoricalColumn`.
@@ -1424,6 +1437,7 @@ def weighted_categorical_column(
       dtype=dtype)
 
 
+@tf_export('feature_column.crossed_column')
 def crossed_column(keys, hash_bucket_size, hash_key=None):
   """Returns a column for performing crosses of categorical features.
 
diff --git a/tensorflow/python/framework/dtypes.py b/tensorflow/python/framework/dtypes.py
index 67ccf990d6a0e59c965ff76c2ba601be2a64060a..c825114483697f63163d5a33365049b7c2b539a3 100644
--- a/tensorflow/python/framework/dtypes.py
+++ b/tensorflow/python/framework/dtypes.py
@@ -12,20 +12,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Library of dtypes (Tensor element types)."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-
 import numpy as np
 
 from tensorflow.core.framework import types_pb2
 from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.util.tf_export import tf_export
 
-
 _np_bfloat16 = pywrap_tensorflow.TF_bfloat16_type()
 
 
@@ -83,8 +80,8 @@ class DType(object):
     # TODO(mrry): Make the necessary changes (using __new__) to ensure
     # that calling this returns one of the interned values.
     type_enum = int(type_enum)
-    if (type_enum not in types_pb2.DataType.values()
-        or type_enum == types_pb2.DT_INVALID):
+    if (type_enum not in types_pb2.DataType.values() or
+        type_enum == types_pb2.DT_INVALID):
       raise TypeError(
           "type_enum is not a valid types_pb2.DataType: %s" % type_enum)
     self._type_enum = type_enum
@@ -123,10 +120,10 @@ class DType(object):
 
   @property
   def is_numpy_compatible(self):
-    numpy_incompatible = [types_pb2.DT_VARIANT,
-                          types_pb2.DT_VARIANT_REF,
-                          types_pb2.DT_RESOURCE,
-                          types_pb2.DT_RESOURCE_REF]
+    numpy_incompatible = [
+        types_pb2.DT_VARIANT, types_pb2.DT_VARIANT_REF, types_pb2.DT_RESOURCE,
+        types_pb2.DT_RESOURCE_REF
+    ]
     return self._type_enum not in numpy_incompatible
 
   @property
@@ -153,9 +150,9 @@ class DType(object):
   @property
   def is_floating(self):
     """Returns whether this is a (non-quantized, real) floating point type."""
-    return ((self.is_numpy_compatible and np.issubdtype(self.as_numpy_dtype,
-                                                        np.floating))
-            or self.base_dtype == bfloat16)
+    return ((self.is_numpy_compatible and
+             np.issubdtype(self.as_numpy_dtype, np.floating)) or
+            self.base_dtype == bfloat16)
 
   @property
   def is_complex(self):
@@ -190,8 +187,8 @@ class DType(object):
       TypeError: if this is a non-numeric, unordered, or quantized type.
 
     """
-    if (self.is_quantized or self.base_dtype in
-        (bool, string, complex64, complex128)):
+    if (self.is_quantized or
+        self.base_dtype in (bool, string, complex64, complex128)):
       raise TypeError("Cannot find minimum value of %s." % self)
 
     # there is no simple way to get the min value of a dtype, we have to check
@@ -214,8 +211,8 @@ class DType(object):
       TypeError: if this is a non-numeric, unordered, or quantized type.
 
     """
-    if (self.is_quantized or self.base_dtype in
-        (bool, string, complex64, complex128)):
+    if (self.is_quantized or
+        self.base_dtype in (bool, string, complex64, complex128)):
       raise TypeError("Cannot find maximum value of %s." % self)
 
     # there is no simple way to get the max value of a dtype, we have to check
@@ -266,8 +263,8 @@ class DType(object):
       this `DType`.
     """
     other = as_dtype(other)
-    return self._type_enum in (
-        other.as_datatype_enum, other.base_dtype.as_datatype_enum)
+    return self._type_enum in (other.as_datatype_enum,
+                               other.base_dtype.as_datatype_enum)
 
   def __eq__(self, other):
     """Returns True iff this DType refers to the same type as `other`."""
@@ -307,19 +304,22 @@ class DType(object):
       return 1
     return np.dtype(self.as_numpy_dtype).itemsize
 
+
 # Define data type range of numpy dtype
-dtype_range = {np.bool_: (False, True),
-               np.bool8: (False, True),
-               np.uint8: (0, 255),
-               np.uint16: (0, 65535),
-               np.int8: (-128, 127),
-               np.int16: (-32768, 32767),
-               np.int64: (-2**63, 2**63 - 1),
-               np.uint64: (0, 2**64 - 1),
-               np.int32: (-2**31, 2**31 - 1),
-               np.uint32: (0, 2**32 - 1),
-               np.float32: (-1, 1),
-               np.float64: (-1, 1)}
+dtype_range = {
+    np.bool_: (False, True),
+    np.bool8: (False, True),
+    np.uint8: (0, 255),
+    np.uint16: (0, 65535),
+    np.int8: (-128, 127),
+    np.int16: (-32768, 32767),
+    np.int64: (-2**63, 2**63 - 1),
+    np.uint64: (0, 2**64 - 1),
+    np.int32: (-2**31, 2**31 - 1),
+    np.uint32: (0, 2**32 - 1),
+    np.float32: (-1, 1),
+    np.float64: (-1, 1)
+}
 
 # Define standard wrappers for the types_pb2.DataType enum.
 resource = DType(types_pb2.DT_RESOURCE)
@@ -396,7 +396,6 @@ quint16_ref = DType(types_pb2.DT_QUINT16_REF)
 qint32_ref = DType(types_pb2.DT_QINT32_REF)
 bfloat16_ref = DType(types_pb2.DT_BFLOAT16_REF)
 
-
 # Maintain an intern table so that we don't have to create a large
 # number of small objects.
 _INTERN_TABLE = {
@@ -448,7 +447,6 @@ _INTERN_TABLE = {
     types_pb2.DT_VARIANT_REF: variant_ref,
 }
 
-
 # Standard mappings between types_pb2.DataType values and string names.
 _TYPE_TO_STRING = {
     types_pb2.DT_HALF: "float16",
@@ -498,8 +496,10 @@ _TYPE_TO_STRING = {
     types_pb2.DT_RESOURCE_REF: "resource_ref",
     types_pb2.DT_VARIANT_REF: "variant_ref",
 }
-_STRING_TO_TF = {value: _INTERN_TABLE[key]
-                 for key, value in _TYPE_TO_STRING.items()}
+_STRING_TO_TF = {
+    value: _INTERN_TABLE[key]
+    for key, value in _TYPE_TO_STRING.items()
+}
 # Add non-canonical aliases.
 _STRING_TO_TF["half"] = float16
 _STRING_TO_TF["half_ref"] = float16_ref
@@ -508,7 +508,6 @@ _STRING_TO_TF["float_ref"] = float32_ref
 _STRING_TO_TF["double"] = float64
 _STRING_TO_TF["double_ref"] = float64_ref
 
-
 # Numpy representation for quantized dtypes.
 #
 # These are magic strings that are used in the swig wrapper to identify
@@ -551,58 +550,100 @@ _NP_TO_TF = frozenset([
     (_np_bfloat16, bfloat16),
 ])
 _TF_TO_NP = {
-    types_pb2.DT_HALF: np.float16,
-    types_pb2.DT_FLOAT: np.float32,
-    types_pb2.DT_DOUBLE: np.float64,
-    types_pb2.DT_INT32: np.int32,
-    types_pb2.DT_UINT8: np.uint8,
-    types_pb2.DT_UINT16: np.uint16,
-    types_pb2.DT_UINT32: np.uint32,
-    types_pb2.DT_UINT64: np.uint64,
-    types_pb2.DT_INT16: np.int16,
-    types_pb2.DT_INT8: np.int8,
+    types_pb2.DT_HALF:
+        np.float16,
+    types_pb2.DT_FLOAT:
+        np.float32,
+    types_pb2.DT_DOUBLE:
+        np.float64,
+    types_pb2.DT_INT32:
+        np.int32,
+    types_pb2.DT_UINT8:
+        np.uint8,
+    types_pb2.DT_UINT16:
+        np.uint16,
+    types_pb2.DT_UINT32:
+        np.uint32,
+    types_pb2.DT_UINT64:
+        np.uint64,
+    types_pb2.DT_INT16:
+        np.int16,
+    types_pb2.DT_INT8:
+        np.int8,
     # NOTE(touts): For strings we use np.object as it supports variable length
     # strings.
-    types_pb2.DT_STRING: np.object,
-    types_pb2.DT_COMPLEX64: np.complex64,
-    types_pb2.DT_COMPLEX128: np.complex128,
-    types_pb2.DT_INT64: np.int64,
-    types_pb2.DT_BOOL: np.bool,
-    types_pb2.DT_QINT8: _np_qint8,
-    types_pb2.DT_QUINT8: _np_quint8,
-    types_pb2.DT_QINT16: _np_qint16,
-    types_pb2.DT_QUINT16: _np_quint16,
-    types_pb2.DT_QINT32: _np_qint32,
-    types_pb2.DT_BFLOAT16: _np_bfloat16,
+    types_pb2.DT_STRING:
+        np.object,
+    types_pb2.DT_COMPLEX64:
+        np.complex64,
+    types_pb2.DT_COMPLEX128:
+        np.complex128,
+    types_pb2.DT_INT64:
+        np.int64,
+    types_pb2.DT_BOOL:
+        np.bool,
+    types_pb2.DT_QINT8:
+        _np_qint8,
+    types_pb2.DT_QUINT8:
+        _np_quint8,
+    types_pb2.DT_QINT16:
+        _np_qint16,
+    types_pb2.DT_QUINT16:
+        _np_quint16,
+    types_pb2.DT_QINT32:
+        _np_qint32,
+    types_pb2.DT_BFLOAT16:
+        _np_bfloat16,
 
     # Ref types
-    types_pb2.DT_HALF_REF: np.float16,
-    types_pb2.DT_FLOAT_REF: np.float32,
-    types_pb2.DT_DOUBLE_REF: np.float64,
-    types_pb2.DT_INT32_REF: np.int32,
-    types_pb2.DT_UINT32_REF: np.uint32,
-    types_pb2.DT_UINT8_REF: np.uint8,
-    types_pb2.DT_UINT16_REF: np.uint16,
-    types_pb2.DT_INT16_REF: np.int16,
-    types_pb2.DT_INT8_REF: np.int8,
-    types_pb2.DT_STRING_REF: np.object,
-    types_pb2.DT_COMPLEX64_REF: np.complex64,
-    types_pb2.DT_COMPLEX128_REF: np.complex128,
-    types_pb2.DT_INT64_REF: np.int64,
-    types_pb2.DT_UINT64_REF: np.uint64,
-    types_pb2.DT_BOOL_REF: np.bool,
-    types_pb2.DT_QINT8_REF: _np_qint8,
-    types_pb2.DT_QUINT8_REF: _np_quint8,
-    types_pb2.DT_QINT16_REF: _np_qint16,
-    types_pb2.DT_QUINT16_REF: _np_quint16,
-    types_pb2.DT_QINT32_REF: _np_qint32,
-    types_pb2.DT_BFLOAT16_REF: _np_bfloat16,
+    types_pb2.DT_HALF_REF:
+        np.float16,
+    types_pb2.DT_FLOAT_REF:
+        np.float32,
+    types_pb2.DT_DOUBLE_REF:
+        np.float64,
+    types_pb2.DT_INT32_REF:
+        np.int32,
+    types_pb2.DT_UINT32_REF:
+        np.uint32,
+    types_pb2.DT_UINT8_REF:
+        np.uint8,
+    types_pb2.DT_UINT16_REF:
+        np.uint16,
+    types_pb2.DT_INT16_REF:
+        np.int16,
+    types_pb2.DT_INT8_REF:
+        np.int8,
+    types_pb2.DT_STRING_REF:
+        np.object,
+    types_pb2.DT_COMPLEX64_REF:
+        np.complex64,
+    types_pb2.DT_COMPLEX128_REF:
+        np.complex128,
+    types_pb2.DT_INT64_REF:
+        np.int64,
+    types_pb2.DT_UINT64_REF:
+        np.uint64,
+    types_pb2.DT_BOOL_REF:
+        np.bool,
+    types_pb2.DT_QINT8_REF:
+        _np_qint8,
+    types_pb2.DT_QUINT8_REF:
+        _np_quint8,
+    types_pb2.DT_QINT16_REF:
+        _np_qint16,
+    types_pb2.DT_QUINT16_REF:
+        _np_quint16,
+    types_pb2.DT_QINT32_REF:
+        _np_qint32,
+    types_pb2.DT_BFLOAT16_REF:
+        _np_bfloat16,
 }
 
-
-QUANTIZED_DTYPES = frozenset(
-    [qint8, quint8, qint16, quint16, qint32, qint8_ref, quint8_ref, qint16_ref,
-     quint16_ref, qint32_ref])
+QUANTIZED_DTYPES = frozenset([
+    qint8, quint8, qint16, quint16, qint32, qint8_ref, quint8_ref, qint16_ref,
+    quint16_ref, qint32_ref
+])
 tf_export("QUANTIZED_DTYPES").export_constant(__name__, "QUANTIZED_DTYPES")
 
 
@@ -613,7 +654,8 @@ def as_dtype(type_value):
   Args:
     type_value: A value that can be converted to a `tf.DType`
       object. This may currently be a `tf.DType` object, a
-      [`DataType` enum](https://www.tensorflow.org/code/tensorflow/core/framework/types.proto),
+      [`DataType`
+        enum](https://www.tensorflow.org/code/tensorflow/core/framework/types.proto),
       a string type name, or a `numpy.dtype`.
 
   Returns:
@@ -650,5 +692,4 @@ def as_dtype(type_value):
     except TypeError as e:
       raise TypeError("Cannot convert {} to a dtype. {}".format(type_value, e))
 
-  raise TypeError(
-      "Cannot convert value %r to a TensorFlow DType." % type_value)
+  raise TypeError("Cannot convert value %r to a TensorFlow DType." % type_value)
diff --git a/tensorflow/python/framework/function_test.py b/tensorflow/python/framework/function_test.py
index 57e5a724c99bd77df8cd11eff99288fa6647f4ac..a4ca3f9a89bd4cce2240d90895c43dda1acb849b 100644
--- a/tensorflow/python/framework/function_test.py
+++ b/tensorflow/python/framework/function_test.py
@@ -26,6 +26,7 @@ import numpy as np
 
 from tensorflow.core.framework import function_pb2
 from tensorflow.core.protobuf import config_pb2
+from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -451,13 +452,17 @@ class FunctionTest(test.TestCase):
                                          lambda y: AssertFail(y), [x])
       # pylint: enable=unnecessary-lambda
 
+    rewriter_config = rewriter_config_pb2.RewriterConfig(
+        dependency_optimization=rewriter_config_pb2.RewriterConfig.OFF)
     # Enables inlining.
-    config = config_pb2.ConfigProto(graph_options=config_pb2.GraphOptions(
-        optimizer_options=config_pb2.OptimizerOptions(
-            opt_level=config_pb2.OptimizerOptions.L0,
-            do_common_subexpression_elimination=True,
-            do_function_inlining=True,
-            do_constant_folding=True)))
+    config = config_pb2.ConfigProto(
+        graph_options=config_pb2.GraphOptions(
+            optimizer_options=config_pb2.OptimizerOptions(
+                opt_level=config_pb2.OptimizerOptions.L0,
+                do_common_subexpression_elimination=True,
+                do_function_inlining=True,
+                do_constant_folding=True),
+            rewrite_options=rewriter_config))
 
     with session.Session(config=config) as sess:
       # Since the 'False' branch is not taken, the assertion should not fire.
diff --git a/tensorflow/python/framework/importer.py b/tensorflow/python/framework/importer.py
index 00fff8d040d6facfc81359061f6cf9a1cf6d3d3c..c26644362c86798012b5812f3554662a6d1d680b 100644
--- a/tensorflow/python/framework/importer.py
+++ b/tensorflow/python/framework/importer.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """A utility function for importing TensorFlow graphs."""
 from __future__ import absolute_import
 from __future__ import division
@@ -43,8 +42,8 @@ from tensorflow.python.util.tf_export import tf_export
 # the logic here.
 def _GetNodeAttr(node_def, attr_name):
   if attr_name not in node_def.attr:
-    raise ValueError('Expected one attr with name %r in %s.'
-                     % (attr_name, str(node_def)))
+    raise ValueError('Expected one attr with name %r in %s.' % (attr_name,
+                                                                str(node_def)))
   return node_def.attr[attr_name]
 
 
@@ -170,9 +169,8 @@ def _ProcessInputMapParam(input_map):
   if input_map is None:
     input_map = {}
   else:
-    if not (isinstance(input_map, dict)
-            and all(isinstance(k, compat.bytes_or_text_types)
-                    for k in input_map.keys())):
+    if not (isinstance(input_map, dict) and all(
+        isinstance(k, compat.bytes_or_text_types) for k in input_map.keys())):
       raise TypeError('input_map must be a dictionary mapping strings to '
                       'Tensor objects.')
   return input_map
@@ -180,9 +178,10 @@ def _ProcessInputMapParam(input_map):
 
 def _ProcessReturnElementsParam(return_elements):
   """Type-checks and possibly canonicalizes `return_elements`."""
-  if return_elements is None: return None
-  if not all(isinstance(x, compat.bytes_or_text_types)
-             for x in return_elements):
+  if return_elements is None:
+    return None
+  if not all(
+      isinstance(x, compat.bytes_or_text_types) for x in return_elements):
     raise TypeError('return_elements must be a list of strings.')
   return tuple(compat.as_str(x) for x in return_elements)
 
@@ -262,14 +261,14 @@ def _PopulateTFImportGraphDefOptions(options, prefix, input_map,
     if input_src.startswith('^'):
       src_name = compat.as_bytes(input_src[1:])
       dst_op = input_dst._as_tf_output().oper  # pylint: disable=protected-access
-      c_api.TF_ImportGraphDefOptionsRemapControlDependency(options, src_name,
-                                                           dst_op)
+      c_api.TF_ImportGraphDefOptionsRemapControlDependency(
+          options, src_name, dst_op)
     else:
       src_name, src_idx = _ParseTensorName(input_src)
       src_name = compat.as_str(src_name)
       dst_output = input_dst._as_tf_output()  # pylint: disable=protected-access
-      c_api.TF_ImportGraphDefOptionsAddInputMapping(options, src_name,
-                                                    src_idx, dst_output)
+      c_api.TF_ImportGraphDefOptionsAddInputMapping(options, src_name, src_idx,
+                                                    dst_output)
   for name in return_elements or []:
     if ':' in name:
       op_name, index = _ParseTensorName(name)
@@ -315,8 +314,8 @@ def _ProcessNewOps(graph):
         coloc_op = graph._get_operation_by_name_unsafe(coloc_op_name)  # pylint: disable=protected-access
       except KeyError:
         raise ValueError('Specified colocation to an op that '
-                         'does not exist during import: %s in %s' % (
-                             coloc_op_name, op.name))
+                         'does not exist during import: %s in %s' %
+                         (coloc_op_name, op.name))
       if coloc_op.device:
         coloc_device = pydev.DeviceSpec.from_string(coloc_op.device)
         break
@@ -373,10 +372,13 @@ def _GatherReturnElements(requested_return_elements, graph, results):
 @tf_export('import_graph_def')
 @deprecated_args(None, 'Please file an issue at '
                  'https://github.com/tensorflow/tensorflow/issues if you depend'
-                 ' on this feature.',
-                 'op_dict')
-def import_graph_def(graph_def, input_map=None, return_elements=None,
-                     name=None, op_dict=None, producer_op_list=None):
+                 ' on this feature.', 'op_dict')
+def import_graph_def(graph_def,
+                     input_map=None,
+                     return_elements=None,
+                     name=None,
+                     op_dict=None,
+                     producer_op_list=None):
   """Imports the graph from `graph_def` into the current default `Graph`.
 
   This function provides a way to import a serialized TensorFlow
@@ -480,11 +482,12 @@ def import_graph_def(graph_def, input_map=None, return_elements=None,
         c_api.TF_ImportGraphDefResultsMissingUnusedInputMappings_wrapper(
             results))
     if missing_unused_input_keys:
-      missing_unused_input_keys = [compat.as_str(s)
-                                   for s in missing_unused_input_keys]
+      missing_unused_input_keys = [
+          compat.as_str(s) for s in missing_unused_input_keys
+      ]
       raise ValueError(
-          'Attempted to map inputs that were not found in graph_def: [%s]'
-          % ', '.join(missing_unused_input_keys))
+          'Attempted to map inputs that were not found in graph_def: [%s]' %
+          ', '.join(missing_unused_input_keys))
 
     if return_elements is None:
       return None
diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py
index a1c2e07e94fb8d35761b3938d82fc8c8664437c1..d5786cac68dc31210f45f0af9ff6c347d93c026f 100644
--- a/tensorflow/python/framework/ops.py
+++ b/tensorflow/python/framework/ops.py
@@ -481,13 +481,17 @@ class Tensor(_TensorLike):
           dim_list.append(-1)
         else:
           dim_list.append(dim.value)
-    with errors.raise_exception_on_not_ok_status() as status:
-      c_api.TF_GraphSetTensorShape_wrapper(
-          self._op._graph._c_graph,  # pylint: disable=protected-access
-          self._as_tf_output(),
-          dim_list,
-          unknown_shape,
-          status)
+    try:
+      with errors.raise_exception_on_not_ok_status() as status:
+        c_api.TF_GraphSetTensorShape_wrapper(
+            self._op._graph._c_graph,  # pylint: disable=protected-access
+            self._as_tf_output(),
+            dim_list,
+            unknown_shape,
+            status)
+    except errors.InvalidArgumentError as e:
+      # Convert to ValueError for backwards compatibility.
+      raise ValueError(str(e))
 
   @property
   def value_index(self):
@@ -2099,6 +2103,10 @@ class Operation(object):
     logging.warning("Operation._control_inputs is private, use "
                     "Operation.control_inputs instead. "
                     "Operation._control_inputs will eventually be removed.")
+    # Copy value because it may be self._control_inputs_val (in particular if
+    # this is called from self._control_inputs += ...), and we don't want to
+    # clear value below.
+    value = copy.copy(value)
     self._remove_all_control_inputs()
     self._add_control_inputs(value)
 
@@ -5000,9 +5008,22 @@ def init_scope():
   """
   # pylint: enable=g-doc-return-or-yield,line-too-long
 
+  in_graph_mode = context.in_graph_mode()
+  # Retrieve the active name scope: entering an `init_scope` preserves
+  # the name scope of the current context.
+  if in_graph_mode:
+    default_graph = get_default_graph()
+    scope = default_graph.get_name_scope()
+  else:
+    scope = context.context().scope_name
+  if scope and scope[-1] != '/':
+    # Names that end with trailing slashes are treated by `name_scope` as
+    # absolute.
+    scope = scope + '/'
+
   outer_context = None
-  if context.in_graph_mode() and not _default_graph_stack.stack:
-    outer_context = get_default_graph().as_default
+  if in_graph_mode and not _default_graph_stack.stack:
+    outer_context = default_graph.as_default
   else:
     for stack_entry in reversed(context.context_stack.stack):
       if not stack_entry.is_building_function:
@@ -5014,7 +5035,8 @@ def init_scope():
                          "eager context was previously active.")
 
   try:
-    with outer_context(), control_dependencies(None), tape.stop_recording():
+    with outer_context(), name_scope(scope), control_dependencies(
+        None), tape.stop_recording():
       yield
   finally:
     pass
diff --git a/tensorflow/python/framework/ops_test.py b/tensorflow/python/framework/ops_test.py
index 78519f108ba69a8f3f296debf2e199d6613bf86a..c5e177d521013417d28fb01a975208ba5e4cf90b 100644
--- a/tensorflow/python/framework/ops_test.py
+++ b/tensorflow/python/framework/ops_test.py
@@ -2072,10 +2072,34 @@ class InitScopeTest(test_util.TensorFlowTestCase):
       # pylint: disable=protected-access
       self.assertEqual(len(ops._default_graph_stack.stack), 0)
       with ops.init_scope():
-        self.assertEqual(len(ops._default_graph_stack.stack), 1)
+        self.assertGreater(len(ops._default_graph_stack.stack), 0)
       self.assertEqual(len(ops._default_graph_stack.stack), 0)
       # pylint: enable=protected-access
 
+  def testPreservesNameScopeInGraphConstruction(self):
+    with ops.Graph().as_default():
+      function_graph = ops.Graph()
+      with function_graph.as_default():
+        with ops.name_scope("inner"), ops.init_scope():
+          self.assertEqual(ops.get_name_scope(), "inner")
+      self.assertEqual(ops.get_name_scope(), "")
+
+  def testPreservesNameScopeInEagerExecution(self):
+    with context.eager_mode():
+      def foo():
+        with ops.name_scope("inner"), ops.init_scope():
+          if context.in_graph_mode():
+            self.assertEqual(ops.get_name_scope(), "inner")
+          else:
+            # A trailing slash is always appended when eager execution is
+            # enabled.
+            self.assertEqual(context.context().scope_name, "inner/")
+      foo()
+      self.assertEqual(ops.get_name_scope(), "")
+      foo_compiled = eager_function.defun(foo)
+      foo_compiled()
+      self.assertEqual(ops.get_name_scope(), "")
+
 
 @test_util.with_c_api
 class GraphTest(test_util.TensorFlowTestCase):
diff --git a/tensorflow/python/framework/tensor_util.py b/tensorflow/python/framework/tensor_util.py
index d2b8e80305724fd12341bc089d8e0a63c40b6688..0e5f696111ae7f74b41f8af21a5190fc2617e51a 100644
--- a/tensorflow/python/framework/tensor_util.py
+++ b/tensorflow/python/framework/tensor_util.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Utilities to create TensorProtos."""
 from __future__ import absolute_import
 from __future__ import division
@@ -39,6 +38,7 @@ except ImportError:
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.util.tf_export import tf_export
+
 # pylint: enable=g-import-not-at-top
 
 
@@ -47,8 +47,8 @@ def ExtractBitsFromFloat16(x):
 
 
 def SlowAppendFloat16ArrayToTensorProto(tensor_proto, proto_values):
-  tensor_proto.half_val.extend([
-      ExtractBitsFromFloat16(x) for x in proto_values])
+  tensor_proto.half_val.extend(
+      [ExtractBitsFromFloat16(x) for x in proto_values])
 
 
 def ExtractBitsFromBFloat16(x):
@@ -57,31 +57,47 @@ def ExtractBitsFromBFloat16(x):
 
 
 def SlowAppendBFloat16ArrayToTensorProto(tensor_proto, proto_values):
-  tensor_proto.half_val.extend([
-      ExtractBitsFromBFloat16(x) for x in proto_values])
+  tensor_proto.half_val.extend(
+      [ExtractBitsFromBFloat16(x) for x in proto_values])
 
 
 if _FAST_TENSOR_UTIL_AVAILABLE:
   _NP_TO_APPEND_FN = {
-      dtypes.bfloat16.as_numpy_dtype: SlowAppendBFloat16ArrayToTensorProto,
+      dtypes.bfloat16.as_numpy_dtype:
+          SlowAppendBFloat16ArrayToTensorProto,
       # TODO(sesse): We should have a
       # fast_tensor_util.AppendFloat16ArrayToTensorProto,
       # but it seems np.float16_t doesn't exist?
-      np.float16: SlowAppendFloat16ArrayToTensorProto,
-      np.float32: fast_tensor_util.AppendFloat32ArrayToTensorProto,
-      np.float64: fast_tensor_util.AppendFloat64ArrayToTensorProto,
-      np.int32: fast_tensor_util.AppendInt32ArrayToTensorProto,
-      np.int64: fast_tensor_util.AppendInt64ArrayToTensorProto,
-      np.uint8: fast_tensor_util.AppendUInt8ArrayToTensorProto,
-      np.uint16: fast_tensor_util.AppendUInt16ArrayToTensorProto,
-      np.uint32: fast_tensor_util.AppendUInt32ArrayToTensorProto,
-      np.uint64: fast_tensor_util.AppendUInt64ArrayToTensorProto,
-      np.int8: fast_tensor_util.AppendInt8ArrayToTensorProto,
-      np.int16: fast_tensor_util.AppendInt16ArrayToTensorProto,
-      np.complex64: fast_tensor_util.AppendComplex64ArrayToTensorProto,
-      np.complex128: fast_tensor_util.AppendComplex128ArrayToTensorProto,
-      np.object: fast_tensor_util.AppendObjectArrayToTensorProto,
-      np.bool: fast_tensor_util.AppendBoolArrayToTensorProto,
+      np.float16:
+          SlowAppendFloat16ArrayToTensorProto,
+      np.float32:
+          fast_tensor_util.AppendFloat32ArrayToTensorProto,
+      np.float64:
+          fast_tensor_util.AppendFloat64ArrayToTensorProto,
+      np.int32:
+          fast_tensor_util.AppendInt32ArrayToTensorProto,
+      np.int64:
+          fast_tensor_util.AppendInt64ArrayToTensorProto,
+      np.uint8:
+          fast_tensor_util.AppendUInt8ArrayToTensorProto,
+      np.uint16:
+          fast_tensor_util.AppendUInt16ArrayToTensorProto,
+      np.uint32:
+          fast_tensor_util.AppendUInt32ArrayToTensorProto,
+      np.uint64:
+          fast_tensor_util.AppendUInt64ArrayToTensorProto,
+      np.int8:
+          fast_tensor_util.AppendInt8ArrayToTensorProto,
+      np.int16:
+          fast_tensor_util.AppendInt16ArrayToTensorProto,
+      np.complex64:
+          fast_tensor_util.AppendComplex64ArrayToTensorProto,
+      np.complex128:
+          fast_tensor_util.AppendComplex128ArrayToTensorProto,
+      np.object:
+          fast_tensor_util.AppendObjectArrayToTensorProto,
+      np.bool:
+          fast_tensor_util.AppendBoolArrayToTensorProto,
       dtypes.qint8.as_numpy_dtype:
           fast_tensor_util.AppendInt8ArrayToTensorProto,
       dtypes.quint8.as_numpy_dtype:
@@ -118,14 +134,12 @@ else:
     tensor_proto.uint64_val.extend([np.asscalar(x) for x in proto_values])
 
   def SlowAppendComplex64ArrayToTensorProto(tensor_proto, proto_values):
-    tensor_proto.scomplex_val.extend([np.asscalar(v)
-                                      for x in proto_values
-                                      for v in [x.real, x.imag]])
+    tensor_proto.scomplex_val.extend(
+        [np.asscalar(v) for x in proto_values for v in [x.real, x.imag]])
 
   def SlowAppendComplex128ArrayToTensorProto(tensor_proto, proto_values):
-    tensor_proto.dcomplex_val.extend([np.asscalar(v)
-                                      for x in proto_values
-                                      for v in [x.real, x.imag]])
+    tensor_proto.dcomplex_val.extend(
+        [np.asscalar(v) for x in proto_values for v in [x.real, x.imag]])
 
   def SlowAppendObjectArrayToTensorProto(tensor_proto, proto_values):
     tensor_proto.string_val.extend([compat.as_bytes(x) for x in proto_values])
@@ -252,15 +266,16 @@ def _FilterTuple(v):
       return None
   if isinstance(v, list):
     if not any(isinstance(x, (list, tuple)) for x in v):
-      return _FirstNotNone([None if isinstance(x, (list, tuple)) else x for x in v])
+      return _FirstNotNone(
+          [None if isinstance(x, (list, tuple)) else x for x in v])
   return _FirstNotNone([_FilterTuple(x) for x in v])
 
 
 def _FilterInt(v):
   if isinstance(v, (list, tuple)):
     return _FirstNotNone([_FilterInt(x) for x in v])
-  return None if isinstance(v, (compat.integral_types,
-                                tensor_shape.Dimension)) else _NotNone(v)
+  return None if isinstance(
+      v, (compat.integral_types, tensor_shape.Dimension)) else _NotNone(v)
 
 
 def _FilterFloat(v):
@@ -380,8 +395,11 @@ def make_tensor_proto(values, dtype=None, shape=None, verify_shape=False):
   if dtype:
     dtype = dtypes.as_dtype(dtype)
 
-  is_quantized = (dtype in [dtypes.qint8, dtypes.quint8, dtypes.qint16,
-                            dtypes.quint16, dtypes.qint32])
+  is_quantized = (
+      dtype in [
+          dtypes.qint8, dtypes.quint8, dtypes.qint16, dtypes.quint16,
+          dtypes.qint32
+      ])
 
   # We first convert value to a numpy array or scalar.
   if isinstance(values, (np.ndarray, np.generic)):
@@ -419,9 +437,9 @@ def make_tensor_proto(values, dtype=None, shape=None, verify_shape=False):
       if (list(nparray.shape) != _GetDenseDimensions(values) and
           not is_quantized):
         raise ValueError("""Argument must be a dense tensor: %s"""
-                         """ - got shape %s, but wanted %s.""" % (
-                             values, list(nparray.shape),
-                             _GetDenseDimensions(values)))
+                         """ - got shape %s, but wanted %s.""" %
+                         (values, list(nparray.shape),
+                          _GetDenseDimensions(values)))
 
     # python/numpy default float type is float64. We prefer float32 instead.
     if (nparray.dtype == np.float64) and dtype is None:
@@ -446,8 +464,8 @@ def make_tensor_proto(values, dtype=None, shape=None, verify_shape=False):
 
   if dtype is not None and (not hasattr(dtype, "base_dtype") or
                             dtype.base_dtype != numpy_dtype.base_dtype):
-    raise TypeError("Incompatible types: %s vs. %s. Value is %s"
-                    % (dtype, nparray.dtype, values))
+    raise TypeError("Incompatible types: %s vs. %s. Value is %s" %
+                    (dtype, nparray.dtype, values))
 
   # If shape is not given, get the shape from the numpy array.
   if shape is None:
@@ -510,8 +528,8 @@ def make_tensor_proto(values, dtype=None, shape=None, verify_shape=False):
 
   append_fn = GetNumpyAppendFn(proto_values.dtype)
   if append_fn is None:
-    raise TypeError("Element type not supported in TensorProto: %s" %
-                    numpy_dtype.name)
+    raise TypeError(
+        "Element type not supported in TensorProto: %s" % numpy_dtype.name)
   append_fn(tensor_proto, proto_values)
 
   return tensor_proto
@@ -553,19 +571,23 @@ def MakeNdarray(tensor):
       return tmp.reshape(shape)
   elif tensor_dtype == dtypes.float32:
     if len(tensor.float_val) == 1:
-      return np.repeat(np.array(tensor.float_val[0], dtype=dtype),
-                       num_elements).reshape(shape)
+      return np.repeat(
+          np.array(tensor.float_val[0], dtype=dtype),
+          num_elements).reshape(shape)
     else:
       return np.fromiter(tensor.float_val, dtype=dtype).reshape(shape)
   elif tensor_dtype == dtypes.float64:
     if len(tensor.double_val) == 1:
-      return np.repeat(np.array(tensor.double_val[0], dtype=dtype),
-                       num_elements).reshape(shape)
+      return np.repeat(
+          np.array(tensor.double_val[0], dtype=dtype),
+          num_elements).reshape(shape)
     else:
       return np.fromiter(tensor.double_val, dtype=dtype).reshape(shape)
-  elif tensor_dtype in [dtypes.int32, dtypes.uint8, dtypes.uint16, dtypes.int16,
-                        dtypes.int8, dtypes.qint32, dtypes.quint8, dtypes.qint8,
-                        dtypes.qint16, dtypes.quint16, dtypes.bfloat16]:
+  elif tensor_dtype in [
+      dtypes.int32, dtypes.uint8, dtypes.uint16, dtypes.int16, dtypes.int8,
+      dtypes.qint32, dtypes.quint8, dtypes.qint8, dtypes.qint16, dtypes.quint16,
+      dtypes.bfloat16
+  ]:
     if len(tensor.int_val) == 1:
       return np.repeat(np.array(tensor.int_val[0], dtype=dtype),
                        num_elements).reshape(shape)
@@ -573,35 +595,41 @@ def MakeNdarray(tensor):
       return np.fromiter(tensor.int_val, dtype=dtype).reshape(shape)
   elif tensor_dtype == dtypes.int64:
     if len(tensor.int64_val) == 1:
-      return np.repeat(np.array(tensor.int64_val[0], dtype=dtype),
-                       num_elements).reshape(shape)
+      return np.repeat(
+          np.array(tensor.int64_val[0], dtype=dtype),
+          num_elements).reshape(shape)
     else:
       return np.fromiter(tensor.int64_val, dtype=dtype).reshape(shape)
   elif tensor_dtype == dtypes.string:
     if len(tensor.string_val) == 1:
-      return np.repeat(np.array(tensor.string_val[0], dtype=dtype),
-                       num_elements).reshape(shape)
+      return np.repeat(
+          np.array(tensor.string_val[0], dtype=dtype),
+          num_elements).reshape(shape)
     else:
-      return np.array([x for x in tensor.string_val],
-                      dtype=dtype).reshape(shape)
+      return np.array(
+          [x for x in tensor.string_val], dtype=dtype).reshape(shape)
   elif tensor_dtype == dtypes.complex64:
     it = iter(tensor.scomplex_val)
     if len(tensor.scomplex_val) == 2:
-      return np.repeat(np.array(complex(tensor.scomplex_val[0],
-                                        tensor.scomplex_val[1]), dtype=dtype),
-                       num_elements).reshape(shape)
+      return np.repeat(
+          np.array(
+              complex(tensor.scomplex_val[0], tensor.scomplex_val[1]),
+              dtype=dtype), num_elements).reshape(shape)
     else:
-      return np.array([complex(x[0], x[1]) for x in zip(it, it)],
-                      dtype=dtype).reshape(shape)
+      return np.array(
+          [complex(x[0], x[1]) for x in zip(it, it)],
+          dtype=dtype).reshape(shape)
   elif tensor_dtype == dtypes.complex128:
     it = iter(tensor.dcomplex_val)
     if len(tensor.dcomplex_val) == 2:
-      return np.repeat(np.array(complex(tensor.dcomplex_val[0],
-                                        tensor.dcomplex_val[1]), dtype=dtype),
-                       num_elements).reshape(shape)
+      return np.repeat(
+          np.array(
+              complex(tensor.dcomplex_val[0], tensor.dcomplex_val[1]),
+              dtype=dtype), num_elements).reshape(shape)
     else:
-      return np.array([complex(x[0], x[1]) for x in zip(it, it)],
-                      dtype=dtype).reshape(shape)
+      return np.array(
+          [complex(x[0], x[1]) for x in zip(it, it)],
+          dtype=dtype).reshape(shape)
   elif tensor_dtype == dtypes.bool:
     if len(tensor.bool_val) == 1:
       return np.repeat(np.array(tensor.bool_val[0], dtype=dtype),
@@ -645,8 +673,9 @@ def _ConstantValue(tensor, partial):
   elif tensor.op.type == "Shape":
     input_shape = tensor.op.inputs[0].get_shape()
     if input_shape.is_fully_defined():
-      return np.array([dim.value for dim in input_shape.dims],
-                      dtype=tensor.dtype.as_numpy_dtype)
+      return np.array(
+          [dim.value for dim in input_shape.dims],
+          dtype=tensor.dtype.as_numpy_dtype)
     else:
       return None
   elif tensor.op.type == "Size":
@@ -658,8 +687,10 @@ def _ConstantValue(tensor, partial):
   elif tensor.op.type == "Rank":
     input_shape = tensor.op.inputs[0].get_shape()
     if input_shape.ndims is not None:
-      return np.ndarray(shape=(), buffer=np.array([input_shape.ndims], dtype=np.int32),
-                        dtype=np.int32)
+      return np.ndarray(
+          shape=(),
+          buffer=np.array([input_shape.ndims], dtype=np.int32),
+          dtype=np.int32)
     else:
       return None
   elif tensor.op.type == "Range":
@@ -861,8 +892,8 @@ def constant_value_as_shape(tensor):  # pylint: disable=invalid-name
         new_axis_mask = tensor.op.get_attr("new_axis_mask")
         shrink_axis_mask = tensor.op.get_attr("shrink_axis_mask")
         valid_attributes = (not ellipsis_mask and not new_axis_mask and
-                            not shrink_axis_mask and
-                            (not begin_mask or (begin_mask == 1)) and
+                            not shrink_axis_mask and (not begin_mask or
+                                                      (begin_mask == 1)) and
                             (not end_mask or (end_mask == 1)))
         if valid_attributes:  # additional inputs not supported
           prev = constant_value_as_shape(tensor.op.inputs[0])
@@ -878,8 +909,8 @@ def constant_value_as_shape(tensor):  # pylint: disable=invalid-name
   ret = tensor_shape.unknown_shape(shape[0].value)
   value = constant_value(tensor)
   if value is not None:
-    ret = ret.merge_with(tensor_shape.TensorShape(
-        [d if d >= 0 else None for d in value]))
+    ret = ret.merge_with(
+        tensor_shape.TensorShape([d if d >= 0 else None for d in value]))
   return ret
 
 
diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py
index 0133318456219b35be11bc5ef128406292bc2feb..70f6a2acbafa43c57f1bab9ae36c2816ab4c78b5 100644
--- a/tensorflow/python/framework/test_util.py
+++ b/tensorflow/python/framework/test_util.py
@@ -53,6 +53,7 @@ from tensorflow.python.eager import tape
 from tensorflow.python.framework import device as pydev
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
+from tensorflow.python.framework import importer
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
 from tensorflow.python.framework import versions
@@ -122,11 +123,11 @@ def assert_equal_graph_def(actual, expected, checkpoint_v2=False):
     TypeError: If either argument is not a `GraphDef`.
   """
   if not isinstance(actual, graph_pb2.GraphDef):
-    raise TypeError("Expected tf.GraphDef for actual, got %s" %
-                    type(actual).__name__)
+    raise TypeError(
+        "Expected tf.GraphDef for actual, got %s" % type(actual).__name__)
   if not isinstance(expected, graph_pb2.GraphDef):
-    raise TypeError("Expected tf.GraphDef for expected, got %s" %
-                    type(expected).__name__)
+    raise TypeError(
+        "Expected tf.GraphDef for expected, got %s" % type(expected).__name__)
 
   if checkpoint_v2:
     _strip_checkpoint_v2_randomized(actual)
@@ -151,11 +152,10 @@ def assert_meta_graph_protos_equal(tester, a, b):
       a_proto = proto_type()
       b_proto = proto_type()
       # Number of entries in the collections is the same
-      tester.assertEqual(len(a_value.bytes_list.value),
-                         len(b_value.bytes_list.value))
-      for (a_value_item, b_value_item) in zip(
-          a_value.bytes_list.value,
-          b_value.bytes_list.value):
+      tester.assertEqual(
+          len(a_value.bytes_list.value), len(b_value.bytes_list.value))
+      for (a_value_item, b_value_item) in zip(a_value.bytes_list.value,
+                                              b_value.bytes_list.value):
         a_proto.ParseFromString(a_value_item)
         b_proto.ParseFromString(b_value_item)
         tester.assertProtoEquals(a_proto, b_proto)
@@ -219,10 +219,7 @@ def NHWCToNCHW(input_tensor):
     converted tensor or shape array
   """
   # tensor dim -> new axis order
-  new_axes = {
-      4: [0, 3, 1, 2],
-      5: [0, 4, 1, 2, 3]
-  }
+  new_axes = {4: [0, 3, 1, 2], 5: [0, 4, 1, 2, 3]}
   if isinstance(input_tensor, ops.Tensor):
     ndims = input_tensor.shape.ndims
     return array_ops.transpose(input_tensor, new_axes[ndims])
@@ -249,8 +246,9 @@ def NHWCToNCHW_VECT_C(input_shape_or_tensor):
   """
   permutations = {5: [0, 3, 1, 2, 4], 6: [0, 4, 1, 2, 3, 5]}
   is_tensor = isinstance(input_shape_or_tensor, ops.Tensor)
-  temp_shape = (input_shape_or_tensor.shape.as_list()
-                if is_tensor else input_shape_or_tensor)
+  temp_shape = (
+      input_shape_or_tensor.shape.as_list()
+      if is_tensor else input_shape_or_tensor)
   if temp_shape[-1] % 4 != 0:
     raise ValueError(
         "Last dimension of input must be evenly divisible by 4 to convert to "
@@ -282,8 +280,9 @@ def NCHW_VECT_CToNHWC(input_shape_or_tensor):
   """
   permutations = {5: [0, 2, 3, 1, 4], 6: [0, 2, 3, 4, 1, 5]}
   is_tensor = isinstance(input_shape_or_tensor, ops.Tensor)
-  input_shape = (input_shape_or_tensor.shape.as_list()
-                 if is_tensor else input_shape_or_tensor)
+  input_shape = (
+      input_shape_or_tensor.shape.as_list()
+      if is_tensor else input_shape_or_tensor)
   if input_shape[-1] != 4:
     raise ValueError("Last dimension of NCHW_VECT_C must be 4.")
   permutation = permutations[len(input_shape)]
@@ -306,10 +305,7 @@ def NCHWToNHWC(input_tensor):
     converted tensor or shape array
   """
   # tensor dim -> new axis order
-  new_axes = {
-      4: [0, 2, 3, 1],
-      5: [0, 2, 3, 4, 1]
-  }
+  new_axes = {4: [0, 2, 3, 1], 5: [0, 2, 3, 4, 1]}
   if isinstance(input_tensor, ops.Tensor):
     ndims = input_tensor.shape.ndims
     return array_ops.transpose(input_tensor, new_axes[ndims])
@@ -324,10 +320,17 @@ def _use_c_api_wrapper(fn, use_c_api, *args, **kwargs):
   prev_value = ops._USE_C_API
   ops._USE_C_API = use_c_api
   try:
-    with ops.Graph().as_default():
-      fn(*args, **kwargs)
+    # Reset the default graph so it has the C API enabled. We call
+    # reset_default_graph() instead of creating a new default Graph context to
+    # make this robust to tests that call reset_default_graph(), which requires
+    # that the current default graph isn't nested.
+    ops.reset_default_graph()
+    fn(*args, **kwargs)
   finally:
     ops._USE_C_API = prev_value
+    # Make sure default graph reflects prev_value in case next test doesn't call
+    # reset_default_graph().
+    ops.reset_default_graph()
 # pylint: disable=protected-access
 
 
@@ -344,7 +347,9 @@ def skip_if(condition):
   Returns:
     The wrapped function
   """
+
   def real_skip_if(fn):
+
     def wrapper(*args, **kwargs):
       if callable(condition):
         skip = condition()
@@ -352,7 +357,9 @@ def skip_if(condition):
         skip = condition
       if not skip:
         fn(*args, **kwargs)
+
     return wrapper
+
   return real_skip_if
 
 
@@ -369,8 +376,10 @@ def disable_c_api(fn):
   Returns:
     The wrapped function
   """
+
   def wrapper(*args, **kwargs):
     _use_c_api_wrapper(fn, False, *args, **kwargs)
+
   return wrapper
 
 
@@ -387,8 +396,10 @@ def enable_c_api(fn):
   Returns:
     The wrapped function
   """
+
   def wrapper(*args, **kwargs):
     _use_c_api_wrapper(fn, True, *args, **kwargs)
+
   return wrapper
 
 
@@ -560,13 +571,17 @@ def assert_no_garbage_created(f):
     # not hold on to every object in other tests.
     gc.set_debug(previous_debug_flags)
     gc.enable()
+
   return decorator
 
 
-def run_in_graph_and_eager_modes(
-    __unused__=None, graph=None, config=None,
-    use_gpu=False, force_gpu=False,
-    reset_test=True, assert_no_eager_garbage=False):
+def run_in_graph_and_eager_modes(__unused__=None,
+                                 graph=None,
+                                 config=None,
+                                 use_gpu=False,
+                                 force_gpu=False,
+                                 reset_test=True,
+                                 assert_no_eager_garbage=False):
   """Runs the test in both graph and eager modes.
 
   Args:
@@ -595,6 +610,7 @@ def run_in_graph_and_eager_modes(
 
   def decorator(f):
     """Test method decorator."""
+
     def decorated(self, **kwargs):
       """Decorated the test method."""
       with context.graph_mode():
@@ -630,6 +646,7 @@ def run_in_graph_and_eager_modes(
           run_eager_mode(self, **kwargs)
 
     return decorated
+
   return decorator
 
 
@@ -766,8 +783,10 @@ class TensorFlowTestCase(googletest.TestCase):
       self._AssertProtoEquals(expected_message, message)
     elif isinstance(expected_message_maybe_ascii, str):
       expected_message = type(message)()
-      text_format.Merge(expected_message_maybe_ascii, expected_message,
-                        descriptor_pool=descriptor_pool.Default())
+      text_format.Merge(
+          expected_message_maybe_ascii,
+          expected_message,
+          descriptor_pool=descriptor_pool.Default())
       self._AssertProtoEquals(expected_message, message)
     else:
       assert False, ("Can't compare protos of type %s and %s" %
@@ -851,7 +870,8 @@ class TensorFlowTestCase(googletest.TestCase):
     trigger the creation of a new session.
 
     Use the `use_gpu` and `force_gpu` options to control where ops are run. If
-    `force_gpu` is True, all ops are pinned to `/device:GPU:0`. Otherwise, if `use_gpu`
+    `force_gpu` is True, all ops are pinned to `/device:GPU:0`. Otherwise, if
+    `use_gpu`
     is True, TensorFlow tries to run as many ops on the GPU as possible. If both
     `force_gpu and `use_gpu` are False, all ops are pinned to the CPU.
 
@@ -1050,6 +1070,7 @@ class TensorFlowTestCase(googletest.TestCase):
     self._threads.append(ret)
     return ret
 
+
 # pylint: enable=invalid-name
 
   def assertNear(self, f1, f2, err, msg=None):
@@ -1117,7 +1138,8 @@ class TensorFlowTestCase(googletest.TestCase):
       # the absolute difference between a and b.  Here, we want to
       # print out which elements violate such conditions.
       cond = np.logical_or(
-          np.abs(a - b) > atol + rtol * np.abs(b), np.isnan(a) != np.isnan(b))
+          np.abs(a - b) > atol + rtol * np.abs(b),
+          np.isnan(a) != np.isnan(b))
       if a.ndim:
         x = a[np.where(cond)]
         y = b[np.where(cond)]
@@ -1379,8 +1401,11 @@ class TensorFlowTestCase(googletest.TestCase):
 
 
 @tf_export("test.create_local_cluster")
-def create_local_cluster(num_workers, num_ps, protocol="grpc",
-                         worker_config=None, ps_config=None):
+def create_local_cluster(num_workers,
+                         num_ps,
+                         protocol="grpc",
+                         worker_config=None,
+                         ps_config=None):
   """Create and start local servers and return the associated `Server` objects.
 
   Example:
@@ -1430,15 +1455,21 @@ def create_local_cluster(num_workers, num_ps, protocol="grpc",
 
   workers = [
       server_lib.Server(
-          cs, job_name="worker", protocol=protocol, task_index=ix,
-          config=worker_config, start=True)
-      for ix in range(num_workers)
+          cs,
+          job_name="worker",
+          protocol=protocol,
+          task_index=ix,
+          config=worker_config,
+          start=True) for ix in range(num_workers)
   ]
   ps_servers = [
       server_lib.Server(
-          cs, job_name="ps", protocol=protocol, task_index=ix,
-          config=ps_config, start=True)
-      for ix in range(num_ps)
+          cs,
+          job_name="ps",
+          protocol=protocol,
+          task_index=ix,
+          config=ps_config,
+          start=True) for ix in range(num_ps)
   ]
 
   return workers, ps_servers
@@ -1460,3 +1491,14 @@ def get_node_def_from_graph(node_name, graph_def):
     if node_def.name == node_name:
       return node_def
   return None
+
+
+def set_producer_version(graph, producer_version):
+  """Sets graph.graph_def_versions.producer to `producer_version`."""
+  # The C API doesn't expose altering GraphDefVersions. We can indirectly set
+  # it via import_graph_def though.
+  graph_def = graph_pb2.GraphDef()
+  graph_def.versions.producer = producer_version
+  with graph.as_default():
+    importer.import_graph_def(graph_def)
+  assert graph.graph_def_versions.producer, producer_version
diff --git a/tensorflow/python/grappler/cost_analyzer_tool.py b/tensorflow/python/grappler/cost_analyzer_tool.py
index 146bb4311cb5a44d5739821db19f33a41e6e9ce2..61dc4e2afb833414f875d66bb12b0aa010f9d62e 100644
--- a/tensorflow/python/grappler/cost_analyzer_tool.py
+++ b/tensorflow/python/grappler/cost_analyzer_tool.py
@@ -23,18 +23,33 @@ import sys
 
 from google.protobuf import text_format
 
+from tensorflow.core.framework import graph_pb2
 from tensorflow.core.protobuf import meta_graph_pb2
 from tensorflow.core.protobuf import rewriter_config_pb2
+from tensorflow.python.framework import importer
+from tensorflow.python.framework import ops
 from tensorflow.python.grappler import cost_analyzer
 from tensorflow.python.grappler import tf_optimizer
 from tensorflow.python.platform import app
 from tensorflow.python.platform import gfile
+from tensorflow.python.training import saver
 
 
 def main(_):
-  with gfile.GFile(FLAGS.input) as input_file:
-    metagraph = meta_graph_pb2.MetaGraphDef()
-    metagraph.ParseFromString(input_file.read())
+  if FLAGS.metagraphdef:
+    with gfile.GFile(FLAGS.metagraphdef) as meta_file:
+      metagraph = meta_graph_pb2.MetaGraphDef()
+      metagraph.ParseFromString(meta_file.read())
+  else:
+    with gfile.GFile(FLAGS.graphdef) as graph_file:
+      graph_def = graph_pb2.GraphDef()
+      graph_def.ParseFromString(graph_file.read())
+      importer.import_graph_def(graph_def, name="")
+      graph = ops.get_default_graph()
+      fetch = graph.get_operation_by_name(FLAGS.fetch)
+      graph.add_to_collection("train_op", fetch)
+      metagraph = saver.export_meta_graph(
+          graph_def=graph.as_graph_def(), graph=graph)
 
   if FLAGS.rewriter_config is not None:
     rewriter_config = rewriter_config_pb2.RewriterConfig()
@@ -49,7 +64,25 @@ def main(_):
 if __name__ == "__main__":
   parser = argparse.ArgumentParser()
   parser.add_argument(
-      "--input", type=str, default=None, help="Input .meta file path.")
+      "--metagraphdef",
+      type=str,
+      default=None,
+      help="Input .meta MetaGraphDef file path.")
+  parser.add_argument(
+      "--graphdef",
+      type=str,
+      default=None,
+      help="Input .pb GraphDef file path.")
+  # Consider making flag fetch work together with flag metagraphdef. As some
+  # MetaGraphDef files don't have collection train_op.
+  parser.add_argument(
+      "--fetch",
+      type=str,
+      default=None,
+      help=
+      "The name of the fetch node. This flag is ignored if flag "
+      "metagraphdef is used."
+  )
   parser.add_argument(
       "--rewriter_config",
       type=str,
diff --git a/tensorflow/python/grappler/tf_optimizer.i b/tensorflow/python/grappler/tf_optimizer.i
index f0dd4483a635ddf39e7f51ad0008390c1feb2e13..1b657983a4690dd0ddb7f569ce514b08cb10400a 100644
--- a/tensorflow/python/grappler/tf_optimizer.i
+++ b/tensorflow/python/grappler/tf_optimizer.i
@@ -103,6 +103,11 @@ PyObject* TF_OptimizeGraph(
     std::unique_ptr<tensorflow::grappler::GrapplerItem> grappler_item =
         tensorflow::grappler::GrapplerItemFromMetaGraphDef(graph_id, metagraph, item_config);
 
+    if (!grappler_item) {
+      TF_SetStatus(out_status, TF_INVALID_ARGUMENT, "Failed to import metagraph, check error log for more info.");
+      return nullptr;
+    }
+
     tensorflow::DeviceBase* cpu_device = nullptr;
     tensorflow::GraphDef out_graph;
     tensorflow::grappler::MetaOptimizer optimizer(cpu_device, rewriter_config);
diff --git a/tensorflow/python/keras/BUILD b/tensorflow/python/keras/BUILD
index 1f20b3ae0eb1ddf981f12f9a12c4e8153711c7f9..fdac22bb53cc7e78d854d4b5ff756a190c9c62b6 100755
--- a/tensorflow/python/keras/BUILD
+++ b/tensorflow/python/keras/BUILD
@@ -14,10 +14,12 @@ py_library(
         "_impl/keras/__init__.py",
         "_impl/keras/activations.py",
         "_impl/keras/applications/__init__.py",
+        "_impl/keras/applications/densenet.py",
         "_impl/keras/applications/imagenet_utils.py",
         "_impl/keras/applications/inception_resnet_v2.py",
         "_impl/keras/applications/inception_v3.py",
         "_impl/keras/applications/mobilenet.py",
+        "_impl/keras/applications/nasnet.py",
         "_impl/keras/applications/resnet50.py",
         "_impl/keras/applications/vgg16.py",
         "_impl/keras/applications/vgg19.py",
@@ -37,6 +39,7 @@ py_library(
         "_impl/keras/engine/__init__.py",
         "_impl/keras/engine/topology.py",
         "_impl/keras/engine/training.py",
+        "_impl/keras/engine/training_eager.py",
         "_impl/keras/estimator.py",
         "_impl/keras/initializers.py",
         "_impl/keras/layers/__init__.py",
@@ -76,9 +79,11 @@ py_library(
         "_impl/keras/wrappers/scikit_learn.py",
         "activations/__init__.py",
         "applications/__init__.py",
+        "applications/densenet/__init__.py",
         "applications/inception_resnet_v2/__init__.py",
         "applications/inception_v3/__init__.py",
         "applications/mobilenet/__init__.py",
+        "applications/nasnet/__init__.py",
         "applications/resnet50/__init__.py",
         "applications/vgg16/__init__.py",
         "applications/vgg19/__init__.py",
@@ -256,6 +261,18 @@ py_test(
     ],
 )
 
+py_test(
+    name = "densenet_test",
+    size = "large",
+    srcs = ["_impl/keras/applications/densenet_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":keras",
+        "//tensorflow/python:client_testlib",
+        "//third_party/py/numpy",
+    ],
+)
+
 py_test(
     name = "inception_resnet_v2_test",
     size = "medium",
@@ -292,6 +309,18 @@ py_test(
     ],
 )
 
+py_test(
+    name = "nasnet_test",
+    size = "large",
+    srcs = ["_impl/keras/applications/nasnet_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":keras",
+        "//tensorflow/python:client_testlib",
+        "//third_party/py/numpy",
+    ],
+)
+
 py_test(
     name = "resnet50_test",
     size = "small",
@@ -453,6 +482,7 @@ py_test(
     size = "small",
     srcs = ["_impl/keras/layers/normalization_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["notsan"],
     deps = [
         ":keras",
         "//tensorflow/python:client_testlib",
@@ -504,7 +534,7 @@ py_test(
 
 py_test(
     name = "recurrent_test",
-    size = "small",
+    size = "medium",
     srcs = ["_impl/keras/layers/recurrent_test.py"],
     srcs_version = "PY2AND3",
     deps = [
@@ -527,7 +557,7 @@ py_test(
 
 py_test(
     name = "wrappers_test",
-    size = "small",
+    size = "medium",
     srcs = ["_impl/keras/layers/wrappers_test.py"],
     srcs_version = "PY2AND3",
     tags = ["notsan"],
@@ -691,6 +721,19 @@ py_test(
     ],
 )
 
+py_test(
+    name = "training_eager_test",
+    size = "medium",
+    srcs = ["_impl/keras/engine/training_eager_test.py"],
+    srcs_version = "PY2AND3",
+    tags = ["notsan"],
+    deps = [
+        ":keras",
+        "//tensorflow/python:client_testlib",
+        "//third_party/py/numpy",
+    ],
+)
+
 py_test(
     name = "topology_test",
     size = "small",
diff --git a/tensorflow/python/keras/_impl/keras/__init__.py b/tensorflow/python/keras/_impl/keras/__init__.py
index a70250d796b4dd8d08ac65ebdac84b307b917b13..73113539329c5493141db243b85254062f7b8f88 100644
--- a/tensorflow/python/keras/_impl/keras/__init__.py
+++ b/tensorflow/python/keras/_impl/keras/__init__.py
@@ -40,4 +40,4 @@ from tensorflow.python.keras._impl.keras.layers import Input
 from tensorflow.python.keras._impl.keras.models import Model
 from tensorflow.python.keras._impl.keras.models import Sequential
 
-__version__ = '2.1.2-tf'
+__version__ = '2.1.3-tf'
diff --git a/tensorflow/python/keras/_impl/keras/activations.py b/tensorflow/python/keras/_impl/keras/activations.py
index f017d2ae85548211070ececf48e977dd7d2f6a25..4852b8c36ae5f475b33f12b7c7f21ae424ba242e 100644
--- a/tensorflow/python/keras/_impl/keras/activations.py
+++ b/tensorflow/python/keras/_impl/keras/activations.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Keras built-in activation functions.
+"""Built-in activation functions.
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -61,10 +61,12 @@ def selu(x):
       x: A tensor or variable to compute the activation function for.
 
   Returns:
-    Tensor with the same shape and dtype as `x`.
+      Tensor with the same shape and dtype as `x`.
+
+  # Note
+      - To be used together with the initialization "lecun_normal".
+      - To be used together with the dropout variant "AlphaDropout".
 
-  References:
-      - [Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515)
   """
   alpha = 1.6732632423543772848170429916717
   scale = 1.0507009873554804934193349852946
diff --git a/tensorflow/python/keras/_impl/keras/applications/__init__.py b/tensorflow/python/keras/_impl/keras/applications/__init__.py
index c11c52b71e9bff1cfd595a9dbc0e86dcaa8506c8..206a769b377483c65a78b76fe44055eb50bdc7c4 100644
--- a/tensorflow/python/keras/_impl/keras/applications/__init__.py
+++ b/tensorflow/python/keras/_impl/keras/applications/__init__.py
@@ -18,9 +18,14 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.keras._impl.keras.applications.densenet import DenseNet121
+from tensorflow.python.keras._impl.keras.applications.densenet import DenseNet169
+from tensorflow.python.keras._impl.keras.applications.densenet import DenseNet201
 from tensorflow.python.keras._impl.keras.applications.inception_resnet_v2 import InceptionResNetV2
 from tensorflow.python.keras._impl.keras.applications.inception_v3 import InceptionV3
 from tensorflow.python.keras._impl.keras.applications.mobilenet import MobileNet
+from tensorflow.python.keras._impl.keras.applications.nasnet import NASNetLarge
+from tensorflow.python.keras._impl.keras.applications.nasnet import NASNetMobile
 from tensorflow.python.keras._impl.keras.applications.resnet50 import ResNet50
 from tensorflow.python.keras._impl.keras.applications.vgg16 import VGG16
 from tensorflow.python.keras._impl.keras.applications.vgg19 import VGG19
diff --git a/tensorflow/python/keras/_impl/keras/applications/densenet.py b/tensorflow/python/keras/_impl/keras/applications/densenet.py
new file mode 100644
index 0000000000000000000000000000000000000000..9e40d3493024c5472afca636ffc1510fad9b52d1
--- /dev/null
+++ b/tensorflow/python/keras/_impl/keras/applications/densenet.py
@@ -0,0 +1,346 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# pylint: disable=invalid-name
+# pylint: disable=unused-import
+"""DenseNet models for Keras.
+
+# Reference paper
+
+- [Densely Connected Convolutional Networks]
+  (https://arxiv.org/abs/1608.06993) (CVPR 2017 Best Paper Award)
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+from tensorflow.python.keras._impl.keras import backend as K
+from tensorflow.python.keras._impl.keras.applications import imagenet_utils
+from tensorflow.python.keras._impl.keras.applications.imagenet_utils import _obtain_input_shape
+from tensorflow.python.keras._impl.keras.applications.imagenet_utils import decode_predictions
+from tensorflow.python.keras._impl.keras.engine.topology import get_source_inputs
+from tensorflow.python.keras._impl.keras.layers import Activation
+from tensorflow.python.keras._impl.keras.layers import AveragePooling2D
+from tensorflow.python.keras._impl.keras.layers import BatchNormalization
+from tensorflow.python.keras._impl.keras.layers import Concatenate
+from tensorflow.python.keras._impl.keras.layers import Conv2D
+from tensorflow.python.keras._impl.keras.layers import Dense
+from tensorflow.python.keras._impl.keras.layers import GlobalAveragePooling2D
+from tensorflow.python.keras._impl.keras.layers import GlobalMaxPooling2D
+from tensorflow.python.keras._impl.keras.layers import Input
+from tensorflow.python.keras._impl.keras.layers import MaxPooling2D
+from tensorflow.python.keras._impl.keras.layers import ZeroPadding2D
+from tensorflow.python.keras._impl.keras.models import Model
+from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
+
+
+DENSENET121_WEIGHT_PATH = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.8/densenet121_weights_tf_dim_ordering_tf_kernels.h5'
+DENSENET121_WEIGHT_PATH_NO_TOP = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.8/densenet121_weights_tf_dim_ordering_tf_kernels_notop.h5'
+DENSENET169_WEIGHT_PATH = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.8/densenet169_weights_tf_dim_ordering_tf_kernels.h5'
+DENSENET169_WEIGHT_PATH_NO_TOP = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.8/densenet169_weights_tf_dim_ordering_tf_kernels_notop.h5'
+DENSENET201_WEIGHT_PATH = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.8/densenet201_weights_tf_dim_ordering_tf_kernels.h5'
+DENSENET201_WEIGHT_PATH_NO_TOP = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.8/densenet201_weights_tf_dim_ordering_tf_kernels_notop.h5'
+
+
+def dense_block(x, blocks, name):
+  """A dense block.
+
+  Arguments:
+      x: input tensor.
+      blocks: integer, the number of building blocks.
+      name: string, block label.
+
+  Returns:
+      output tensor for the block.
+  """
+  for i in range(blocks):
+    x = conv_block(x, 32, name=name + '_block' + str(i + 1))
+  return x
+
+
+def transition_block(x, reduction, name):
+  """A transition block.
+
+  Arguments:
+      x: input tensor.
+      reduction: float, compression rate at transition layers.
+      name: string, block label.
+
+  Returns:
+      output tensor for the block.
+  """
+  bn_axis = 3 if K.image_data_format() == 'channels_last' else 1
+  x = BatchNormalization(axis=bn_axis, epsilon=1.001e-5, name=name + '_bn')(x)
+  x = Activation('relu', name=name + '_relu')(x)
+  x = Conv2D(
+      int(K.int_shape(x)[bn_axis] * reduction),
+      1,
+      use_bias=False,
+      name=name + '_conv')(
+          x)
+  x = AveragePooling2D(2, strides=2, name=name + '_pool')(x)
+  return x
+
+
+def conv_block(x, growth_rate, name):
+  """A building block for a dense block.
+
+  Arguments:
+      x: input tensor.
+      growth_rate: float, growth rate at dense layers.
+      name: string, block label.
+
+  Returns:
+      output tensor for the block.
+  """
+  bn_axis = 3 if K.image_data_format() == 'channels_last' else 1
+  x1 = BatchNormalization(
+      axis=bn_axis, epsilon=1.001e-5, name=name + '_0_bn')(
+          x)
+  x1 = Activation('relu', name=name + '_0_relu')(x1)
+  x1 = Conv2D(4 * growth_rate, 1, use_bias=False, name=name + '_1_conv')(x1)
+  x1 = BatchNormalization(
+      axis=bn_axis, epsilon=1.001e-5, name=name + '_1_bn')(
+          x1)
+  x1 = Activation('relu', name=name + '_1_relu')(x1)
+  x1 = Conv2D(
+      growth_rate, 3, padding='same', use_bias=False, name=name + '_2_conv')(
+          x1)
+  x = Concatenate(axis=bn_axis, name=name + '_concat')([x, x1])
+  return x
+
+
+def DenseNet(blocks,
+             include_top=True,
+             weights='imagenet',
+             input_tensor=None,
+             input_shape=None,
+             pooling=None,
+             classes=1000):
+  """Instantiates the DenseNet architecture.
+
+  Optionally loads weights pre-trained
+  on ImageNet. Note that when using TensorFlow,
+  for best performance you should set
+  `image_data_format='channels_last'` in your Keras config
+  at ~/.keras/keras.json.
+
+  The model and the weights are compatible with
+  TensorFlow, Theano, and CNTK. The data format
+  convention used by the model is the one
+  specified in your Keras config file.
+
+  Arguments:
+      blocks: numbers of building blocks for the four dense layers.
+      include_top: whether to include the fully-connected
+          layer at the top of the network.
+      weights: one of `None` (random initialization),
+            'imagenet' (pre-training on ImageNet),
+            or the path to the weights file to be loaded.
+      input_tensor: optional Keras tensor (i.e. output of `layers.Input()`)
+          to use as image input for the model.
+      input_shape: optional shape tuple, only to be specified
+          if `include_top` is False (otherwise the input shape
+          has to be `(224, 224, 3)` (with `channels_last` data format)
+          or `(3, 224, 224)` (with `channels_first` data format).
+          It should have exactly 3 inputs channels.
+      pooling: optional pooling mode for feature extraction
+          when `include_top` is `False`.
+          - `None` means that the output of the model will be
+              the 4D tensor output of the
+              last convolutional layer.
+          - `avg` means that global average pooling
+              will be applied to the output of the
+              last convolutional layer, and thus
+              the output of the model will be a 2D tensor.
+          - `max` means that global max pooling will
+              be applied.
+      classes: optional number of classes to classify images
+          into, only to be specified if `include_top` is True, and
+          if no `weights` argument is specified.
+
+  Returns:
+      A Keras model instance.
+
+  Raises:
+      ValueError: in case of invalid argument for `weights`,
+          or invalid input shape.
+  """
+  if not (weights in {'imagenet', None} or os.path.exists(weights)):
+    raise ValueError('The `weights` argument should be either '
+                     '`None` (random initialization), `imagenet` '
+                     '(pre-training on ImageNet), '
+                     'or the path to the weights file to be loaded.')
+
+  if weights == 'imagenet' and include_top and classes != 1000:
+    raise ValueError('If using `weights` as imagenet with `include_top`'
+                     ' as true, `classes` should be 1000')
+
+  # Determine proper input shape
+  input_shape = _obtain_input_shape(
+      input_shape,
+      default_size=224,
+      min_size=221,
+      data_format=K.image_data_format(),
+      require_flatten=include_top,
+      weights=weights)
+
+  if input_tensor is None:
+    img_input = Input(shape=input_shape)
+  else:
+    if not K.is_keras_tensor(input_tensor):
+      img_input = Input(tensor=input_tensor, shape=input_shape)
+    else:
+      img_input = input_tensor
+
+  bn_axis = 3 if K.image_data_format() == 'channels_last' else 1
+
+  x = ZeroPadding2D(padding=((3, 3), (3, 3)))(img_input)
+  x = Conv2D(64, 7, strides=2, use_bias=False, name='conv1/conv')(x)
+  x = BatchNormalization(axis=bn_axis, epsilon=1.001e-5, name='conv1/bn')(x)
+  x = Activation('relu', name='conv1/relu')(x)
+  x = ZeroPadding2D(padding=((1, 1), (1, 1)))(x)
+  x = MaxPooling2D(3, strides=2, name='pool1')(x)
+
+  x = dense_block(x, blocks[0], name='conv2')
+  x = transition_block(x, 0.5, name='pool2')
+  x = dense_block(x, blocks[1], name='conv3')
+  x = transition_block(x, 0.5, name='pool3')
+  x = dense_block(x, blocks[2], name='conv4')
+  x = transition_block(x, 0.5, name='pool4')
+  x = dense_block(x, blocks[3], name='conv5')
+
+  x = BatchNormalization(axis=bn_axis, epsilon=1.001e-5, name='bn')(x)
+
+  if include_top:
+    x = GlobalAveragePooling2D(name='avg_pool')(x)
+    x = Dense(classes, activation='softmax', name='fc1000')(x)
+  else:
+    if pooling == 'avg':
+      x = GlobalAveragePooling2D(name='avg_pool')(x)
+    elif pooling == 'max':
+      x = GlobalMaxPooling2D(name='max_pool')(x)
+
+  # Ensure that the model takes into account
+  # any potential predecessors of `input_tensor`.
+  if input_tensor is not None:
+    inputs = get_source_inputs(input_tensor)
+  else:
+    inputs = img_input
+
+  # Create model.
+  if blocks == [6, 12, 24, 16]:
+    model = Model(inputs, x, name='densenet121')
+  elif blocks == [6, 12, 32, 32]:
+    model = Model(inputs, x, name='densenet169')
+  elif blocks == [6, 12, 48, 32]:
+    model = Model(inputs, x, name='densenet201')
+  else:
+    model = Model(inputs, x, name='densenet')
+
+  # Load weights.
+  if weights == 'imagenet':
+    if include_top:
+      if blocks == [6, 12, 24, 16]:
+        weights_path = get_file(
+            'densenet121_weights_tf_dim_ordering_tf_kernels.h5',
+            DENSENET121_WEIGHT_PATH,
+            cache_subdir='models',
+            file_hash='0962ca643bae20f9b6771cb844dca3b0')
+      elif blocks == [6, 12, 32, 32]:
+        weights_path = get_file(
+            'densenet169_weights_tf_dim_ordering_tf_kernels.h5',
+            DENSENET169_WEIGHT_PATH,
+            cache_subdir='models',
+            file_hash='bcf9965cf5064a5f9eb6d7dc69386f43')
+      elif blocks == [6, 12, 48, 32]:
+        weights_path = get_file(
+            'densenet201_weights_tf_dim_ordering_tf_kernels.h5',
+            DENSENET201_WEIGHT_PATH,
+            cache_subdir='models',
+            file_hash='7bb75edd58cb43163be7e0005fbe95ef')
+    else:
+      if blocks == [6, 12, 24, 16]:
+        weights_path = get_file(
+            'densenet121_weights_tf_dim_ordering_tf_kernels_notop.h5',
+            DENSENET121_WEIGHT_PATH_NO_TOP,
+            cache_subdir='models',
+            file_hash='4912a53fbd2a69346e7f2c0b5ec8c6d3')
+      elif blocks == [6, 12, 32, 32]:
+        weights_path = get_file(
+            'densenet169_weights_tf_dim_ordering_tf_kernels_notop.h5',
+            DENSENET169_WEIGHT_PATH_NO_TOP,
+            cache_subdir='models',
+            file_hash='50662582284e4cf834ce40ab4dfa58c6')
+      elif blocks == [6, 12, 48, 32]:
+        weights_path = get_file(
+            'densenet201_weights_tf_dim_ordering_tf_kernels_notop.h5',
+            DENSENET201_WEIGHT_PATH_NO_TOP,
+            cache_subdir='models',
+            file_hash='1c2de60ee40562448dbac34a0737e798')
+    model.load_weights(weights_path)
+  elif weights is not None:
+    model.load_weights(weights)
+
+  return model
+
+
+def DenseNet121(include_top=True,
+                weights='imagenet',
+                input_tensor=None,
+                input_shape=None,
+                pooling=None,
+                classes=1000):
+  return DenseNet([6, 12, 24, 16], include_top, weights, input_tensor,
+                  input_shape, pooling, classes)
+
+
+def DenseNet169(include_top=True,
+                weights='imagenet',
+                input_tensor=None,
+                input_shape=None,
+                pooling=None,
+                classes=1000):
+  return DenseNet([6, 12, 32, 32], include_top, weights, input_tensor,
+                  input_shape, pooling, classes)
+
+
+def DenseNet201(include_top=True,
+                weights='imagenet',
+                input_tensor=None,
+                input_shape=None,
+                pooling=None,
+                classes=1000):
+  return DenseNet([6, 12, 48, 32], include_top, weights, input_tensor,
+                  input_shape, pooling, classes)
+
+
+def preprocess_input(x, data_format=None):
+  """Preprocesses a numpy array encoding a batch of images.
+
+  Arguments:
+      x: a 3D or 4D numpy array consists of RGB values within [0, 255].
+      data_format: data format of the image tensor.
+
+  Returns:
+      Preprocessed array.
+  """
+  return imagenet_utils.preprocess_input(x, data_format, mode='torch')
+
+
+setattr(DenseNet121, '__doc__', DenseNet.__doc__)
+setattr(DenseNet169, '__doc__', DenseNet.__doc__)
+setattr(DenseNet201, '__doc__', DenseNet.__doc__)
diff --git a/tensorflow/python/keras/_impl/keras/applications/densenet_test.py b/tensorflow/python/keras/_impl/keras/applications/densenet_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..3b92287a1e77a944c069a6c234e11e4a79ad7d32
--- /dev/null
+++ b/tensorflow/python/keras/_impl/keras/applications/densenet_test.py
@@ -0,0 +1,101 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for DenseNet application."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.keras._impl import keras
+from tensorflow.python.platform import test
+
+
+class DenseNet121Test(test.TestCase):
+
+  def test_with_top(self):
+    model = keras.applications.DenseNet121(weights=None)
+    self.assertEqual(model.output_shape, (None, 1000))
+
+  def test_no_top(self):
+    model = keras.applications.DenseNet121(weights=None, include_top=False)
+    self.assertEqual(model.output_shape, (None, None, None, 1024))
+
+  def test_with_pooling(self):
+    model = keras.applications.DenseNet121(weights=None,
+                                           include_top=False,
+                                           pooling='avg')
+    self.assertEqual(model.output_shape, (None, 1024))
+
+  def test_weight_loading(self):
+    with self.assertRaises(ValueError):
+      keras.applications.DenseNet121(weights='unknown',
+                                     include_top=False)
+    with self.assertRaises(ValueError):
+      keras.applications.DenseNet121(weights='imagenet',
+                                     classes=2000)
+
+
+class DenseNet169Test(test.TestCase):
+
+  def test_with_top(self):
+    model = keras.applications.DenseNet169(weights=None)
+    self.assertEqual(model.output_shape, (None, 1000))
+
+  def test_no_top(self):
+    model = keras.applications.DenseNet169(weights=None, include_top=False)
+    self.assertEqual(model.output_shape, (None, None, None, 1664))
+
+  def test_with_pooling(self):
+    model = keras.applications.DenseNet169(weights=None,
+                                           include_top=False,
+                                           pooling='max')
+    self.assertEqual(model.output_shape, (None, 1664))
+
+  def test_weight_loading(self):
+    with self.assertRaises(ValueError):
+      keras.applications.DenseNet169(weights='unknown',
+                                     include_top=False)
+    with self.assertRaises(ValueError):
+      keras.applications.DenseNet169(weights='imagenet',
+                                     classes=2000)
+
+
+class DenseNet201(test.TestCase):
+
+  def test_with_top(self):
+    model = keras.applications.DenseNet201(weights=None)
+    self.assertEqual(model.output_shape, (None, 1000))
+
+  def test_no_top(self):
+    model = keras.applications.DenseNet201(weights=None, include_top=False)
+    self.assertEqual(model.output_shape, (None, None, None, 1920))
+
+  def test_with_pooling(self):
+    model = keras.applications.DenseNet201(weights=None,
+                                           include_top=False,
+                                           pooling='avg')
+    self.assertEqual(model.output_shape, (None, 1920))
+
+  def test_weight_loading(self):
+    with self.assertRaises(ValueError):
+      keras.applications.DenseNet201(weights='unknown',
+                                     include_top=False)
+    with self.assertRaises(ValueError):
+      keras.applications.DenseNet201(weights='imagenet',
+                                     classes=2000)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/keras/_impl/keras/applications/imagenet_utils.py b/tensorflow/python/keras/_impl/keras/applications/imagenet_utils.py
index 63ee83cb51e8366f391f192a9408566076cad468..f1f20f12a8da2cb9dcb918838a4df55cdcd04602 100644
--- a/tensorflow/python/keras/_impl/keras/applications/imagenet_utils.py
+++ b/tensorflow/python/keras/_impl/keras/applications/imagenet_utils.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Utilities used by models pre-trained on ImageNet.
+"""Utilities for ImageNet data preprocessing & prediction decoding.
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -35,63 +35,92 @@ _IMAGENET_MEAN = None
 
 
 def _preprocess_numpy_input(x, data_format, mode):
-  """Preprocesses a image tensor as a Numpy array.
+  """Preprocesses a Numpy array encoding a batch of images.
 
   Arguments:
-      x: input Numpy, 3D or 4D.
-      data_format: data format of the image tensor.
-      mode: One of "caffe", "tf".
+      x: Input array, 3D or 4D.
+      data_format: Data format of the image array.
+      mode: One of "caffe", "tf" or "torch".
           - caffe: will convert the images from RGB to BGR,
               then will zero-center each color channel with
               respect to the ImageNet dataset,
               without scaling.
           - tf: will scale pixels between -1 and 1,
               sample-wise.
+          - torch: will scale pixels between 0 and 1 and then
+              will normalize each channel with respect to the
+              ImageNet dataset.
 
   Returns:
-      Preprocessed array.
+      Preprocessed Numpy array.
   """
   if mode == 'tf':
     x /= 127.5
     x -= 1.
     return x
 
+  if mode == 'torch':
+    x /= 255.
+    mean = [0.485, 0.456, 0.406]
+    std = [0.229, 0.224, 0.225]
+  else:
+    if data_format == 'channels_first':
+      # 'RGB'->'BGR'
+      if x.ndim == 3:
+        x = x[::-1, ...]
+      else:
+        x = x[:, ::-1, ...]
+    else:
+      # 'RGB'->'BGR'
+      x = x[..., ::-1]
+    mean = [103.939, 116.779, 123.68]
+    std = None
+
+  # Zero-center by mean pixel
   if data_format == 'channels_first':
     if x.ndim == 3:
-      # 'RGB'->'BGR'
-      x = x[::-1, ...]
-      # Zero-center by mean pixel
-      x[0, :, :] -= 103.939
-      x[1, :, :] -= 116.779
-      x[2, :, :] -= 123.68
+      x[0, :, :] -= mean[0]
+      x[1, :, :] -= mean[1]
+      x[2, :, :] -= mean[2]
+      if std is not None:
+        x[0, :, :] /= std[0]
+        x[1, :, :] /= std[1]
+        x[2, :, :] /= std[2]
     else:
-      x = x[:, ::-1, ...]
-      x[:, 0, :, :] -= 103.939
-      x[:, 1, :, :] -= 116.779
-      x[:, 2, :, :] -= 123.68
+      x[:, 0, :, :] -= mean[0]
+      x[:, 1, :, :] -= mean[1]
+      x[:, 2, :, :] -= mean[2]
+      if std is not None:
+        x[:, 0, :, :] /= std[0]
+        x[:, 1, :, :] /= std[1]
+        x[:, 2, :, :] /= std[2]
   else:
-    # 'RGB'->'BGR'
-    x = x[..., ::-1]
-    # Zero-center by mean pixel
-    x[..., 0] -= 103.939
-    x[..., 1] -= 116.779
-    x[..., 2] -= 123.68
+    x[..., 0] -= mean[0]
+    x[..., 1] -= mean[1]
+    x[..., 2] -= mean[2]
+    if std is not None:
+      x[..., 0] /= std[0]
+      x[..., 1] /= std[1]
+      x[..., 2] /= std[2]
   return x
 
 
 def _preprocess_symbolic_input(x, data_format, mode):
-  """Preprocesses a symbolic image tensor.
+  """Preprocesses a tensor encoding a batch of images.
 
   Arguments:
-      x: symoblic tensor, 3D or 4D.
-      data_format: data format of the image tensor.
-      mode: One of "caffe", "tf".
+      x: Input tensor, 3D or 4D.
+      data_format: Data format of the image tensor.
+      mode: One of "caffe", "tf" or "torch".
           - caffe: will convert the images from RGB to BGR,
               then will zero-center each color channel with
               respect to the ImageNet dataset,
               without scaling.
           - tf: will scale pixels between -1 and 1,
               sample-wise.
+          - torch: will scale pixels between 0 and 1 and then
+              will normalize each channel with respect to the
+              ImageNet dataset.
 
   Returns:
       Preprocessed tensor.
@@ -103,32 +132,42 @@ def _preprocess_symbolic_input(x, data_format, mode):
     x -= 1.
     return x
 
-  if data_format == 'channels_first':
-    # 'RGB'->'BGR'
-    if K.ndim(x) == 3:
-      x = x[::-1, ...]
-    else:
-      x = x[:, ::-1, ...]
+  if mode == 'torch':
+    x /= 255.
+    mean = [0.485, 0.456, 0.406]
+    std = [0.229, 0.224, 0.225]
   else:
-    # 'RGB'->'BGR'
-    x = x[..., ::-1]
+    if data_format == 'channels_first':
+      # 'RGB'->'BGR'
+      if K.ndim(x) == 3:
+        x = x[::-1, ...]
+      else:
+        x = x[:, ::-1, ...]
+    else:
+      # 'RGB'->'BGR'
+      x = x[..., ::-1]
+    mean = [103.939, 116.779, 123.68]
+    std = None
 
   if _IMAGENET_MEAN is None:
-    _IMAGENET_MEAN = K.constant(-np.array([103.939, 116.779, 123.68]))
+    _IMAGENET_MEAN = K.constant(-np.array(mean))
+
   # Zero-center by mean pixel
   if K.dtype(x) != K.dtype(_IMAGENET_MEAN):
     x = K.bias_add(x, K.cast(_IMAGENET_MEAN, K.dtype(x)), data_format)
   else:
     x = K.bias_add(x, _IMAGENET_MEAN, data_format)
+  if std is not None:
+    x /= std
   return x
 
 
 def preprocess_input(x, data_format=None, mode='caffe'):
-  """Preprocesses a tensor encoding a batch of images.
+  """Preprocesses a tensor or Numpy array encoding a batch of images.
 
   Arguments:
-      x: input Numpy or symoblic tensor, 3D or 4D.
-      data_format: data format of the image tensor.
+      x: Input Numpy or symbolic tensor, 3D or 4D.
+      data_format: Data format of the image tensor/array.
       mode: One of "caffe", "tf".
           - caffe: will convert the images from RGB to BGR,
               then will zero-center each color channel with
@@ -138,10 +177,10 @@ def preprocess_input(x, data_format=None, mode='caffe'):
               sample-wise.
 
   Returns:
-      Preprocessed tensor.
+      Preprocessed tensor or Numpy array.
 
   Raises:
-      ValueError: in case of incorrect data_format.
+      ValueError: In case of unknown `data_format` argument.
   """
   if data_format is None:
     data_format = K.image_data_format()
@@ -159,7 +198,7 @@ def decode_predictions(preds, top=5):
 
   Arguments:
       preds: Numpy tensor encoding a batch of predictions.
-      top: integer, how many top-guesses to return.
+      top: Integer, how many top-guesses to return.
 
   Returns:
       A list of lists of top class prediction tuples
@@ -167,7 +206,7 @@ def decode_predictions(preds, top=5):
       One list of tuples per sample in batch input.
 
   Raises:
-      ValueError: in case of invalid shape of the `pred` array
+      ValueError: In case of invalid shape of the `pred` array
           (must be 2D).
   """
   global CLASS_INDEX
@@ -177,10 +216,11 @@ def decode_predictions(preds, top=5):
                      '(i.e. a 2D array of shape (samples, 1000)). '
                      'Found array with shape: ' + str(preds.shape))
   if CLASS_INDEX is None:
-    fpath = get_file('imagenet_class_index.json',
-                     CLASS_INDEX_PATH,
-                     cache_subdir='models',
-                     file_hash='c2c37ea517e94d9795004a39431a14cb')
+    fpath = get_file(
+        'imagenet_class_index.json',
+        CLASS_INDEX_PATH,
+        cache_subdir='models',
+        file_hash='c2c37ea517e94d9795004a39431a14cb')
     CLASS_INDEX = json.load(open(fpath))
   results = []
   for pred in preds:
@@ -197,17 +237,17 @@ def _obtain_input_shape(input_shape,
                         data_format,
                         require_flatten,
                         weights=None):
-  """Internal utility to compute/validate an ImageNet model's input shape.
+  """Internal utility to compute/validate a model's input shape.
 
   Arguments:
-      input_shape: either None (will return the default network input shape),
+      input_shape: Either None (will return the default network input shape),
           or a user-provided shape to be validated.
-      default_size: default input width/height for the model.
-      min_size: minimum input width/height accepted by the model.
-      data_format: image data format to use.
-      require_flatten: whether the model is expected to
+      default_size: Default input width/height for the model.
+      min_size: Minimum input width/height accepted by the model.
+      data_format: Image data format to use.
+      require_flatten: Whether the model is expected to
           be linked to a classifier via a Flatten layer.
-      weights: one of `None` (random initialization)
+      weights: One of `None` (random initialization)
           or 'imagenet' (pre-training on ImageNet).
           If weights='imagenet' input channels must be equal to 3.
 
@@ -215,7 +255,7 @@ def _obtain_input_shape(input_shape,
       An integer shape tuple (may include None entries).
 
   Raises:
-      ValueError: in case of invalid argument values.
+      ValueError: In case of invalid argument values.
   """
   if weights != 'imagenet' and input_shape and len(input_shape) == 3:
     if data_format == 'channels_first':
@@ -252,8 +292,8 @@ def _obtain_input_shape(input_shape,
                            '`input_shape=' + str(input_shape) + '`')
         if ((input_shape[1] is not None and input_shape[1] < min_size) or
             (input_shape[2] is not None and input_shape[2] < min_size)):
-          raise ValueError('Input size must be at least ' + str(min_size) + 'x'
-                           + str(min_size) + '; got '
+          raise ValueError('Input size must be at least ' + str(min_size) +
+                           'x' + str(min_size) + '; got '
                            '`input_shape=' + str(input_shape) + '`')
     else:
       if input_shape is not None:
@@ -264,8 +304,8 @@ def _obtain_input_shape(input_shape,
                            '`input_shape=' + str(input_shape) + '`')
         if ((input_shape[0] is not None and input_shape[0] < min_size) or
             (input_shape[1] is not None and input_shape[1] < min_size)):
-          raise ValueError('Input size must be at least ' + str(min_size) + 'x'
-                           + str(min_size) + '; got '
+          raise ValueError('Input size must be at least ' + str(min_size) +
+                           'x' + str(min_size) + '; got '
                            '`input_shape=' + str(input_shape) + '`')
   else:
     if require_flatten:
diff --git a/tensorflow/python/keras/_impl/keras/applications/inception_resnet_v2.py b/tensorflow/python/keras/_impl/keras/applications/inception_resnet_v2.py
index 2e73cefb6ce32c2a770eb9bde5ffb220be2da92c..1dc15b5b3437718e285d694eb058ef124ee52c0b 100644
--- a/tensorflow/python/keras/_impl/keras/applications/inception_resnet_v2.py
+++ b/tensorflow/python/keras/_impl/keras/applications/inception_resnet_v2.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+# pylint: disable=invalid-name
+# pylint: disable=unused-import
 """Inception-ResNet V2 model for Keras.
 
 # Reference
@@ -28,7 +30,7 @@ import os
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.applications import imagenet_utils
 from tensorflow.python.keras._impl.keras.applications.imagenet_utils import _obtain_input_shape
-from tensorflow.python.keras._impl.keras.applications.imagenet_utils import decode_predictions  # pylint: disable=unused-import
+from tensorflow.python.keras._impl.keras.applications.imagenet_utils import decode_predictions
 from tensorflow.python.keras._impl.keras.engine.topology import get_source_inputs
 from tensorflow.python.keras._impl.keras.layers import Activation
 from tensorflow.python.keras._impl.keras.layers import AveragePooling2D
@@ -43,6 +45,8 @@ from tensorflow.python.keras._impl.keras.layers import Lambda
 from tensorflow.python.keras._impl.keras.layers import MaxPooling2D
 from tensorflow.python.keras._impl.keras.models import Model
 from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
+from tensorflow.python.platform import tf_logging as logging
+
 
 BASE_WEIGHT_URL = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.7/'
 
@@ -116,7 +120,8 @@ def inception_resnet_block(x, scale, block_type, block_idx, activation='relu'):
       scale: scaling factor to scale the residuals (i.e., the output of
           passing `x` through an inception module) before adding them
           to the shortcut branch. Let `r` be the output from the residual
-          branch, the output of this block will be `x + scale * r`.
+            branch,
+          the output of this block will be `x + scale * r`.
       block_type: `'block35'`, `'block17'` or `'block8'`, determines
           the network structure in the residual branch.
       block_idx: an `int` used for generating layer names. The Inception-ResNet
@@ -128,8 +133,7 @@ def inception_resnet_block(x, scale, block_type, block_idx, activation='relu'):
           will have `block_type='block35', block_idx=0`, ane the layer names
             will have
           a common prefix `'block35_0'`.
-      activation: activation function to use at the end of the block
-          (see [activations](../activations.md)).
+      activation: activation function to use at the end of the block.
           When `activation=None`, no activation is applied
           (i.e., "linear" activation: `a(x) = x`).
 
@@ -178,6 +182,7 @@ def inception_resnet_block(x, scale, block_type, block_idx, activation='relu'):
 
   x = Lambda(
       lambda inputs, scale: inputs[0] + inputs[1] * scale,
+      output_shape=K.int_shape(x)[1:],
       arguments={'scale': scale},
       name=block_name)([x, up])
   if activation is not None:
@@ -185,7 +190,7 @@ def inception_resnet_block(x, scale, block_type, block_idx, activation='relu'):
   return x
 
 
-def InceptionResNetV2(include_top=True,  # pylint: disable=invalid-name
+def InceptionResNetV2(include_top=True,
                       weights='imagenet',
                       input_tensor=None,
                       input_shape=None,
@@ -211,8 +216,8 @@ def InceptionResNetV2(include_top=True,  # pylint: disable=invalid-name
       include_top: whether to include the fully-connected
           layer at the top of the network.
       weights: one of `None` (random initialization),
-          'imagenet' (pre-training on ImageNet),
-          or the path to the weights file to be loaded.
+            'imagenet' (pre-training on ImageNet),
+            or the path to the weights file to be loaded.
       input_tensor: optional Keras tensor (i.e. output of `layers.Input()`)
           to use as image input for the model.
       input_shape: optional shape tuple, only to be specified
diff --git a/tensorflow/python/keras/_impl/keras/applications/inception_v3.py b/tensorflow/python/keras/_impl/keras/applications/inception_v3.py
index 4424b9280413bb8e556ab376b0c0acccf4030c73..ff57116f2dabed58e6993320425384ed55fde65b 100644
--- a/tensorflow/python/keras/_impl/keras/applications/inception_v3.py
+++ b/tensorflow/python/keras/_impl/keras/applications/inception_v3.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 # ==============================================================================
 # pylint: disable=invalid-name
+# pylint: disable=unused-import
 """Inception V3 model for Keras.
 
 Note that the input image format for this model is different than for
@@ -35,7 +36,7 @@ from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras import layers
 from tensorflow.python.keras._impl.keras.applications import imagenet_utils
 from tensorflow.python.keras._impl.keras.applications.imagenet_utils import _obtain_input_shape
-from tensorflow.python.keras._impl.keras.applications.imagenet_utils import decode_predictions  # pylint: disable=unused-import
+from tensorflow.python.keras._impl.keras.applications.imagenet_utils import decode_predictions
 from tensorflow.python.keras._impl.keras.engine.topology import get_source_inputs
 from tensorflow.python.keras._impl.keras.layers import Activation
 from tensorflow.python.keras._impl.keras.layers import AveragePooling2D
@@ -48,6 +49,7 @@ from tensorflow.python.keras._impl.keras.layers import Input
 from tensorflow.python.keras._impl.keras.layers import MaxPooling2D
 from tensorflow.python.keras._impl.keras.models import Model
 from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
+from tensorflow.python.platform import tf_logging as logging
 
 
 WEIGHTS_PATH = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.5/inception_v3_weights_tf_dim_ordering_tf_kernels.h5'
@@ -92,7 +94,8 @@ def conv2d_bn(x,
       strides=strides,
       padding=padding,
       use_bias=False,
-      name=conv_name)(x)
+      name=conv_name)(
+          x)
   x = BatchNormalization(axis=bn_axis, scale=False, name=bn_name)(x)
   x = Activation('relu', name=name)(x)
   return x
@@ -109,7 +112,7 @@ def InceptionV3(include_top=True,
   Optionally loads weights pre-trained
   on ImageNet. Note that when using TensorFlow,
   for best performance you should set
-  `image_data_format="channels_last"` in your Keras config
+  `image_data_format='channels_last'` in your Keras config
   at ~/.keras/keras.json.
   The model and the weights are compatible with both
   TensorFlow and Theano. The data format
@@ -121,15 +124,15 @@ def InceptionV3(include_top=True,
       include_top: whether to include the fully-connected
           layer at the top of the network.
       weights: one of `None` (random initialization),
-          "imagenet" (pre-training on ImageNet),
-          or the path to the weights file to be loaded.
+            'imagenet' (pre-training on ImageNet),
+            or the path to the weights file to be loaded.
       input_tensor: optional Keras tensor (i.e. output of `layers.Input()`)
           to use as image input for the model.
       input_shape: optional shape tuple, only to be specified
           if `include_top` is False (otherwise the input shape
           has to be `(299, 299, 3)` (with `channels_last` data format)
           or `(3, 299, 299)` (with `channels_first` data format).
-          It should have exactly 3 input channels,
+          It should have exactly 3 inputs channels,
           and width and height should be no smaller than 139.
           E.g. `(150, 150, 3)` would be one valid value.
       pooling: Optional pooling mode for feature extraction
@@ -176,7 +179,10 @@ def InceptionV3(include_top=True,
   if input_tensor is None:
     img_input = Input(shape=input_shape)
   else:
-    img_input = Input(tensor=input_tensor, shape=input_shape)
+    if not K.is_keras_tensor(input_tensor):
+      img_input = Input(tensor=input_tensor, shape=input_shape)
+    else:
+      img_input = input_tensor
 
   if K.image_data_format() == 'channels_first':
     channel_axis = 1
@@ -389,6 +395,7 @@ def InceptionV3(include_top=True,
     model.load_weights(weights_path)
   elif weights is not None:
     model.load_weights(weights)
+
   return model
 
 
diff --git a/tensorflow/python/keras/_impl/keras/applications/mobilenet.py b/tensorflow/python/keras/_impl/keras/applications/mobilenet.py
index 5f97c138fc038688a009dfa83b48c8f367ee8df2..790bf8ceade6039bbd651fce1960c04f9c51c63e 100644
--- a/tensorflow/python/keras/_impl/keras/applications/mobilenet.py
+++ b/tensorflow/python/keras/_impl/keras/applications/mobilenet.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+# pylint: disable=invalid-name
+# pylint: disable=unused-import
 """MobileNet v1 models for Keras.
 
 MobileNet is a general architecture and can be used for multiple use cases.
@@ -56,7 +58,7 @@ the 100 % MobileNet on various input sizes:
 ------------------------------------------------------------------------
 
 The weights for all 16 models are obtained and translated
-from Tensorflow checkpoints found at
+from TensorFlow checkpoints found at
 https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet_v1.md
 
 # Reference
@@ -75,9 +77,10 @@ from tensorflow.python.keras._impl.keras import initializers
 from tensorflow.python.keras._impl.keras import regularizers
 from tensorflow.python.keras._impl.keras.applications import imagenet_utils
 from tensorflow.python.keras._impl.keras.applications.imagenet_utils import _obtain_input_shape
-from tensorflow.python.keras._impl.keras.applications.imagenet_utils import decode_predictions  # pylint: disable=unused-import
+from tensorflow.python.keras._impl.keras.applications.imagenet_utils import decode_predictions
 from tensorflow.python.keras._impl.keras.engine import InputSpec
 from tensorflow.python.keras._impl.keras.engine.topology import get_source_inputs
+from tensorflow.python.keras._impl.keras.engine.topology import shape_type_conversion
 from tensorflow.python.keras._impl.keras.layers import Activation
 from tensorflow.python.keras._impl.keras.layers import BatchNormalization
 from tensorflow.python.keras._impl.keras.layers import Conv2D
@@ -91,6 +94,7 @@ from tensorflow.python.keras._impl.keras.utils import conv_utils
 from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
 from tensorflow.python.platform import tf_logging as logging
 
+
 BASE_WEIGHT_PATH = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.6/'
 
 
@@ -130,7 +134,7 @@ class DepthwiseConv2D(Conv2D):
           all spatial dimensions.
           Specifying any stride value != 1 is incompatible with specifying
           any `dilation_rate` value != 1.
-      padding: one of `"valid"` or `"same"` (case-insensitive).
+      padding: one of `'valid'` or `'same'` (case-insensitive).
       depth_multiplier: The number of depthwise convolution output channels
           for each input channel.
           The total number of depthwise convolution output
@@ -144,29 +148,21 @@ class DepthwiseConv2D(Conv2D):
           `(batch, channels, height, width)`.
           It defaults to the `image_data_format` value found in your
           Keras config file at `~/.keras/keras.json`.
-          If you never set it, then it will be "channels_last".
-      activation: Activation function to use
-          (see [activations](../activations.md)).
+          If you never set it, then it will be 'channels_last'.
+      activation: Activation function to use.
           If you don't specify anything, no activation is applied
-          (ie. "linear" activation: `a(x) = x`).
+          (ie. 'linear' activation: `a(x) = x`).
       use_bias: Boolean, whether the layer uses a bias vector.
-      depthwise_initializer: Initializer for the depthwise kernel matrix
-          (see [initializers](../initializers.md)).
-      bias_initializer: Initializer for the bias vector
-          (see [initializers](../initializers.md)).
+      depthwise_initializer: Initializer for the depthwise kernel matrix.
+      bias_initializer: Initializer for the bias vector.
       depthwise_regularizer: Regularizer function applied to
-          the depthwise kernel matrix
-          (see [regularizer](../regularizers.md)).
-      bias_regularizer: Regularizer function applied to the bias vector
-          (see [regularizer](../regularizers.md)).
+          the depthwise kernel matrix.
+      bias_regularizer: Regularizer function applied to the bias vector.
       activity_regularizer: Regularizer function applied to
-          the output of the layer (its "activation").
-          (see [regularizer](../regularizers.md)).
+          the output of the layer (its 'activation')..
       depthwise_constraint: Constraint function applied to
-          the depthwise kernel matrix
-          (see [constraints](../constraints.md)).
-      bias_constraint: Constraint function applied to the bias vector
-          (see [constraints](../constraints.md)).
+          the depthwise kernel matrix.
+      bias_constraint: Constraint function applied to the bias vector.
 
   Input shape:
       4D tensor with shape:
@@ -216,6 +212,7 @@ class DepthwiseConv2D(Conv2D):
     self.depthwise_constraint = constraints.get(depthwise_constraint)
     self.bias_initializer = initializers.get(bias_initializer)
 
+  @shape_type_conversion
   def build(self, input_shape):
     if len(input_shape) < 4:
       raise ValueError('Inputs to `DepthwiseConv2D` should have rank 4. '
@@ -269,6 +266,7 @@ class DepthwiseConv2D(Conv2D):
 
     return outputs
 
+  @shape_type_conversion
   def compute_output_shape(self, input_shape):
     if self.data_format == 'channels_first':
       rows = input_shape[2]
@@ -305,7 +303,7 @@ class DepthwiseConv2D(Conv2D):
     return config
 
 
-def MobileNet(input_shape=None,  # pylint: disable=invalid-name
+def MobileNet(input_shape=None,
               alpha=1.0,
               depth_multiplier=1,
               dropout=1e-3,
@@ -334,7 +332,7 @@ def MobileNet(input_shape=None,  # pylint: disable=invalid-name
           if `include_top` is False (otherwise the input shape
           has to be `(224, 224, 3)` (with `channels_last` data format)
           or (3, 224, 224) (with `channels_first` data format).
-          It should have exactly 3 input channels,
+          It should have exactly 3 inputs channels,
           and width and height should be no smaller than 32.
           E.g. `(200, 200, 3)` would be one valid value.
       alpha: controls the width of the network.
@@ -350,8 +348,8 @@ def MobileNet(input_shape=None,  # pylint: disable=invalid-name
       include_top: whether to include the fully-connected
           layer at the top of the network.
       weights: one of `None` (random initialization),
-          'imagenet' (pre-training on ImageNet),
-          or the path to the weights file to be loaded.
+            'imagenet' (pre-training on ImageNet),
+            or the path to the weights file to be loaded.
       input_tensor: optional Keras tensor (i.e. output of
           `layers.Input()`)
           to use as image input for the model.
@@ -380,6 +378,12 @@ def MobileNet(input_shape=None,  # pylint: disable=invalid-name
       RuntimeError: If attempting to run this model with a
           backend that does not support separable convolutions.
   """
+
+  if K.backend() != 'tensorflow':
+    raise RuntimeError('Only TensorFlow backend is currently supported, '
+                       'as other backends do not support '
+                       'depthwise convolution.')
+
   if not (weights in {'imagenet', None} or os.path.exists(weights)):
     raise ValueError('The `weights` argument should be either '
                      '`None` (random initialization), `imagenet` '
@@ -390,7 +394,7 @@ def MobileNet(input_shape=None,  # pylint: disable=invalid-name
     raise ValueError('If using `weights` as ImageNet with `include_top` '
                      'as true, `classes` should be 1000')
 
-  # Determine proper input shape.
+  # Determine proper input shape and default size.
   if input_shape is None:
     default_size = 224
   else:
@@ -400,10 +404,12 @@ def MobileNet(input_shape=None,  # pylint: disable=invalid-name
     else:
       rows = input_shape[0]
       cols = input_shape[1]
+
     if rows == cols and rows in [128, 160, 192, 224]:
       default_size = rows
     else:
       default_size = 224
+
   input_shape = _obtain_input_shape(
       input_shape,
       default_size=default_size,
@@ -411,6 +417,7 @@ def MobileNet(input_shape=None,  # pylint: disable=invalid-name
       data_format=K.image_data_format(),
       require_flatten=include_top,
       weights=weights)
+
   if K.image_data_format() == 'channels_last':
     row_axis, col_axis = (0, 1)
   else:
@@ -536,8 +543,6 @@ def MobileNet(input_shape=None,  # pylint: disable=invalid-name
 
   if old_data_format:
     K.set_image_data_format(old_data_format)
-  elif weights is not None:
-    model.load_weights(weights)
   return model
 
 
@@ -595,7 +600,8 @@ def _conv_block(inputs, filters, alpha, kernel=(3, 3), strides=(1, 1)):
       padding='same',
       use_bias=False,
       strides=strides,
-      name='conv1')(inputs)
+      name='conv1')(
+          inputs)
   x = BatchNormalization(axis=channel_axis, name='conv1_bn')(x)
   return Activation(relu6, name='conv1_relu')(x)
 
@@ -662,7 +668,8 @@ def _depthwise_conv_block(inputs,
       depth_multiplier=depth_multiplier,
       strides=strides,
       use_bias=False,
-      name='conv_dw_%d' % block_id)(inputs)
+      name='conv_dw_%d' % block_id)(
+          inputs)
   x = BatchNormalization(axis=channel_axis, name='conv_dw_%d_bn' % block_id)(x)
   x = Activation(relu6, name='conv_dw_%d_relu' % block_id)(x)
 
@@ -671,6 +678,7 @@ def _depthwise_conv_block(inputs,
       padding='same',
       use_bias=False,
       strides=(1, 1),
-      name='conv_pw_%d' % block_id)(x)
+      name='conv_pw_%d' % block_id)(
+          x)
   x = BatchNormalization(axis=channel_axis, name='conv_pw_%d_bn' % block_id)(x)
   return Activation(relu6, name='conv_pw_%d_relu' % block_id)(x)
diff --git a/tensorflow/python/keras/_impl/keras/applications/nasnet.py b/tensorflow/python/keras/_impl/keras/applications/nasnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..5dd038c096f09422e381ba96006c3ebcd88481d9
--- /dev/null
+++ b/tensorflow/python/keras/_impl/keras/applications/nasnet.py
@@ -0,0 +1,783 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# pylint: disable=line-too-long
+# pylint: disable=invalid-name
+# pylint: disable=unused-import
+"""NASNet-A models for Keras.
+
+NASNet refers to Neural Architecture Search Network, a family of models
+that were designed automatically by learning the model architectures
+directly on the dataset of interest.
+
+Here we consider NASNet-A, the highest performance model that was found
+for the CIFAR-10 dataset, and then extended to ImageNet 2012 dataset,
+obtaining state of the art performance on CIFAR-10 and ImageNet 2012.
+Only the NASNet-A models, and their respective weights, which are suited
+for ImageNet 2012 are provided.
+
+The below table describes the performance on ImageNet 2012:
+--------------------------------------------------------------------------------
+      Architecture       | Top-1 Acc | Top-5 Acc |  Multiply-Adds |  Params (M)
+--------------------------------------------------------------------------------
+|   NASNet-A (4 @ 1056)  |   74.0 %  |   91.6 %  |       564 M    |     5.3    |
+|   NASNet-A (6 @ 4032)  |   82.7 %  |   96.2 %  |      23.8 B    |    88.9    |
+--------------------------------------------------------------------------------
+
+References:
+ - [Learning Transferable Architectures for Scalable Image Recognition]
+    (https://arxiv.org/abs/1707.07012)
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+from tensorflow.python.keras._impl.keras import backend as K
+from tensorflow.python.keras._impl.keras.applications.imagenet_utils import _obtain_input_shape
+from tensorflow.python.keras._impl.keras.applications.imagenet_utils import decode_predictions
+from tensorflow.python.keras._impl.keras.applications.inception_v3 import preprocess_input
+from tensorflow.python.keras._impl.keras.engine.topology import get_source_inputs
+from tensorflow.python.keras._impl.keras.layers import Activation
+from tensorflow.python.keras._impl.keras.layers import add
+from tensorflow.python.keras._impl.keras.layers import AveragePooling2D
+from tensorflow.python.keras._impl.keras.layers import BatchNormalization
+from tensorflow.python.keras._impl.keras.layers import concatenate
+from tensorflow.python.keras._impl.keras.layers import Conv2D
+from tensorflow.python.keras._impl.keras.layers import Cropping2D
+from tensorflow.python.keras._impl.keras.layers import Dense
+from tensorflow.python.keras._impl.keras.layers import GlobalAveragePooling2D
+from tensorflow.python.keras._impl.keras.layers import GlobalMaxPooling2D
+from tensorflow.python.keras._impl.keras.layers import Input
+from tensorflow.python.keras._impl.keras.layers import MaxPooling2D
+from tensorflow.python.keras._impl.keras.layers import SeparableConv2D
+from tensorflow.python.keras._impl.keras.layers import ZeroPadding2D
+from tensorflow.python.keras._impl.keras.models import Model
+from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
+from tensorflow.python.platform import tf_logging as logging
+
+
+NASNET_MOBILE_WEIGHT_PATH = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.8/NASNet-mobile.h5'
+NASNET_MOBILE_WEIGHT_PATH_NO_TOP = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.8/NASNet-mobile-no-top.h5'
+NASNET_LARGE_WEIGHT_PATH = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.8/NASNet-large.h5'
+NASNET_LARGE_WEIGHT_PATH_NO_TOP = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.8/NASNet-large-no-top.h5'
+
+
+def NASNet(input_shape=None,
+           penultimate_filters=4032,
+           num_blocks=6,
+           stem_block_filters=96,
+           skip_reduction=True,
+           filter_multiplier=2,
+           include_top=True,
+           weights=None,
+           input_tensor=None,
+           pooling=None,
+           classes=1000,
+           default_size=None):
+  """Instantiates a NASNet model.
+
+  Note that only TensorFlow is supported for now,
+  therefore it only works with the data format
+  `image_data_format='channels_last'` in your Keras config
+  at `~/.keras/keras.json`.
+
+  Arguments:
+      input_shape: Optional shape tuple, only to be specified
+          if `include_top` is False (otherwise the input shape
+          has to be `(331, 331, 3)` for NASNetLarge or
+          `(224, 224, 3)` for NASNetMobile
+          It should have exactly 3 inputs channels,
+          and width and height should be no smaller than 32.
+          E.g. `(224, 224, 3)` would be one valid value.
+      penultimate_filters: Number of filters in the penultimate layer.
+          NASNet models use the notation `NASNet (N @ P)`, where:
+              -   N is the number of blocks
+              -   P is the number of penultimate filters
+      num_blocks: Number of repeated blocks of the NASNet model.
+          NASNet models use the notation `NASNet (N @ P)`, where:
+              -   N is the number of blocks
+              -   P is the number of penultimate filters
+      stem_block_filters: Number of filters in the initial stem block
+      skip_reduction: Whether to skip the reduction step at the tail
+          end of the network. Set to `False` for CIFAR models.
+      filter_multiplier: Controls the width of the network.
+          - If `filter_multiplier` < 1.0, proportionally decreases the number
+              of filters in each layer.
+          - If `filter_multiplier` > 1.0, proportionally increases the number
+              of filters in each layer.
+          - If `filter_multiplier` = 1, default number of filters from the
+               paper are used at each layer.
+      include_top: Whether to include the fully-connected
+          layer at the top of the network.
+      weights: `None` (random initialization) or
+          `imagenet` (ImageNet weights)
+      input_tensor: Optional Keras tensor (i.e. output of
+          `layers.Input()`)
+          to use as image input for the model.
+      pooling: Optional pooling mode for feature extraction
+          when `include_top` is `False`.
+          - `None` means that the output of the model
+              will be the 4D tensor output of the
+              last convolutional layer.
+          - `avg` means that global average pooling
+              will be applied to the output of the
+              last convolutional layer, and thus
+              the output of the model will be a
+              2D tensor.
+          - `max` means that global max pooling will
+              be applied.
+      classes: Optional number of classes to classify images
+          into, only to be specified if `include_top` is True, and
+          if no `weights` argument is specified.
+      default_size: Specifies the default image size of the model
+
+  Returns:
+      A Keras model instance.
+
+  Raises:
+      ValueError: In case of invalid argument for `weights`,
+          invalid input shape or invalid `penultimate_filters` value.
+      RuntimeError: If attempting to run this model with a
+          backend that does not support separable convolutions.
+  """
+  if K.backend() != 'tensorflow':
+    raise RuntimeError('Only Tensorflow backend is currently supported, '
+                       'as other backends do not support '
+                       'separable convolution.')
+
+  if not (weights in {'imagenet', None} or os.path.exists(weights)):
+    raise ValueError('The `weights` argument should be either '
+                     '`None` (random initialization), `imagenet` '
+                     '(pre-training on ImageNet), '
+                     'or the path to the weights file to be loaded.')
+
+  if weights == 'imagenet' and include_top and classes != 1000:
+    raise ValueError('If using `weights` as ImageNet with `include_top` '
+                     'as true, `classes` should be 1000')
+
+  if default_size is None:
+    default_size = 331
+
+  # Determine proper input shape and default size.
+  input_shape = _obtain_input_shape(
+      input_shape,
+      default_size=default_size,
+      min_size=32,
+      data_format=K.image_data_format(),
+      require_flatten=include_top or weights,
+      weights=weights)
+
+  if K.image_data_format() != 'channels_last':
+    logging.warning('The NASNet family of models is only available '
+                    'for the input data format "channels_last" '
+                    '(width, height, channels). '
+                    'However your settings specify the default '
+                    'data format "channels_first" (channels, width, height).'
+                    ' You should set `image_data_format="channels_last"` '
+                    'in your Keras config located at ~/.keras/keras.json. '
+                    'The model being returned right now will expect inputs '
+                    'to follow the "channels_last" data format.')
+    K.set_image_data_format('channels_last')
+    old_data_format = 'channels_first'
+  else:
+    old_data_format = None
+
+  if input_tensor is None:
+    img_input = Input(shape=input_shape)
+  else:
+    if not K.is_keras_tensor(input_tensor):
+      img_input = Input(tensor=input_tensor, shape=input_shape)
+    else:
+      img_input = input_tensor
+
+  if penultimate_filters % 24 != 0:
+    raise ValueError(
+        'For NASNet-A models, the value of `penultimate_filters` '
+        'needs to be divisible by 24. Current value: %d' % penultimate_filters)
+
+  channel_dim = 1 if K.image_data_format() == 'channels_first' else -1
+  filters = penultimate_filters // 24
+
+  if not skip_reduction:
+    x = Conv2D(
+        stem_block_filters, (3, 3),
+        strides=(2, 2),
+        padding='valid',
+        use_bias=False,
+        name='stem_conv1',
+        kernel_initializer='he_normal')(
+            img_input)
+  else:
+    x = Conv2D(
+        stem_block_filters, (3, 3),
+        strides=(1, 1),
+        padding='same',
+        use_bias=False,
+        name='stem_conv1',
+        kernel_initializer='he_normal')(
+            img_input)
+
+  x = BatchNormalization(
+      axis=channel_dim, momentum=0.9997, epsilon=1e-3, name='stem_bn1')(
+          x)
+
+  p = None
+  if not skip_reduction:  # imagenet / mobile mode
+    x, p = _reduction_a_cell(
+        x, p, filters // (filter_multiplier**2), block_id='stem_1')
+    x, p = _reduction_a_cell(
+        x, p, filters // filter_multiplier, block_id='stem_2')
+
+  for i in range(num_blocks):
+    x, p = _normal_a_cell(x, p, filters, block_id='%d' % (i))
+
+  x, p0 = _reduction_a_cell(
+      x, p, filters * filter_multiplier, block_id='reduce_%d' % (num_blocks))
+
+  p = p0 if not skip_reduction else p
+
+  for i in range(num_blocks):
+    x, p = _normal_a_cell(
+        x, p, filters * filter_multiplier, block_id='%d' % (num_blocks + i + 1))
+
+  x, p0 = _reduction_a_cell(
+      x,
+      p,
+      filters * filter_multiplier**2,
+      block_id='reduce_%d' % (2 * num_blocks))
+
+  p = p0 if not skip_reduction else p
+
+  for i in range(num_blocks):
+    x, p = _normal_a_cell(
+        x,
+        p,
+        filters * filter_multiplier**2,
+        block_id='%d' % (2 * num_blocks + i + 1))
+
+  x = Activation('relu')(x)
+
+  if include_top:
+    x = GlobalAveragePooling2D()(x)
+    x = Dense(classes, activation='softmax', name='predictions')(x)
+  else:
+    if pooling == 'avg':
+      x = GlobalAveragePooling2D()(x)
+    elif pooling == 'max':
+      x = GlobalMaxPooling2D()(x)
+
+  # Ensure that the model takes into account
+  # any potential predecessors of `input_tensor`.
+  if input_tensor is not None:
+    inputs = get_source_inputs(input_tensor)
+  else:
+    inputs = img_input
+
+  model = Model(inputs, x, name='NASNet')
+
+  # load weights
+  if weights == 'imagenet':
+    if default_size == 224:  # mobile version
+      if include_top:
+        weight_path = NASNET_MOBILE_WEIGHT_PATH
+        model_name = 'nasnet_mobile.h5'
+      else:
+        weight_path = NASNET_MOBILE_WEIGHT_PATH_NO_TOP
+        model_name = 'nasnet_mobile_no_top.h5'
+
+      weights_file = get_file(model_name, weight_path, cache_subdir='models')
+      model.load_weights(weights_file)
+
+    elif default_size == 331:  # large version
+      if include_top:
+        weight_path = NASNET_LARGE_WEIGHT_PATH
+        model_name = 'nasnet_large.h5'
+      else:
+        weight_path = NASNET_LARGE_WEIGHT_PATH_NO_TOP
+        model_name = 'nasnet_large_no_top.h5'
+
+      weights_file = get_file(model_name, weight_path, cache_subdir='models')
+      model.load_weights(weights_file)
+    else:
+      raise ValueError('ImageNet weights can only be loaded with NASNetLarge'
+                       ' or NASNetMobile')
+  elif weights is not None:
+    model.load_weights(weights)
+
+  if old_data_format:
+    K.set_image_data_format(old_data_format)
+
+  return model
+
+
+def NASNetLarge(input_shape=None,
+                include_top=True,
+                weights='imagenet',
+                input_tensor=None,
+                pooling=None,
+                classes=1000):
+  """Instantiates a NASNet model in ImageNet mode.
+
+  Note that only TensorFlow is supported for now,
+  therefore it only works with the data format
+  `image_data_format='channels_last'` in your Keras config
+  at `~/.keras/keras.json`.
+
+  Arguments:
+      input_shape: Optional shape tuple, only to be specified
+          if `include_top` is False (otherwise the input shape
+          has to be `(331, 331, 3)` for NASNetLarge.
+          It should have exactly 3 inputs channels,
+          and width and height should be no smaller than 32.
+          E.g. `(224, 224, 3)` would be one valid value.
+      include_top: Whether to include the fully-connected
+          layer at the top of the network.
+      weights: `None` (random initialization) or
+          `imagenet` (ImageNet weights)
+      input_tensor: Optional Keras tensor (i.e. output of
+          `layers.Input()`)
+          to use as image input for the model.
+      pooling: Optional pooling mode for feature extraction
+          when `include_top` is `False`.
+          - `None` means that the output of the model
+              will be the 4D tensor output of the
+              last convolutional layer.
+          - `avg` means that global average pooling
+              will be applied to the output of the
+              last convolutional layer, and thus
+              the output of the model will be a
+              2D tensor.
+          - `max` means that global max pooling will
+              be applied.
+      classes: Optional number of classes to classify images
+          into, only to be specified if `include_top` is True, and
+          if no `weights` argument is specified.
+
+  Returns:
+      A Keras model instance.
+
+  Raises:
+      ValueError: in case of invalid argument for `weights`,
+          or invalid input shape.
+      RuntimeError: If attempting to run this model with a
+          backend that does not support separable convolutions.
+  """
+  return NASNet(
+      input_shape,
+      penultimate_filters=4032,
+      num_blocks=6,
+      stem_block_filters=96,
+      skip_reduction=False,
+      filter_multiplier=2,
+      include_top=include_top,
+      weights=weights,
+      input_tensor=input_tensor,
+      pooling=pooling,
+      classes=classes,
+      default_size=331)
+
+
+def NASNetMobile(input_shape=None,
+                 include_top=True,
+                 weights='imagenet',
+                 input_tensor=None,
+                 pooling=None,
+                 classes=1000):
+  """Instantiates a Mobile NASNet model in ImageNet mode.
+
+  Note that only TensorFlow is supported for now,
+  therefore it only works with the data format
+  `image_data_format='channels_last'` in your Keras config
+  at `~/.keras/keras.json`.
+
+  Arguments:
+      input_shape: Optional shape tuple, only to be specified
+          if `include_top` is False (otherwise the input shape
+          has to be `(224, 224, 3)` for NASNetMobile
+          It should have exactly 3 inputs channels,
+          and width and height should be no smaller than 32.
+          E.g. `(224, 224, 3)` would be one valid value.
+      include_top: Whether to include the fully-connected
+          layer at the top of the network.
+      weights: `None` (random initialization) or
+          `imagenet` (ImageNet weights)
+      input_tensor: Optional Keras tensor (i.e. output of
+          `layers.Input()`)
+          to use as image input for the model.
+      pooling: Optional pooling mode for feature extraction
+          when `include_top` is `False`.
+          - `None` means that the output of the model
+              will be the 4D tensor output of the
+              last convolutional layer.
+          - `avg` means that global average pooling
+              will be applied to the output of the
+              last convolutional layer, and thus
+              the output of the model will be a
+              2D tensor.
+          - `max` means that global max pooling will
+              be applied.
+      classes: Optional number of classes to classify images
+          into, only to be specified if `include_top` is True, and
+          if no `weights` argument is specified.
+
+  Returns:
+      A Keras model instance.
+
+  Raises:
+      ValueError: In case of invalid argument for `weights`,
+          or invalid input shape.
+      RuntimeError: If attempting to run this model with a
+          backend that does not support separable convolutions.
+  """
+  return NASNet(
+      input_shape,
+      penultimate_filters=1056,
+      num_blocks=4,
+      stem_block_filters=32,
+      skip_reduction=False,
+      filter_multiplier=2,
+      include_top=include_top,
+      weights=weights,
+      input_tensor=input_tensor,
+      pooling=pooling,
+      classes=classes,
+      default_size=224)
+
+
+def _separable_conv_block(ip,
+                          filters,
+                          kernel_size=(3, 3),
+                          strides=(1, 1),
+                          block_id=None):
+  """Adds 2 blocks of [relu-separable conv-batchnorm].
+
+  Arguments:
+      ip: Input tensor
+      filters: Number of output filters per layer
+      kernel_size: Kernel size of separable convolutions
+      strides: Strided convolution for downsampling
+      block_id: String block_id
+
+  Returns:
+      A Keras tensor
+  """
+  channel_dim = 1 if K.image_data_format() == 'channels_first' else -1
+
+  with K.name_scope('separable_conv_block_%s' % block_id):
+    x = Activation('relu')(ip)
+    x = SeparableConv2D(
+        filters,
+        kernel_size,
+        strides=strides,
+        name='separable_conv_1_%s' % block_id,
+        padding='same',
+        use_bias=False,
+        kernel_initializer='he_normal')(
+            x)
+    x = BatchNormalization(
+        axis=channel_dim,
+        momentum=0.9997,
+        epsilon=1e-3,
+        name='separable_conv_1_bn_%s' % (block_id))(
+            x)
+    x = Activation('relu')(x)
+    x = SeparableConv2D(
+        filters,
+        kernel_size,
+        name='separable_conv_2_%s' % block_id,
+        padding='same',
+        use_bias=False,
+        kernel_initializer='he_normal')(
+            x)
+    x = BatchNormalization(
+        axis=channel_dim,
+        momentum=0.9997,
+        epsilon=1e-3,
+        name='separable_conv_2_bn_%s' % (block_id))(
+            x)
+  return x
+
+
+def _adjust_block(p, ip, filters, block_id=None):
+  """Adjusts the input `previous path` to match the shape of the `input`.
+
+  Used in situations where the output number of filters needs to be changed.
+
+  Arguments:
+      p: Input tensor which needs to be modified
+      ip: Input tensor whose shape needs to be matched
+      filters: Number of output filters to be matched
+      block_id: String block_id
+
+  Returns:
+      Adjusted Keras tensor
+  """
+  channel_dim = 1 if K.image_data_format() == 'channels_first' else -1
+  img_dim = 2 if K.image_data_format() == 'channels_first' else -2
+
+  ip_shape = K.int_shape(ip)
+
+  if p is not None:
+    p_shape = K.int_shape(p)
+
+  with K.name_scope('adjust_block'):
+    if p is None:
+      p = ip
+
+    elif p_shape[img_dim] != ip_shape[img_dim]:
+      with K.name_scope('adjust_reduction_block_%s' % block_id):
+        p = Activation('relu', name='adjust_relu_1_%s' % block_id)(p)
+
+        p1 = AveragePooling2D(
+            (1, 1),
+            strides=(2, 2),
+            padding='valid',
+            name='adjust_avg_pool_1_%s' % block_id)(
+                p)
+        p1 = Conv2D(
+            filters // 2, (1, 1),
+            padding='same',
+            use_bias=False,
+            name='adjust_conv_1_%s' % block_id,
+            kernel_initializer='he_normal')(
+                p1)
+
+        p2 = ZeroPadding2D(padding=((0, 1), (0, 1)))(p)
+        p2 = Cropping2D(cropping=((1, 0), (1, 0)))(p2)
+        p2 = AveragePooling2D(
+            (1, 1),
+            strides=(2, 2),
+            padding='valid',
+            name='adjust_avg_pool_2_%s' % block_id)(
+                p2)
+        p2 = Conv2D(
+            filters // 2, (1, 1),
+            padding='same',
+            use_bias=False,
+            name='adjust_conv_2_%s' % block_id,
+            kernel_initializer='he_normal')(
+                p2)
+
+        p = concatenate([p1, p2], axis=channel_dim)
+        p = BatchNormalization(
+            axis=channel_dim,
+            momentum=0.9997,
+            epsilon=1e-3,
+            name='adjust_bn_%s' % block_id)(
+                p)
+
+    elif p_shape[channel_dim] != filters:
+      with K.name_scope('adjust_projection_block_%s' % block_id):
+        p = Activation('relu')(p)
+        p = Conv2D(
+            filters, (1, 1),
+            strides=(1, 1),
+            padding='same',
+            name='adjust_conv_projection_%s' % block_id,
+            use_bias=False,
+            kernel_initializer='he_normal')(
+                p)
+        p = BatchNormalization(
+            axis=channel_dim,
+            momentum=0.9997,
+            epsilon=1e-3,
+            name='adjust_bn_%s' % block_id)(
+                p)
+  return p
+
+
+def _normal_a_cell(ip, p, filters, block_id=None):
+  """Adds a Normal cell for NASNet-A (Fig. 4 in the paper).
+
+  Arguments:
+      ip: Input tensor `x`
+      p: Input tensor `p`
+      filters: Number of output filters
+      block_id: String block_id
+
+  Returns:
+      A Keras tensor
+  """
+  channel_dim = 1 if K.image_data_format() == 'channels_first' else -1
+
+  with K.name_scope('normal_A_block_%s' % block_id):
+    p = _adjust_block(p, ip, filters, block_id)
+
+    h = Activation('relu')(ip)
+    h = Conv2D(
+        filters, (1, 1),
+        strides=(1, 1),
+        padding='same',
+        name='normal_conv_1_%s' % block_id,
+        use_bias=False,
+        kernel_initializer='he_normal')(
+            h)
+    h = BatchNormalization(
+        axis=channel_dim,
+        momentum=0.9997,
+        epsilon=1e-3,
+        name='normal_bn_1_%s' % block_id)(
+            h)
+
+    with K.name_scope('block_1'):
+      x1_1 = _separable_conv_block(
+          h, filters, kernel_size=(5, 5), block_id='normal_left1_%s' % block_id)
+      x1_2 = _separable_conv_block(
+          p, filters, block_id='normal_right1_%s' % block_id)
+      x1 = add([x1_1, x1_2], name='normal_add_1_%s' % block_id)
+
+    with K.name_scope('block_2'):
+      x2_1 = _separable_conv_block(
+          p, filters, (5, 5), block_id='normal_left2_%s' % block_id)
+      x2_2 = _separable_conv_block(
+          p, filters, (3, 3), block_id='normal_right2_%s' % block_id)
+      x2 = add([x2_1, x2_2], name='normal_add_2_%s' % block_id)
+
+    with K.name_scope('block_3'):
+      x3 = AveragePooling2D(
+          (3, 3),
+          strides=(1, 1),
+          padding='same',
+          name='normal_left3_%s' % (block_id))(
+              h)
+      x3 = add([x3, p], name='normal_add_3_%s' % block_id)
+
+    with K.name_scope('block_4'):
+      x4_1 = AveragePooling2D(
+          (3, 3),
+          strides=(1, 1),
+          padding='same',
+          name='normal_left4_%s' % (block_id))(
+              p)
+      x4_2 = AveragePooling2D(
+          (3, 3),
+          strides=(1, 1),
+          padding='same',
+          name='normal_right4_%s' % (block_id))(
+              p)
+      x4 = add([x4_1, x4_2], name='normal_add_4_%s' % block_id)
+
+    with K.name_scope('block_5'):
+      x5 = _separable_conv_block(
+          h, filters, block_id='normal_left5_%s' % block_id)
+      x5 = add([x5, h], name='normal_add_5_%s' % block_id)
+
+    x = concatenate(
+        [p, x1, x2, x3, x4, x5],
+        axis=channel_dim,
+        name='normal_concat_%s' % block_id)
+  return x, ip
+
+
+def _reduction_a_cell(ip, p, filters, block_id=None):
+  """Adds a Reduction cell for NASNet-A (Fig. 4 in the paper).
+
+  Arguments:
+      ip: Input tensor `x`
+      p: Input tensor `p`
+      filters: Number of output filters
+      block_id: String block_id
+
+  Returns:
+      A Keras tensor
+  """
+  channel_dim = 1 if K.image_data_format() == 'channels_first' else -1
+
+  with K.name_scope('reduction_A_block_%s' % block_id):
+    p = _adjust_block(p, ip, filters, block_id)
+
+    h = Activation('relu')(ip)
+    h = Conv2D(
+        filters, (1, 1),
+        strides=(1, 1),
+        padding='same',
+        name='reduction_conv_1_%s' % block_id,
+        use_bias=False,
+        kernel_initializer='he_normal')(
+            h)
+    h = BatchNormalization(
+        axis=channel_dim,
+        momentum=0.9997,
+        epsilon=1e-3,
+        name='reduction_bn_1_%s' % block_id)(
+            h)
+
+    with K.name_scope('block_1'):
+      x1_1 = _separable_conv_block(
+          h,
+          filters, (5, 5),
+          strides=(2, 2),
+          block_id='reduction_left1_%s' % block_id)
+      x1_2 = _separable_conv_block(
+          p,
+          filters, (7, 7),
+          strides=(2, 2),
+          block_id='reduction_1_%s' % block_id)
+      x1 = add([x1_1, x1_2], name='reduction_add_1_%s' % block_id)
+
+    with K.name_scope('block_2'):
+      x2_1 = MaxPooling2D(
+          (3, 3),
+          strides=(2, 2),
+          padding='same',
+          name='reduction_left2_%s' % block_id)(
+              h)
+      x2_2 = _separable_conv_block(
+          p,
+          filters, (7, 7),
+          strides=(2, 2),
+          block_id='reduction_right2_%s' % block_id)
+      x2 = add([x2_1, x2_2], name='reduction_add_2_%s' % block_id)
+
+    with K.name_scope('block_3'):
+      x3_1 = AveragePooling2D(
+          (3, 3),
+          strides=(2, 2),
+          padding='same',
+          name='reduction_left3_%s' % block_id)(
+              h)
+      x3_2 = _separable_conv_block(
+          p,
+          filters, (5, 5),
+          strides=(2, 2),
+          block_id='reduction_right3_%s' % block_id)
+      x3 = add([x3_1, x3_2], name='reduction_add3_%s' % block_id)
+
+    with K.name_scope('block_4'):
+      x4 = AveragePooling2D(
+          (3, 3),
+          strides=(1, 1),
+          padding='same',
+          name='reduction_left4_%s' % block_id)(
+              x1)
+      x4 = add([x2, x4])
+
+    with K.name_scope('block_5'):
+      x5_1 = _separable_conv_block(
+          x1, filters, (3, 3), block_id='reduction_left4_%s' % block_id)
+      x5_2 = MaxPooling2D(
+          (3, 3),
+          strides=(2, 2),
+          padding='same',
+          name='reduction_right5_%s' % block_id)(
+              h)
+      x5 = add([x5_1, x5_2], name='reduction_add4_%s' % block_id)
+
+    x = concatenate(
+        [x2, x3, x4, x5],
+        axis=channel_dim,
+        name='reduction_concat_%s' % block_id)
+    return x, ip
diff --git a/tensorflow/python/keras/_impl/keras/applications/nasnet_test.py b/tensorflow/python/keras/_impl/keras/applications/nasnet_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..aa1dec670cb995e47bdcf88bd69594c532781b18
--- /dev/null
+++ b/tensorflow/python/keras/_impl/keras/applications/nasnet_test.py
@@ -0,0 +1,76 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Nasnet application."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.keras._impl import keras
+from tensorflow.python.platform import test
+
+
+class NASNetMobileTest(test.TestCase):
+
+  def test_with_top(self):
+    model = keras.applications.NASNetMobile(weights=None)
+    self.assertEqual(model.output_shape, (None, 1000))
+
+  def test_no_top(self):
+    model = keras.applications.NASNetMobile(weights=None, include_top=False)
+    self.assertEqual(model.output_shape, (None, None, None, 1056))
+
+  def test_with_pooling(self):
+    model = keras.applications.NASNetMobile(weights=None,
+                                            include_top=False,
+                                            pooling='avg')
+    self.assertEqual(model.output_shape, (None, 1056))
+
+  def test_weight_loading(self):
+    with self.assertRaises(ValueError):
+      keras.applications.NASNetMobile(weights='unknown',
+                                      include_top=False)
+    with self.assertRaises(ValueError):
+      keras.applications.NASNetMobile(weights='imagenet',
+                                      classes=2000)
+
+
+class NASNetLargeTest(test.TestCase):
+
+  def test_with_top(self):
+    model = keras.applications.NASNetLarge(weights=None)
+    self.assertEqual(model.output_shape, (None, 1000))
+
+  def test_no_top(self):
+    model = keras.applications.NASNetLarge(weights=None, include_top=False)
+    self.assertEqual(model.output_shape, (None, None, None, 4032))
+
+  def test_with_pooling(self):
+    model = keras.applications.NASNetLarge(weights=None,
+                                           include_top=False,
+                                           pooling='avg')
+    self.assertEqual(model.output_shape, (None, 4032))
+
+  def test_weight_loading(self):
+    with self.assertRaises(ValueError):
+      keras.applications.NASNetLarge(weights='unknown',
+                                     include_top=False)
+    with self.assertRaises(ValueError):
+      keras.applications.NASNetLarge(weights='imagenet',
+                                     classes=2000)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/keras/_impl/keras/applications/resnet50.py b/tensorflow/python/keras/_impl/keras/applications/resnet50.py
index 8ab46693aa6e46de6c6df320c745ca9ed01fbe0b..5705b3481afac4fe8f50d91c3099db5d4fb63878 100644
--- a/tensorflow/python/keras/_impl/keras/applications/resnet50.py
+++ b/tensorflow/python/keras/_impl/keras/applications/resnet50.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 # ==============================================================================
 # pylint: disable=invalid-name
+# pylint: disable=unused-import
 """ResNet50 model for Keras.
 
 # Reference:
@@ -31,8 +32,8 @@ import os
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras import layers
 from tensorflow.python.keras._impl.keras.applications.imagenet_utils import _obtain_input_shape
-from tensorflow.python.keras._impl.keras.applications.imagenet_utils import decode_predictions  # pylint: disable=unused-import
-from tensorflow.python.keras._impl.keras.applications.imagenet_utils import preprocess_input  # pylint: disable=unused-import
+from tensorflow.python.keras._impl.keras.applications.imagenet_utils import decode_predictions
+from tensorflow.python.keras._impl.keras.applications.imagenet_utils import preprocess_input
 from tensorflow.python.keras._impl.keras.engine.topology import get_source_inputs
 from tensorflow.python.keras._impl.keras.layers import Activation
 from tensorflow.python.keras._impl.keras.layers import AveragePooling2D
@@ -45,7 +46,9 @@ from tensorflow.python.keras._impl.keras.layers import GlobalMaxPooling2D
 from tensorflow.python.keras._impl.keras.layers import Input
 from tensorflow.python.keras._impl.keras.layers import MaxPooling2D
 from tensorflow.python.keras._impl.keras.models import Model
+from tensorflow.python.keras._impl.keras.utils import layer_utils
 from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
+from tensorflow.python.platform import tf_logging as logging
 
 
 WEIGHTS_PATH = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.2/resnet50_weights_tf_dim_ordering_tf_kernels.h5'
@@ -78,7 +81,8 @@ def identity_block(input_tensor, kernel_size, filters, stage, block):
   x = Activation('relu')(x)
 
   x = Conv2D(
-      filters2, kernel_size, padding='same', name=conv_name_base + '2b')(x)
+      filters2, kernel_size, padding='same', name=conv_name_base + '2b')(
+          x)
   x = BatchNormalization(axis=bn_axis, name=bn_name_base + '2b')(x)
   x = Activation('relu')(x)
 
@@ -92,7 +96,7 @@ def identity_block(input_tensor, kernel_size, filters, stage, block):
 
 def conv_block(input_tensor, kernel_size, filters, stage, block, strides=(2,
                                                                           2)):
-  """conv_block is the block that has a conv layer at shortcut.
+  """A block that has a conv layer at shortcut.
 
   Arguments:
       input_tensor: input tensor
@@ -100,14 +104,14 @@ def conv_block(input_tensor, kernel_size, filters, stage, block, strides=(2,
       filters: list of integers, the filters of 3 conv layer at main path
       stage: integer, current stage label, used for generating layer names
       block: 'a','b'..., current block label, used for generating layer names
-      strides: Tuple of integers.
+      strides: Strides for the first conv layer in the block.
 
   Returns:
       Output tensor for the block.
 
-  Note that from stage 3, the first conv layer at main path is with
-  strides=(2,2)
-  And the shortcut should have strides=(2,2) as well
+  Note that from stage 3,
+  the first conv layer at main path is with strides=(2, 2)
+  And the shortcut should have strides=(2, 2) as well
   """
   filters1, filters2, filters3 = filters
   if K.image_data_format() == 'channels_last':
@@ -118,13 +122,14 @@ def conv_block(input_tensor, kernel_size, filters, stage, block, strides=(2,
   bn_name_base = 'bn' + str(stage) + block + '_branch'
 
   x = Conv2D(
-      filters1, (1, 1), strides=strides,
-      name=conv_name_base + '2a')(input_tensor)
+      filters1, (1, 1), strides=strides, name=conv_name_base + '2a')(
+          input_tensor)
   x = BatchNormalization(axis=bn_axis, name=bn_name_base + '2a')(x)
   x = Activation('relu')(x)
 
   x = Conv2D(
-      filters2, kernel_size, padding='same', name=conv_name_base + '2b')(x)
+      filters2, kernel_size, padding='same', name=conv_name_base + '2b')(
+          x)
   x = BatchNormalization(axis=bn_axis, name=bn_name_base + '2b')(x)
   x = Activation('relu')(x)
 
@@ -132,8 +137,8 @@ def conv_block(input_tensor, kernel_size, filters, stage, block, strides=(2,
   x = BatchNormalization(axis=bn_axis, name=bn_name_base + '2c')(x)
 
   shortcut = Conv2D(
-      filters3, (1, 1), strides=strides,
-      name=conv_name_base + '1')(input_tensor)
+      filters3, (1, 1), strides=strides, name=conv_name_base + '1')(
+          input_tensor)
   shortcut = BatchNormalization(axis=bn_axis, name=bn_name_base + '1')(shortcut)
 
   x = layers.add([x, shortcut])
@@ -152,7 +157,7 @@ def ResNet50(include_top=True,
   Optionally loads weights pre-trained
   on ImageNet. Note that when using TensorFlow,
   for best performance you should set
-  `image_data_format="channels_last"` in your Keras config
+  `image_data_format='channels_last'` in your Keras config
   at ~/.keras/keras.json.
 
   The model and the weights are compatible with both
@@ -164,15 +169,15 @@ def ResNet50(include_top=True,
       include_top: whether to include the fully-connected
           layer at the top of the network.
       weights: one of `None` (random initialization),
-          'imagenet' (pre-training on ImageNet),
-          or the path to the weights file to be loaded.
+            'imagenet' (pre-training on ImageNet),
+            or the path to the weights file to be loaded.
       input_tensor: optional Keras tensor (i.e. output of `layers.Input()`)
           to use as image input for the model.
       input_shape: optional shape tuple, only to be specified
           if `include_top` is False (otherwise the input shape
           has to be `(224, 224, 3)` (with `channels_last` data format)
           or `(3, 224, 224)` (with `channels_first` data format).
-          It should have exactly 3 input channels,
+          It should have exactly 3 inputs channels,
           and width and height should be no smaller than 197.
           E.g. `(200, 200, 3)` would be one valid value.
       pooling: Optional pooling mode for feature extraction
@@ -219,15 +224,18 @@ def ResNet50(include_top=True,
   if input_tensor is None:
     img_input = Input(shape=input_shape)
   else:
-    img_input = Input(tensor=input_tensor, shape=input_shape)
-
+    if not K.is_keras_tensor(input_tensor):
+      img_input = Input(tensor=input_tensor, shape=input_shape)
+    else:
+      img_input = input_tensor
   if K.image_data_format() == 'channels_last':
     bn_axis = 3
   else:
     bn_axis = 1
 
-  x = Conv2D(64, (7, 7),
-             strides=(2, 2), padding='same', name='conv1')(img_input)
+  x = Conv2D(
+      64, (7, 7), strides=(2, 2), padding='same', name='conv1')(
+          img_input)
   x = BatchNormalization(axis=bn_axis, name='bn_conv1')(x)
   x = Activation('relu')(x)
   x = MaxPooling2D((3, 3), strides=(2, 2))(x)
@@ -289,4 +297,5 @@ def ResNet50(include_top=True,
     model.load_weights(weights_path)
   elif weights is not None:
     model.load_weights(weights)
+
   return model
diff --git a/tensorflow/python/keras/_impl/keras/applications/vgg16.py b/tensorflow/python/keras/_impl/keras/applications/vgg16.py
index 38dbbdc809e708cc19d5529665352fe4807fad90..c91c24e6fbed100c55d38432b2338a51213f64d2 100644
--- a/tensorflow/python/keras/_impl/keras/applications/vgg16.py
+++ b/tensorflow/python/keras/_impl/keras/applications/vgg16.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 # ==============================================================================
 # pylint: disable=invalid-name
+# pylint: disable=unused-import
 """VGG16 model for Keras.
 
 # Reference
@@ -29,8 +30,8 @@ import os
 
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.applications.imagenet_utils import _obtain_input_shape
-from tensorflow.python.keras._impl.keras.applications.imagenet_utils import decode_predictions  # pylint: disable=unused-import
-from tensorflow.python.keras._impl.keras.applications.imagenet_utils import preprocess_input  # pylint: disable=unused-import
+from tensorflow.python.keras._impl.keras.applications.imagenet_utils import decode_predictions
+from tensorflow.python.keras._impl.keras.applications.imagenet_utils import preprocess_input
 from tensorflow.python.keras._impl.keras.engine.topology import get_source_inputs
 from tensorflow.python.keras._impl.keras.layers import Conv2D
 from tensorflow.python.keras._impl.keras.layers import Dense
@@ -42,6 +43,7 @@ from tensorflow.python.keras._impl.keras.layers import MaxPooling2D
 from tensorflow.python.keras._impl.keras.models import Model
 from tensorflow.python.keras._impl.keras.utils import layer_utils
 from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
+from tensorflow.python.platform import tf_logging as logging
 
 
 WEIGHTS_PATH = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.1/vgg16_weights_tf_dim_ordering_tf_kernels.h5'
@@ -59,7 +61,7 @@ def VGG16(include_top=True,
   Optionally loads weights pre-trained
   on ImageNet. Note that when using TensorFlow,
   for best performance you should set
-  `image_data_format="channels_last"` in your Keras config
+  `image_data_format='channels_last'` in your Keras config
   at ~/.keras/keras.json.
 
   The model and the weights are compatible with both
@@ -71,8 +73,8 @@ def VGG16(include_top=True,
       include_top: whether to include the 3 fully-connected
           layers at the top of the network.
       weights: one of `None` (random initialization),
-          'imagenet' (pre-training on ImageNet),
-          or the path to the weights file to be loaded.
+            'imagenet' (pre-training on ImageNet),
+            or the path to the weights file to be loaded.
       input_tensor: optional Keras tensor (i.e. output of `layers.Input()`)
           to use as image input for the model.
       input_shape: optional shape tuple, only to be specified
@@ -125,48 +127,62 @@ def VGG16(include_top=True,
   if input_tensor is None:
     img_input = Input(shape=input_shape)
   else:
-    img_input = Input(tensor=input_tensor, shape=input_shape)
-
+    if not K.is_keras_tensor(input_tensor):
+      img_input = Input(tensor=input_tensor, shape=input_shape)
+    else:
+      img_input = input_tensor
   # Block 1
   x = Conv2D(
-      64, (3, 3), activation='relu', padding='same',
-      name='block1_conv1')(img_input)
+      64, (3, 3), activation='relu', padding='same', name='block1_conv1')(
+          img_input)
   x = Conv2D(
-      64, (3, 3), activation='relu', padding='same', name='block1_conv2')(x)
+      64, (3, 3), activation='relu', padding='same', name='block1_conv2')(
+          x)
   x = MaxPooling2D((2, 2), strides=(2, 2), name='block1_pool')(x)
 
   # Block 2
   x = Conv2D(
-      128, (3, 3), activation='relu', padding='same', name='block2_conv1')(x)
+      128, (3, 3), activation='relu', padding='same', name='block2_conv1')(
+          x)
   x = Conv2D(
-      128, (3, 3), activation='relu', padding='same', name='block2_conv2')(x)
+      128, (3, 3), activation='relu', padding='same', name='block2_conv2')(
+          x)
   x = MaxPooling2D((2, 2), strides=(2, 2), name='block2_pool')(x)
 
   # Block 3
   x = Conv2D(
-      256, (3, 3), activation='relu', padding='same', name='block3_conv1')(x)
+      256, (3, 3), activation='relu', padding='same', name='block3_conv1')(
+          x)
   x = Conv2D(
-      256, (3, 3), activation='relu', padding='same', name='block3_conv2')(x)
+      256, (3, 3), activation='relu', padding='same', name='block3_conv2')(
+          x)
   x = Conv2D(
-      256, (3, 3), activation='relu', padding='same', name='block3_conv3')(x)
+      256, (3, 3), activation='relu', padding='same', name='block3_conv3')(
+          x)
   x = MaxPooling2D((2, 2), strides=(2, 2), name='block3_pool')(x)
 
   # Block 4
   x = Conv2D(
-      512, (3, 3), activation='relu', padding='same', name='block4_conv1')(x)
+      512, (3, 3), activation='relu', padding='same', name='block4_conv1')(
+          x)
   x = Conv2D(
-      512, (3, 3), activation='relu', padding='same', name='block4_conv2')(x)
+      512, (3, 3), activation='relu', padding='same', name='block4_conv2')(
+          x)
   x = Conv2D(
-      512, (3, 3), activation='relu', padding='same', name='block4_conv3')(x)
+      512, (3, 3), activation='relu', padding='same', name='block4_conv3')(
+          x)
   x = MaxPooling2D((2, 2), strides=(2, 2), name='block4_pool')(x)
 
   # Block 5
   x = Conv2D(
-      512, (3, 3), activation='relu', padding='same', name='block5_conv1')(x)
+      512, (3, 3), activation='relu', padding='same', name='block5_conv1')(
+          x)
   x = Conv2D(
-      512, (3, 3), activation='relu', padding='same', name='block5_conv2')(x)
+      512, (3, 3), activation='relu', padding='same', name='block5_conv2')(
+          x)
   x = Conv2D(
-      512, (3, 3), activation='relu', padding='same', name='block5_conv3')(x)
+      512, (3, 3), activation='relu', padding='same', name='block5_conv3')(
+          x)
   x = MaxPooling2D((2, 2), strides=(2, 2), name='block5_pool')(x)
 
   if include_top:
@@ -215,6 +231,8 @@ def VGG16(include_top=True,
         dense = model.get_layer(name='fc1')
         layer_utils.convert_dense_weights_data_format(dense, shape,
                                                       'channels_first')
+
   elif weights is not None:
     model.load_weights(weights)
+
   return model
diff --git a/tensorflow/python/keras/_impl/keras/applications/vgg19.py b/tensorflow/python/keras/_impl/keras/applications/vgg19.py
index 126c64260b51a7d4e6ca653e850e22c03799dcb0..223cd79d7bfb660098273e46444d9588ae10f7a3 100644
--- a/tensorflow/python/keras/_impl/keras/applications/vgg19.py
+++ b/tensorflow/python/keras/_impl/keras/applications/vgg19.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 # ==============================================================================
 # pylint: disable=invalid-name
+# pylint: disable=unused-import
 """VGG19 model for Keras.
 
 # Reference
@@ -29,8 +30,8 @@ import os
 
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.applications.imagenet_utils import _obtain_input_shape
-from tensorflow.python.keras._impl.keras.applications.imagenet_utils import decode_predictions  # pylint: disable=unused-import
-from tensorflow.python.keras._impl.keras.applications.imagenet_utils import preprocess_input  # pylint: disable=unused-import
+from tensorflow.python.keras._impl.keras.applications.imagenet_utils import decode_predictions
+from tensorflow.python.keras._impl.keras.applications.imagenet_utils import preprocess_input
 from tensorflow.python.keras._impl.keras.engine.topology import get_source_inputs
 from tensorflow.python.keras._impl.keras.layers import Conv2D
 from tensorflow.python.keras._impl.keras.layers import Dense
@@ -42,6 +43,7 @@ from tensorflow.python.keras._impl.keras.layers import MaxPooling2D
 from tensorflow.python.keras._impl.keras.models import Model
 from tensorflow.python.keras._impl.keras.utils import layer_utils
 from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
+from tensorflow.python.platform import tf_logging as logging
 
 
 WEIGHTS_PATH = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.1/vgg19_weights_tf_dim_ordering_tf_kernels.h5'
@@ -59,7 +61,7 @@ def VGG19(include_top=True,
   Optionally loads weights pre-trained
   on ImageNet. Note that when using TensorFlow,
   for best performance you should set
-  `image_data_format="channels_last"` in your Keras config
+  `image_data_format='channels_last'` in your Keras config
   at ~/.keras/keras.json.
 
   The model and the weights are compatible with both
@@ -71,15 +73,15 @@ def VGG19(include_top=True,
       include_top: whether to include the 3 fully-connected
           layers at the top of the network.
       weights: one of `None` (random initialization),
-         'imagenet' (pre-training on ImageNet),
-         or the path to the weights file to be loaded.
+            'imagenet' (pre-training on ImageNet),
+            or the path to the weights file to be loaded.
       input_tensor: optional Keras tensor (i.e. output of `layers.Input()`)
           to use as image input for the model.
       input_shape: optional shape tuple, only to be specified
           if `include_top` is False (otherwise the input shape
           has to be `(224, 224, 3)` (with `channels_last` data format)
           or `(3, 224, 224)` (with `channels_first` data format).
-          It should have exactly 3 input channels,
+          It should have exactly 3 inputs channels,
           and width and height should be no smaller than 48.
           E.g. `(200, 200, 3)` would be one valid value.
       pooling: Optional pooling mode for feature extraction
@@ -125,54 +127,71 @@ def VGG19(include_top=True,
   if input_tensor is None:
     img_input = Input(shape=input_shape)
   else:
-    img_input = Input(tensor=input_tensor, shape=input_shape)
-
+    if not K.is_keras_tensor(input_tensor):
+      img_input = Input(tensor=input_tensor, shape=input_shape)
+    else:
+      img_input = input_tensor
   # Block 1
   x = Conv2D(
-      64, (3, 3), activation='relu', padding='same',
-      name='block1_conv1')(img_input)
+      64, (3, 3), activation='relu', padding='same', name='block1_conv1')(
+          img_input)
   x = Conv2D(
-      64, (3, 3), activation='relu', padding='same', name='block1_conv2')(x)
+      64, (3, 3), activation='relu', padding='same', name='block1_conv2')(
+          x)
   x = MaxPooling2D((2, 2), strides=(2, 2), name='block1_pool')(x)
 
   # Block 2
   x = Conv2D(
-      128, (3, 3), activation='relu', padding='same', name='block2_conv1')(x)
+      128, (3, 3), activation='relu', padding='same', name='block2_conv1')(
+          x)
   x = Conv2D(
-      128, (3, 3), activation='relu', padding='same', name='block2_conv2')(x)
+      128, (3, 3), activation='relu', padding='same', name='block2_conv2')(
+          x)
   x = MaxPooling2D((2, 2), strides=(2, 2), name='block2_pool')(x)
 
   # Block 3
   x = Conv2D(
-      256, (3, 3), activation='relu', padding='same', name='block3_conv1')(x)
+      256, (3, 3), activation='relu', padding='same', name='block3_conv1')(
+          x)
   x = Conv2D(
-      256, (3, 3), activation='relu', padding='same', name='block3_conv2')(x)
+      256, (3, 3), activation='relu', padding='same', name='block3_conv2')(
+          x)
   x = Conv2D(
-      256, (3, 3), activation='relu', padding='same', name='block3_conv3')(x)
+      256, (3, 3), activation='relu', padding='same', name='block3_conv3')(
+          x)
   x = Conv2D(
-      256, (3, 3), activation='relu', padding='same', name='block3_conv4')(x)
+      256, (3, 3), activation='relu', padding='same', name='block3_conv4')(
+          x)
   x = MaxPooling2D((2, 2), strides=(2, 2), name='block3_pool')(x)
 
   # Block 4
   x = Conv2D(
-      512, (3, 3), activation='relu', padding='same', name='block4_conv1')(x)
+      512, (3, 3), activation='relu', padding='same', name='block4_conv1')(
+          x)
   x = Conv2D(
-      512, (3, 3), activation='relu', padding='same', name='block4_conv2')(x)
+      512, (3, 3), activation='relu', padding='same', name='block4_conv2')(
+          x)
   x = Conv2D(
-      512, (3, 3), activation='relu', padding='same', name='block4_conv3')(x)
+      512, (3, 3), activation='relu', padding='same', name='block4_conv3')(
+          x)
   x = Conv2D(
-      512, (3, 3), activation='relu', padding='same', name='block4_conv4')(x)
+      512, (3, 3), activation='relu', padding='same', name='block4_conv4')(
+          x)
   x = MaxPooling2D((2, 2), strides=(2, 2), name='block4_pool')(x)
 
   # Block 5
   x = Conv2D(
-      512, (3, 3), activation='relu', padding='same', name='block5_conv1')(x)
+      512, (3, 3), activation='relu', padding='same', name='block5_conv1')(
+          x)
   x = Conv2D(
-      512, (3, 3), activation='relu', padding='same', name='block5_conv2')(x)
+      512, (3, 3), activation='relu', padding='same', name='block5_conv2')(
+          x)
   x = Conv2D(
-      512, (3, 3), activation='relu', padding='same', name='block5_conv3')(x)
+      512, (3, 3), activation='relu', padding='same', name='block5_conv3')(
+          x)
   x = Conv2D(
-      512, (3, 3), activation='relu', padding='same', name='block5_conv4')(x)
+      512, (3, 3), activation='relu', padding='same', name='block5_conv4')(
+          x)
   x = MaxPooling2D((2, 2), strides=(2, 2), name='block5_pool')(x)
 
   if include_top:
@@ -211,6 +230,8 @@ def VGG19(include_top=True,
           cache_subdir='models',
           file_hash='253f8cb515780f3b799900260a226db6')
     model.load_weights(weights_path)
+    if K.backend() == 'theano':
+      layer_utils.convert_all_kernels_in_model(model)
 
     if K.image_data_format() == 'channels_first':
       if include_top:
@@ -219,6 +240,8 @@ def VGG19(include_top=True,
         dense = model.get_layer(name='fc1')
         layer_utils.convert_dense_weights_data_format(dense, shape,
                                                       'channels_first')
+
   elif weights is not None:
     model.load_weights(weights)
+
   return model
diff --git a/tensorflow/python/keras/_impl/keras/applications/xception.py b/tensorflow/python/keras/_impl/keras/applications/xception.py
index 821983140852b9f1ab505376d824db2392f54391..0a6eb4953a7ab82635f5b99ca021898876c02052 100644
--- a/tensorflow/python/keras/_impl/keras/applications/xception.py
+++ b/tensorflow/python/keras/_impl/keras/applications/xception.py
@@ -1,4 +1,4 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,6 +13,7 @@
 # limitations under the License.
 # ==============================================================================
 # pylint: disable=invalid-name
+# pylint: disable=unused-import
 """Xception V1 model for Keras.
 
 On ImageNet, this model gets to a top-1 validation accuracy of 0.790
@@ -42,7 +43,7 @@ from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras import layers
 from tensorflow.python.keras._impl.keras.applications import imagenet_utils
 from tensorflow.python.keras._impl.keras.applications.imagenet_utils import _obtain_input_shape
-from tensorflow.python.keras._impl.keras.applications.imagenet_utils import decode_predictions  # pylint: disable=unused-import
+from tensorflow.python.keras._impl.keras.applications.imagenet_utils import decode_predictions
 from tensorflow.python.keras._impl.keras.engine.topology import get_source_inputs
 from tensorflow.python.keras._impl.keras.layers import Activation
 from tensorflow.python.keras._impl.keras.layers import BatchNormalization
@@ -74,7 +75,7 @@ def Xception(include_top=True,
   on ImageNet. This model is available for TensorFlow only,
   and can only be used with inputs following the TensorFlow
   data format `(width, height, channels)`.
-  You should set `image_data_format="channels_last"` in your Keras config
+  You should set `image_data_format='channels_last'` in your Keras config
   located at ~/.keras/keras.json.
 
   Note that the default input image size for this model is 299x299.
@@ -83,14 +84,14 @@ def Xception(include_top=True,
       include_top: whether to include the fully-connected
           layer at the top of the network.
       weights: one of `None` (random initialization),
-          'imagenet' (pre-training on ImageNet),
-          or the path to the weights file to be loaded.
+            'imagenet' (pre-training on ImageNet),
+            or the path to the weights file to be loaded.
       input_tensor: optional Keras tensor (i.e. output of `layers.Input()`)
           to use as image input for the model.
       input_shape: optional shape tuple, only to be specified
           if `include_top` is False (otherwise the input shape
           has to be `(299, 299, 3)`.
-          It should have exactly 3 input channels,
+          It should have exactly 3 inputs channels,
           and width and height should be no smaller than 71.
           E.g. `(150, 150, 3)` would be one valid value.
       pooling: Optional pooling mode for feature extraction
@@ -155,11 +156,14 @@ def Xception(include_top=True,
   if input_tensor is None:
     img_input = Input(shape=input_shape)
   else:
-    img_input = Input(tensor=input_tensor, shape=input_shape)
+    if not K.is_keras_tensor(input_tensor):
+      img_input = Input(tensor=input_tensor, shape=input_shape)
+    else:
+      img_input = input_tensor
 
   x = Conv2D(
-      32, (3, 3), strides=(2, 2), use_bias=False,
-      name='block1_conv1')(img_input)
+      32, (3, 3), strides=(2, 2), use_bias=False, name='block1_conv1')(
+          img_input)
   x = BatchNormalization(name='block1_conv1_bn')(x)
   x = Activation('relu', name='block1_conv1_act')(x)
   x = Conv2D(64, (3, 3), use_bias=False, name='block1_conv2')(x)
@@ -167,53 +171,65 @@ def Xception(include_top=True,
   x = Activation('relu', name='block1_conv2_act')(x)
 
   residual = Conv2D(
-      128, (1, 1), strides=(2, 2), padding='same', use_bias=False)(x)
+      128, (1, 1), strides=(2, 2), padding='same', use_bias=False)(
+          x)
   residual = BatchNormalization()(residual)
 
   x = SeparableConv2D(
-      128, (3, 3), padding='same', use_bias=False, name='block2_sepconv1')(x)
+      128, (3, 3), padding='same', use_bias=False, name='block2_sepconv1')(
+          x)
   x = BatchNormalization(name='block2_sepconv1_bn')(x)
   x = Activation('relu', name='block2_sepconv2_act')(x)
   x = SeparableConv2D(
-      128, (3, 3), padding='same', use_bias=False, name='block2_sepconv2')(x)
+      128, (3, 3), padding='same', use_bias=False, name='block2_sepconv2')(
+          x)
   x = BatchNormalization(name='block2_sepconv2_bn')(x)
 
   x = MaxPooling2D(
-      (3, 3), strides=(2, 2), padding='same', name='block2_pool')(x)
+      (3, 3), strides=(2, 2), padding='same', name='block2_pool')(
+          x)
   x = layers.add([x, residual])
 
   residual = Conv2D(
-      256, (1, 1), strides=(2, 2), padding='same', use_bias=False)(x)
+      256, (1, 1), strides=(2, 2), padding='same', use_bias=False)(
+          x)
   residual = BatchNormalization()(residual)
 
   x = Activation('relu', name='block3_sepconv1_act')(x)
   x = SeparableConv2D(
-      256, (3, 3), padding='same', use_bias=False, name='block3_sepconv1')(x)
+      256, (3, 3), padding='same', use_bias=False, name='block3_sepconv1')(
+          x)
   x = BatchNormalization(name='block3_sepconv1_bn')(x)
   x = Activation('relu', name='block3_sepconv2_act')(x)
   x = SeparableConv2D(
-      256, (3, 3), padding='same', use_bias=False, name='block3_sepconv2')(x)
+      256, (3, 3), padding='same', use_bias=False, name='block3_sepconv2')(
+          x)
   x = BatchNormalization(name='block3_sepconv2_bn')(x)
 
   x = MaxPooling2D(
-      (3, 3), strides=(2, 2), padding='same', name='block3_pool')(x)
+      (3, 3), strides=(2, 2), padding='same', name='block3_pool')(
+          x)
   x = layers.add([x, residual])
 
   residual = Conv2D(
-      728, (1, 1), strides=(2, 2), padding='same', use_bias=False)(x)
+      728, (1, 1), strides=(2, 2), padding='same', use_bias=False)(
+          x)
   residual = BatchNormalization()(residual)
 
   x = Activation('relu', name='block4_sepconv1_act')(x)
   x = SeparableConv2D(
-      728, (3, 3), padding='same', use_bias=False, name='block4_sepconv1')(x)
+      728, (3, 3), padding='same', use_bias=False, name='block4_sepconv1')(
+          x)
   x = BatchNormalization(name='block4_sepconv1_bn')(x)
   x = Activation('relu', name='block4_sepconv2_act')(x)
   x = SeparableConv2D(
-      728, (3, 3), padding='same', use_bias=False, name='block4_sepconv2')(x)
+      728, (3, 3), padding='same', use_bias=False, name='block4_sepconv2')(
+          x)
   x = BatchNormalization(name='block4_sepconv2_bn')(x)
 
   x = MaxPooling2D(
-      (3, 3), strides=(2, 2), padding='same', name='block4_pool')(x)
+      (3, 3), strides=(2, 2), padding='same', name='block4_pool')(
+          x)
   x = layers.add([x, residual])
 
   for i in range(8):
@@ -222,46 +238,52 @@ def Xception(include_top=True,
 
     x = Activation('relu', name=prefix + '_sepconv1_act')(x)
     x = SeparableConv2D(
-        728, (3, 3), padding='same', use_bias=False,
-        name=prefix + '_sepconv1')(x)
+        728, (3, 3), padding='same', use_bias=False, name=prefix + '_sepconv1')(
+            x)
     x = BatchNormalization(name=prefix + '_sepconv1_bn')(x)
     x = Activation('relu', name=prefix + '_sepconv2_act')(x)
     x = SeparableConv2D(
-        728, (3, 3), padding='same', use_bias=False,
-        name=prefix + '_sepconv2')(x)
+        728, (3, 3), padding='same', use_bias=False, name=prefix + '_sepconv2')(
+            x)
     x = BatchNormalization(name=prefix + '_sepconv2_bn')(x)
     x = Activation('relu', name=prefix + '_sepconv3_act')(x)
     x = SeparableConv2D(
-        728, (3, 3), padding='same', use_bias=False,
-        name=prefix + '_sepconv3')(x)
+        728, (3, 3), padding='same', use_bias=False, name=prefix + '_sepconv3')(
+            x)
     x = BatchNormalization(name=prefix + '_sepconv3_bn')(x)
 
     x = layers.add([x, residual])
 
   residual = Conv2D(
-      1024, (1, 1), strides=(2, 2), padding='same', use_bias=False)(x)
+      1024, (1, 1), strides=(2, 2), padding='same', use_bias=False)(
+          x)
   residual = BatchNormalization()(residual)
 
   x = Activation('relu', name='block13_sepconv1_act')(x)
   x = SeparableConv2D(
-      728, (3, 3), padding='same', use_bias=False, name='block13_sepconv1')(x)
+      728, (3, 3), padding='same', use_bias=False, name='block13_sepconv1')(
+          x)
   x = BatchNormalization(name='block13_sepconv1_bn')(x)
   x = Activation('relu', name='block13_sepconv2_act')(x)
   x = SeparableConv2D(
-      1024, (3, 3), padding='same', use_bias=False, name='block13_sepconv2')(x)
+      1024, (3, 3), padding='same', use_bias=False, name='block13_sepconv2')(
+          x)
   x = BatchNormalization(name='block13_sepconv2_bn')(x)
 
   x = MaxPooling2D(
-      (3, 3), strides=(2, 2), padding='same', name='block13_pool')(x)
+      (3, 3), strides=(2, 2), padding='same', name='block13_pool')(
+          x)
   x = layers.add([x, residual])
 
   x = SeparableConv2D(
-      1536, (3, 3), padding='same', use_bias=False, name='block14_sepconv1')(x)
+      1536, (3, 3), padding='same', use_bias=False, name='block14_sepconv1')(
+          x)
   x = BatchNormalization(name='block14_sepconv1_bn')(x)
   x = Activation('relu', name='block14_sepconv1_act')(x)
 
   x = SeparableConv2D(
-      2048, (3, 3), padding='same', use_bias=False, name='block14_sepconv2')(x)
+      2048, (3, 3), padding='same', use_bias=False, name='block14_sepconv2')(
+          x)
   x = BatchNormalization(name='block14_sepconv2_bn')(x)
   x = Activation('relu', name='block14_sepconv2_act')(x)
 
@@ -303,8 +325,6 @@ def Xception(include_top=True,
 
   if old_data_format:
     K.set_image_data_format(old_data_format)
-  elif weights is not None:
-    model.load_weights(weights)
   return model
 
 
diff --git a/tensorflow/python/keras/_impl/keras/backend.py b/tensorflow/python/keras/_impl/keras/backend.py
index 9476085bd8cbc36f63d3c6c8ecad732b557a4f8a..098ea063f951ef86c2a474aa50d6239b514cc699 100644
--- a/tensorflow/python/keras/_impl/keras/backend.py
+++ b/tensorflow/python/keras/_impl/keras/backend.py
@@ -29,6 +29,7 @@ import numpy as np
 
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session as session_module
+from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes as dtypes_module
 from tensorflow.python.framework import ops
@@ -85,7 +86,7 @@ _MANUAL_VAR_INIT = False
 _FLOATX = 'float32'
 
 # Epsilon fuzz factor used throughout the codebase.
-_EPSILON = 10e-8
+_EPSILON = 1e-7
 
 # Default image data format, one of "channels_last", "channels_first".
 _IMAGE_DATA_FORMAT = 'channels_last'
@@ -116,7 +117,7 @@ def epsilon():
   Example:
   ```python
       >>> keras.backend.epsilon()
-      1e-08
+      1e-07
   ```
   """
   return _EPSILON
@@ -132,7 +133,7 @@ def set_epsilon(value):
   ```python
       >>> from keras import backend as K
       >>> K.epsilon()
-      1e-08
+      1e-07
       >>> K.set_epsilon(1e-05)
       >>> K.epsilon()
       1e-05
@@ -295,7 +296,8 @@ def clear_session():
   ops.reset_default_graph()
   reset_uids()
   _SESSION = None
-  phase = array_ops.placeholder(dtype='bool', name='keras_learning_phase')
+  phase = array_ops.placeholder_with_default(
+      False, shape=(), name='keras_learning_phase')
   _GRAPH_LEARNING_PHASES = {}
   _GRAPH_LEARNING_PHASES[ops.get_default_graph()] = phase
 
@@ -325,10 +327,19 @@ def learning_phase():
 
   Returns:
       Learning phase (scalar integer tensor or Python integer).
+
+  Raises:
+      ValueError: If called when Eager execution is enabled.
   """
+  if context.in_eager_mode():
+    if 'eager' not in _GRAPH_LEARNING_PHASES:
+      raise ValueError('No learning phase set in Eager mode.')
+    return _GRAPH_LEARNING_PHASES['eager']
+
   graph = ops.get_default_graph()
   if graph not in _GRAPH_LEARNING_PHASES:
-    phase = array_ops.placeholder(dtype='bool', name='keras_learning_phase')
+    phase = array_ops.placeholder_with_default(
+        False, shape=(), name='keras_learning_phase')
     _GRAPH_LEARNING_PHASES[graph] = phase
   return _GRAPH_LEARNING_PHASES[graph]
 
@@ -345,7 +356,10 @@ def set_learning_phase(value):
   global _GRAPH_LEARNING_PHASES  # pylint: disable=global-variable-not-assigned
   if value not in {0, 1}:
     raise ValueError('Expected learning phase to be ' '0 or 1.')
-  _GRAPH_LEARNING_PHASES[ops.get_default_graph()] = value
+  if context.in_eager_mode():
+    _GRAPH_LEARNING_PHASES['eager'] = value
+  else:
+    _GRAPH_LEARNING_PHASES[ops.get_default_graph()] = value
 
 
 def get_session():
@@ -876,6 +890,8 @@ def zeros(shape, dtype=None, name=None):
 
   Returns:
       A variable (including Keras metadata), filled with `0.0`.
+      Note that if `shape` was symbolic, we cannot return a variable,
+      and will return a dynamically-shaped tensor instead.
 
   Example:
   ```python
@@ -890,12 +906,14 @@ def zeros(shape, dtype=None, name=None):
   if dtype is None:
     dtype = floatx()
   tf_dtype = dtypes_module.as_dtype(dtype)
-  return variable(
-      init_ops.constant_initializer(0., dtype=tf_dtype)(shape), dtype, name)
+  v = array_ops.zeros(shape=shape, dtype=tf_dtype, name=name)
+  if py_all(v.get_shape().as_list()):
+    return variable(v, dtype=dtype, name=name)
+  return v
 
 
 def ones(shape, dtype=None, name=None):
-  """Instantiates an all-ones tensor variable and returns it.
+  """Instantiates an all-ones variable and returns it.
 
   Arguments:
       shape: Tuple of integers, shape of returned Keras variable.
@@ -904,6 +922,8 @@ def ones(shape, dtype=None, name=None):
 
   Returns:
       A Keras variable, filled with `1.0`.
+      Note that if `shape` was symbolic, we cannot return a variable,
+      and will return a dynamically-shaped tensor instead.
 
   Example:
   ```python
@@ -918,8 +938,10 @@ def ones(shape, dtype=None, name=None):
   if dtype is None:
     dtype = floatx()
   tf_dtype = dtypes_module.as_dtype(dtype)
-  return variable(
-      init_ops.constant_initializer(1., dtype=tf_dtype)(shape), dtype, name)
+  v = array_ops.ones(shape=shape, dtype=tf_dtype, name=name)
+  if py_all(v.get_shape().as_list()):
+    return variable(v, dtype=dtype, name=name)
+  return v
 
 
 def eye(size, dtype=None, name=None):
@@ -1185,7 +1207,7 @@ def moving_average_update(x, value, momentum):
       An Operation to update the variable.
   """
   return moving_averages.assign_moving_average(
-      x, value, momentum, zero_debias=False)
+      x, value, momentum, zero_debias=True)
 
 
 # LINEAR ALGEBRA
@@ -1419,7 +1441,7 @@ def max(x, axis=None, keepdims=False):
   Returns:
       A tensor with maximum values of `x`.
   """
-  return math_ops.reduce_max(x, axis=axis, keep_dims=keepdims)
+  return math_ops.reduce_max(x, axis, keepdims)
 
 
 def min(x, axis=None, keepdims=False):
@@ -1436,7 +1458,7 @@ def min(x, axis=None, keepdims=False):
   Returns:
       A tensor with miminum values of `x`.
   """
-  return math_ops.reduce_min(x, axis=axis, keep_dims=keepdims)
+  return math_ops.reduce_min(x, axis, keepdims)
 
 
 def sum(x, axis=None, keepdims=False):
@@ -1453,7 +1475,7 @@ def sum(x, axis=None, keepdims=False):
   Returns:
       A tensor with sum of `x`.
   """
-  return math_ops.reduce_sum(x, axis=axis, keep_dims=keepdims)
+  return math_ops.reduce_sum(x, axis, keepdims)
 
 
 def prod(x, axis=None, keepdims=False):
@@ -1470,7 +1492,7 @@ def prod(x, axis=None, keepdims=False):
   Returns:
       A tensor with the product of elements of `x`.
   """
-  return math_ops.reduce_prod(x, axis=axis, keep_dims=keepdims)
+  return math_ops.reduce_prod(x, axis, keepdims)
 
 
 def cumsum(x, axis=0):
@@ -1515,10 +1537,10 @@ def var(x, axis=None, keepdims=False):
   """
   if x.dtype.base_dtype == dtypes_module.bool:
     x = math_ops.cast(x, floatx())
-  m = math_ops.reduce_mean(x, axis=axis, keep_dims=True)
+  m = math_ops.reduce_mean(x, axis, True)
   devs_squared = math_ops.square(x - m)
   return math_ops.reduce_mean(
-      devs_squared, axis=axis, keep_dims=keepdims)
+      devs_squared, axis, keepdims)
 
 
 def std(x, axis=None, keepdims=False):
@@ -1546,7 +1568,7 @@ def mean(x, axis=None, keepdims=False):
       axis: A list of integer. Axes to compute the mean.
       keepdims: A boolean, whether to keep the dimensions or not.
           If `keepdims` is `False`, the rank of the tensor is reduced
-          by 1 for each entry in `axis`. If `keep_dims` is `True`,
+          by 1 for each entry in `axis`. If `keepdims` is `True`,
           the reduced dimensions are retained with length 1.
 
   Returns:
@@ -1554,7 +1576,7 @@ def mean(x, axis=None, keepdims=False):
   """
   if x.dtype.base_dtype == dtypes_module.bool:
     x = math_ops.cast(x, floatx())
-  return math_ops.reduce_mean(x, axis=axis, keep_dims=keepdims)
+  return math_ops.reduce_mean(x, axis, keepdims)
 
 
 def any(x, axis=None, keepdims=False):
@@ -1569,7 +1591,7 @@ def any(x, axis=None, keepdims=False):
       A uint8 tensor (0s and 1s).
   """
   x = math_ops.cast(x, dtypes_module.bool)
-  return math_ops.reduce_any(x, axis=axis, keep_dims=keepdims)
+  return math_ops.reduce_any(x, axis, keepdims)
 
 
 def all(x, axis=None, keepdims=False):
@@ -1584,7 +1606,7 @@ def all(x, axis=None, keepdims=False):
       A uint8 tensor (0s and 1s).
   """
   x = math_ops.cast(x, dtypes_module.bool)
-  return math_ops.reduce_all(x, axis=axis, keep_dims=keepdims)
+  return math_ops.reduce_all(x, axis, keepdims)
 
 
 def argmax(x, axis=-1):
@@ -1694,7 +1716,7 @@ def logsumexp(x, axis=None, keepdims=False):
   Returns:
       The reduced tensor.
   """
-  return math_ops.reduce_logsumexp(x, axis=axis, keep_dims=keepdims)
+  return math_ops.reduce_logsumexp(x, axis, keepdims)
 
 
 def round(x):
@@ -1884,6 +1906,108 @@ def cos(x):
   return math_ops.cos(x)
 
 
+def _regular_normalize_batch_in_training(x,
+                                         gamma,
+                                         beta,
+                                         reduction_axes,
+                                         epsilon=1e-3):
+  """Non-fused version of `normalize_batch_in_training`.
+
+  Arguments:
+      x: Input tensor or variable.
+      gamma: Tensor by which to scale the input.
+      beta: Tensor with which to center the input.
+      reduction_axes: iterable of integers,
+          axes over which to normalize.
+      epsilon: Fuzz factor.
+
+  Returns:
+      A tuple length of 3, `(normalized_tensor, mean, variance)`.
+  """
+  mean, var = nn.moments(x, reduction_axes, None, None, False)
+  normed = nn.batch_normalization(x, mean, var, beta, gamma, epsilon)
+  return normed, mean, var
+
+
+def _broadcast_normalize_batch_in_training(x,
+                                           gamma,
+                                           beta,
+                                           reduction_axes,
+                                           epsilon=1e-3):
+  """Non-fused, broadcast version of `normalize_batch_in_training`.
+
+  Arguments:
+      x: Input tensor or variable.
+      gamma: Tensor by which to scale the input.
+      beta: Tensor with which to center the input.
+      reduction_axes: iterable of integers,
+          axes over which to normalize.
+      epsilon: Fuzz factor.
+
+  Returns:
+      A tuple length of 3, `(normalized_tensor, mean, variance)`.
+  """
+  mean, var = nn.moments(x, reduction_axes, None, None, False)
+  target_shape = []
+  for axis in range(ndim(x)):
+    if axis in reduction_axes:
+      target_shape.append(1)
+    else:
+      target_shape.append(array_ops.shape(x)[axis])
+  target_shape = array_ops.stack(target_shape)
+
+  broadcast_mean = array_ops.reshape(mean, target_shape)
+  broadcast_var = array_ops.reshape(var, target_shape)
+  if gamma is None:
+    broadcast_gamma = None
+  else:
+    broadcast_gamma = array_ops.reshape(gamma, target_shape)
+  if beta is None:
+    broadcast_beta = None
+  else:
+    broadcast_beta = array_ops.reshape(beta, target_shape)
+
+  normed = nn.batch_normalization(x, broadcast_mean, broadcast_var,
+                                  broadcast_beta, broadcast_gamma, epsilon)
+  return normed, mean, var
+
+
+def _fused_normalize_batch_in_training(x,
+                                       gamma,
+                                       beta,
+                                       reduction_axes,
+                                       epsilon=1e-3):
+  """Fused version of `normalize_batch_in_training`.
+
+  Arguments:
+      x: Input tensor or variable.
+      gamma: Tensor by which to scale the input.
+      beta: Tensor with which to center the input.
+      reduction_axes: iterable of integers,
+          axes over which to normalize.
+      epsilon: Fuzz factor.
+
+  Returns:
+      A tuple length of 3, `(normalized_tensor, mean, variance)`.
+  """
+  if list(reduction_axes) == [0, 1, 2]:
+    normalization_axis = 3
+    tf_data_format = 'NHWC'
+  else:
+    normalization_axis = 1
+    tf_data_format = 'NCHW'
+
+  if gamma is None:
+    gamma = constant_op.constant(
+        1.0, dtype=x.dtype, shape=[x.get_shape()[normalization_axis]])
+  if beta is None:
+    beta = constant_op.constant(
+        0.0, dtype=x.dtype, shape=[x.get_shape()[normalization_axis]])
+
+  return nn.fused_batch_norm(
+      x, gamma, beta, epsilon=epsilon, data_format=tf_data_format)
+
+
 def normalize_batch_in_training(x, gamma, beta, reduction_axes, epsilon=1e-3):
   """Computes mean and std for batch then apply batch_normalization on batch.
 
@@ -1898,33 +2022,19 @@ def normalize_batch_in_training(x, gamma, beta, reduction_axes, epsilon=1e-3):
   Returns:
       A tuple length of 3, `(normalized_tensor, mean, variance)`.
   """
-  mean, var = nn.moments(
-      x, reduction_axes, shift=None, name=None, keep_dims=False)
-  if sorted(reduction_axes) == list(range(ndim(x)))[:-1]:
-    normed = nn.batch_normalization(x, mean, var, beta, gamma, epsilon)
+  if ndim(x) == 4 and list(reduction_axes) in [[0, 1, 2], [0, 2, 3]]:
+    if not _has_nchw_support() and list(reduction_axes) == [0, 2, 3]:
+      return _broadcast_normalize_batch_in_training(
+          x, gamma, beta, reduction_axes, epsilon=epsilon)
+    return _fused_normalize_batch_in_training(
+        x, gamma, beta, reduction_axes, epsilon=epsilon)
   else:
-    # need broadcasting
-    target_shape = []
-    for axis in range(ndim(x)):
-      if axis in reduction_axes:
-        target_shape.append(1)
-      else:
-        target_shape.append(array_ops.shape(x)[axis])
-    target_shape = array_ops.stack(target_shape)
-
-    broadcast_mean = array_ops.reshape(mean, target_shape)
-    broadcast_var = array_ops.reshape(var, target_shape)
-    if gamma is None:
-      broadcast_gamma = None
-    else:
-      broadcast_gamma = array_ops.reshape(gamma, target_shape)
-    if beta is None:
-      broadcast_beta = None
+    if sorted(reduction_axes) == list(range(ndim(x)))[:-1]:
+      return _regular_normalize_batch_in_training(
+          x, gamma, beta, reduction_axes, epsilon=epsilon)
     else:
-      broadcast_beta = array_ops.reshape(beta, target_shape)
-    normed = nn.batch_normalization(x, broadcast_mean, broadcast_var,
-                                    broadcast_beta, broadcast_gamma, epsilon)
-  return normed, mean, var
+      return _broadcast_normalize_batch_in_training(
+          x, gamma, beta, reduction_axes, epsilon=epsilon)
 
 
 def batch_normalization(x, mean, var, beta, gamma, epsilon=1e-3):
@@ -2619,7 +2729,8 @@ def rnn(step_function,
         go_backwards=False,
         mask=None,
         constants=None,
-        unroll=False):
+        unroll=False,
+        input_length=None):
   """Iterates over the time dimension of a tensor.
 
   Arguments:
@@ -2648,6 +2759,7 @@ def rnn(step_function,
       constants: a list of constant values passed at each step.
       unroll: whether to unroll the RNN or to use a symbolic loop
           (`while_loop` or `scan` depending on backend).
+      input_length: Unused; exists for API compatibility.
 
   Returns:
       A tuple, `(last_output, outputs, new_states)`.
@@ -2665,6 +2777,7 @@ def rnn(step_function,
       ValueError: if `mask` is provided (not `None`) but states is not provided
           (`len(states)` == 0).
   """
+  del input_length
   ndim = len(inputs.get_shape())
   if ndim < 3:
     raise ValueError('Input should be at least 3D.')
@@ -3016,7 +3129,7 @@ def elu(x, alpha=1.):
 
   Arguments:
       x: A tensor or variable to compute the activation function for.
-      alpha: A scalar, slope of positive section.
+      alpha: A scalar, slope of negative section.
 
   Returns:
       A tensor.
@@ -3083,7 +3196,7 @@ def categorical_crossentropy(target, output, from_logits=False):
   if not from_logits:
     # scale preds so that the class probas of each sample sum to 1
     output /= math_ops.reduce_sum(
-        output, axis=len(output.get_shape()) - 1, keep_dims=True)
+        output, len(output.get_shape()) - 1, True)
     # manual computation of crossentropy
     epsilon_ = _to_tensor(epsilon(), output.dtype.base_dtype)
     output = clip_ops.clip_by_value(output, epsilon_, 1. - epsilon_)
@@ -3248,6 +3361,25 @@ def in_top_k(predictions, targets, k):
 # CONVOLUTIONS
 
 
+def _preprocess_conv1d_input(x, data_format):
+  """Transpose and cast the input before the conv1d.
+
+  Arguments:
+      x: input tensor.
+      data_format: string, `"channels_last"` or `"channels_first"`.
+
+  Returns:
+      A tensor.
+  """
+  tf_data_format = 'NHWC'  # to pass TF Conv2dNative operations
+  if data_format == 'channels_first':
+    if not _has_nchw_support():
+      x = array_ops.transpose(x, (0, 2, 1))  # NCW -> NWC
+    else:
+      tf_data_format = 'NCHW'
+  return x, tf_data_format
+
+
 def _preprocess_conv2d_input(x, data_format):
   """Transpose and cast the input before the conv2d.
 
@@ -3461,6 +3593,66 @@ def conv2d_transpose(x,
   return x
 
 
+def separable_conv1d(x,
+                     depthwise_kernel,
+                     pointwise_kernel,
+                     strides=1,
+                     padding='valid',
+                     data_format=None,
+                     dilation_rate=1):
+  """1D convolution with separable filters.
+
+  Arguments:
+      x: input tensor
+      depthwise_kernel: convolution kernel for the depthwise convolution.
+      pointwise_kernel: kernel for the 1x1 convolution.
+      strides: stride integer.
+      padding: string, `"same"` or `"valid"`.
+      data_format: string, `"channels_last"` or `"channels_first"`.
+      dilation_rate: integer dilation rate.
+
+  Returns:
+      Output tensor.
+
+  Raises:
+      ValueError: if `data_format` is neither `channels_last` or
+      `channels_first`.
+  """
+  if data_format is None:
+    data_format = image_data_format()
+  if data_format not in {'channels_first', 'channels_last'}:
+    raise ValueError('Unknown data_format ' + str(data_format))
+
+  x, tf_data_format = _preprocess_conv1d_input(x, data_format)
+  padding = _preprocess_padding(padding)
+  if tf_data_format == 'NHWC':
+    spatial_start_dim = 1
+    strides = (1, 1) + strides + (1,)
+  else:
+    spatial_start_dim = 2
+    strides = (1, 1, 1) + strides
+  x = array_ops.expand_dims(x, spatial_start_dim)
+  depthwise_kernel = array_ops.expand_dims(depthwise_kernel, 0)
+  pointwise_kernel = array_ops.expand_dims(pointwise_kernel, 0)
+  dilation_rate = (1,) + dilation_rate
+
+  x = nn.separable_conv2d(
+      x,
+      depthwise_kernel,
+      pointwise_kernel,
+      strides=strides,
+      padding=padding,
+      rate=dilation_rate,
+      data_format=tf_data_format)
+
+  x = array_ops.squeeze(x, [spatial_start_dim])
+
+  if data_format == 'channels_first' and tf_data_format == 'NHWC':
+    x = array_ops.transpose(x, (0, 2, 1))  # NWC -> NCW
+
+  return x
+
+
 def separable_conv2d(x,
                      depthwise_kernel,
                      pointwise_kernel,
@@ -3921,7 +4113,10 @@ def bias_add(x, bias, data_format=None):
   elif ndim(x) == 4:
     if data_format == 'channels_first':
       if len(bias_shape) == 1:
-        x += reshape(bias, (1, bias_shape[0], 1, 1))
+        if _has_nchw_support():
+          x = nn.bias_add(x, bias, data_format='NCHW')
+        else:
+          x += reshape(bias, (1, bias_shape[0], 1, 1))
       else:
         x += reshape(bias, (1, bias_shape[2]) + bias_shape[:2])
     elif data_format == 'channels_last':
@@ -4113,7 +4308,7 @@ def ctc_batch_cost(y_true, y_pred, input_length, label_length):
   sparse_labels = math_ops.to_int32(
       ctc_label_dense_to_sparse(y_true, label_length))
 
-  y_pred = math_ops.log(array_ops.transpose(y_pred, perm=[1, 0, 2]) + 1e-8)
+  y_pred = math_ops.log(array_ops.transpose(y_pred, perm=[1, 0, 2]) + epsilon())
 
   return array_ops.expand_dims(
       ctc.ctc_loss(
@@ -4148,7 +4343,7 @@ def ctc_decode(y_pred, input_length, greedy=True, beam_width=100, top_paths=1):
           Tensor `(top_paths, )` that contains
               the log probability of each decoded sequence.
   """
-  y_pred = math_ops.log(array_ops.transpose(y_pred, perm=[1, 0, 2]) + 1e-8)
+  y_pred = math_ops.log(array_ops.transpose(y_pred, perm=[1, 0, 2]) + epsilon())
   input_length = math_ops.to_int32(input_length)
 
   if greedy:
diff --git a/tensorflow/python/keras/_impl/keras/backend_test.py b/tensorflow/python/keras/_impl/keras/backend_test.py
index e34f1b6926a8fd2c472664d330fe3cd9d714f021..27833e368d1e1c6938b83d08553fca9f4c3669a2 100644
--- a/tensorflow/python/keras/_impl/keras/backend_test.py
+++ b/tensorflow/python/keras/_impl/keras/backend_test.py
@@ -954,7 +954,6 @@ class BackendNNOpsTest(test.TestCase):
     x = keras.backend.variable(val)
     reduction_axes = (0, 2, 3)
 
-    # case: need broadcasting
     g_val = np.random.random((3,))
     b_val = np.random.random((3,))
     gamma = keras.backend.variable(g_val)
@@ -965,17 +964,6 @@ class BackendNNOpsTest(test.TestCase):
     self.assertEqual(mean.get_shape().as_list(), [3,])
     self.assertEqual(var.get_shape().as_list(), [3,])
 
-    # case: doesn't need broadcasting
-    g_val = np.random.random((1, 3, 1, 1))
-    b_val = np.random.random((1, 3, 1, 1))
-    gamma = keras.backend.variable(g_val)
-    beta = keras.backend.variable(b_val)
-    normed, mean, var = keras.backend.normalize_batch_in_training(
-        x, gamma, beta, reduction_axes, epsilon=1e-3)
-    self.assertEqual(normed.get_shape().as_list(), [10, 3, 10, 10])
-    self.assertEqual(mean.get_shape().as_list(), [3,])
-    self.assertEqual(var.get_shape().as_list(), [3,])
-
     # case: gamma=None
     gamma = None
     normed, mean, var = keras.backend.normalize_batch_in_training(
diff --git a/tensorflow/python/keras/_impl/keras/callbacks.py b/tensorflow/python/keras/_impl/keras/callbacks.py
index 8da3b857182237a47daa0f00a2340959a448160e..f0d9e0b0f522d344d9e04365f04655f00eea787f 100644
--- a/tensorflow/python/keras/_impl/keras/callbacks.py
+++ b/tensorflow/python/keras/_impl/keras/callbacks.py
@@ -12,7 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Keras callbacks: utilities called at certain points during model training.
+# pylint: disable=g-import-not-at-top
+"""Callbacks: utilities called at certain points during model training.
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -36,12 +37,10 @@ from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.summary import summary as tf_summary
 
 
-# pylint: disable=g-import-not-at-top
 try:
   import requests
 except ImportError:
   requests = None
-# pylint: enable=g-import-not-at-top
 
 
 class CallbackList(object):
@@ -109,9 +108,9 @@ class CallbackList(object):
     delta_t_median = np.median(self._delta_ts_batch_begin)
     if (self._delta_t_batch > 0. and
         delta_t_median > 0.95 * self._delta_t_batch and delta_t_median > 0.1):
-      logging.warning(
-          'Method on_batch_begin() is slow compared '
-          'to the batch update (%f). Check your callbacks.' % delta_t_median)
+      logging.warning('Method on_batch_begin() is slow compared '
+                      'to the batch update (%f). Check your callbacks.',
+                      delta_t_median)
     self._t_enter_batch = time.time()
 
   def on_batch_end(self, batch, logs=None):
@@ -132,9 +131,9 @@ class CallbackList(object):
     delta_t_median = np.median(self._delta_ts_batch_end)
     if (self._delta_t_batch > 0. and
         (delta_t_median > 0.95 * self._delta_t_batch and delta_t_median > 0.1)):
-      logging.warning(
-          'Method on_batch_end() is slow compared '
-          'to the batch update (%f). Check your callbacks.' % delta_t_median)
+      logging.warning('Method on_batch_end() is slow compared '
+                      'to the batch update (%f). Check your callbacks.',
+                      delta_t_median)
 
   def on_train_begin(self, logs=None):
     """Called at the beginning of training.
@@ -246,7 +245,8 @@ class BaseLogger(Callback):
 
 
 class TerminateOnNaN(Callback):
-  """Callback that terminates training when a NaN loss is encountered."""
+  """Callback that terminates training when a NaN loss is encountered.
+  """
 
   def __init__(self):
     super(TerminateOnNaN, self).__init__()
@@ -396,7 +396,7 @@ class ModelCheckpoint(Callback):
 
     if mode not in ['auto', 'min', 'max']:
       logging.warning('ModelCheckpoint mode %s is unknown, '
-                      'fallback to auto mode.' % mode)
+                      'fallback to auto mode.', (mode), RuntimeWarning)
       mode = 'auto'
 
     if mode == 'min':
@@ -423,11 +423,11 @@ class ModelCheckpoint(Callback):
         current = logs.get(self.monitor)
         if current is None:
           logging.warning('Can save best model only with %s available, '
-                          'skipping.' % (self.monitor))
+                          'skipping.', self.monitor, RuntimeWarning)
         else:
           if self.monitor_op(current, self.best):
             if self.verbose > 0:
-              print('Epoch %05d: %s improved from %0.5f to %0.5f,'
+              print('\nEpoch %05d: %s improved from %0.5f to %0.5f,'
                     ' saving model to %s' % (epoch + 1, self.monitor, self.best,
                                              current, filepath))
             self.best = current
@@ -437,11 +437,11 @@ class ModelCheckpoint(Callback):
               self.model.save(filepath, overwrite=True)
           else:
             if self.verbose > 0:
-              print('Epoch %05d: %s did not improve' % (epoch + 1,
-                                                        self.monitor))
+              print('\nEpoch %05d: %s did not improve' % (epoch + 1,
+                                                          self.monitor))
       else:
         if self.verbose > 0:
-          print('Epoch %05d: saving model to %s' % (epoch + 1, filepath))
+          print('\nEpoch %05d: saving model to %s' % (epoch + 1, filepath))
         if self.save_weights_only:
           self.model.save_weights(filepath, overwrite=True)
         else:
@@ -486,7 +486,7 @@ class EarlyStopping(Callback):
 
     if mode not in ['auto', 'min', 'max']:
       logging.warning('EarlyStopping mode %s is unknown, '
-                      'fallback to auto mode.' % mode)
+                      'fallback to auto mode.', mode, RuntimeWarning)
       mode = 'auto'
 
     if mode == 'min':
@@ -514,8 +514,8 @@ class EarlyStopping(Callback):
     current = logs.get(self.monitor)
     if current is None:
       logging.warning('Early stopping conditioned on metric `%s` '
-                      'which is not available. Available metrics are: %s' %
-                      (self.monitor, ','.join(list(logs.keys()))))
+                      'which is not available. Available metrics are: %s',
+                      self.monitor, ','.join(list(logs.keys())), RuntimeWarning)
       return
     if self.monitor_op(current - self.min_delta, self.best):
       self.best = current
@@ -544,8 +544,6 @@ class RemoteMonitor(Callback):
       path: String; path relative to `root` to which the events will be sent.
       field: String; JSON field under which the data will be stored.
       headers: Dictionary; optional custom HTTP headers.
-          Defaults to:
-          `{'Accept': 'application/json', 'Content-Type': 'application/json'}`
   """
 
   def __init__(self,
@@ -554,11 +552,7 @@ class RemoteMonitor(Callback):
                field='data',
                headers=None):
     super(RemoteMonitor, self).__init__()
-    if headers is None:
-      headers = {
-          'Accept': 'application/json',
-          'Content-Type': 'application/json'
-      }
+
     self.root = root
     self.path = path
     self.field = field
@@ -588,11 +582,13 @@ class LearningRateScheduler(Callback):
       schedule: a function that takes an epoch index as input
           (integer, indexed from 0) and returns a new
           learning rate as output (float).
+      verbose: int. 0: quiet, 1: update messages.
   """
 
-  def __init__(self, schedule):
+  def __init__(self, schedule, verbose=0):
     super(LearningRateScheduler, self).__init__()
     self.schedule = schedule
+    self.verbose = verbose
 
   def on_epoch_begin(self, epoch, logs=None):
     if not hasattr(self.model.optimizer, 'lr'):
@@ -602,6 +598,9 @@ class LearningRateScheduler(Callback):
       raise ValueError('The output of the "schedule" function '
                        'should be float.')
     K.set_value(self.model.optimizer.lr, lr)
+    if self.verbose > 0:
+      print('\nEpoch %05d: LearningRateScheduler reducing learning '
+            'rate to %s.' % (epoch + 1, lr))
 
 
 class TensorBoard(Callback):
@@ -842,7 +841,7 @@ class ReduceLROnPlateau(Callback):
     """
     if self.mode not in ['auto', 'min', 'max']:
       logging.warning('Learning Rate Plateau Reducing mode %s is unknown, '
-                      'fallback to auto mode.' % (self.mode))
+                      'fallback to auto mode.', self.mode, RuntimeWarning)
       self.mode = 'auto'
     if (self.mode == 'min' or
         (self.mode == 'auto' and 'acc' not in self.monitor)):
@@ -853,7 +852,6 @@ class ReduceLROnPlateau(Callback):
       self.best = -np.Inf
     self.cooldown_counter = 0
     self.wait = 0
-    self.lr_epsilon = self.min_lr * 1e-4
 
   def on_train_begin(self, logs=None):
     self._reset()
@@ -864,8 +862,9 @@ class ReduceLROnPlateau(Callback):
     current = logs.get(self.monitor)
     if current is None:
       logging.warning('Reduce LR on plateau conditioned on metric `%s` '
-                      'which is not available. Available metrics are: %s' %
-                      (self.monitor, ','.join(list(logs.keys()))))
+                      'which is not available. Available metrics are: %s',
+                      self.monitor, ','.join(list(logs.keys())), RuntimeWarning)
+
     else:
       if self.in_cooldown():
         self.cooldown_counter -= 1
@@ -877,13 +876,13 @@ class ReduceLROnPlateau(Callback):
       elif not self.in_cooldown():
         if self.wait >= self.patience:
           old_lr = float(K.get_value(self.model.optimizer.lr))
-          if old_lr > self.min_lr + self.lr_epsilon:
+          if old_lr > self.min_lr:
             new_lr = old_lr * self.factor
             new_lr = max(new_lr, self.min_lr)
             K.set_value(self.model.optimizer.lr, new_lr)
             if self.verbose > 0:
-              print('\nEpoch %05d: reducing learning rate to %s.' % (epoch,
-                                                                     new_lr))
+              print('\nEpoch %05d: ReduceLROnPlateau reducing learning '
+                    'rate to %s.' % (epoch + 1, new_lr))
             self.cooldown_counter = self.cooldown
             self.wait = 0
         self.wait += 1
@@ -899,10 +898,11 @@ class CSVLogger(Callback):
   including 1D iterables such as np.ndarray.
 
   Example:
-      ```python
-      csv_logger = CSVLogger('training.log')
-      model.fit(X_train, Y_train, callbacks=[csv_logger])
-      ```
+
+  ```python
+  csv_logger = CSVLogger('training.log')
+  model.fit(X_train, Y_train, callbacks=[csv_logger])
+  ```
 
   Arguments:
       filename: filename of the csv file, e.g. 'run/log.csv'.
@@ -942,12 +942,14 @@ class CSVLogger(Callback):
       else:
         return k
 
+    if self.keys is None:
+      self.keys = sorted(logs.keys())
+
     if self.model.stop_training:
       # We set NA so that csv parsers do not fail for this last epoch.
       logs = dict([(k, logs[k]) if k in logs else (k, 'NA') for k in self.keys])
 
     if not self.writer:
-      self.keys = sorted(logs.keys())
 
       class CustomDialect(csv.excel):
         delimiter = self.sep
@@ -993,32 +995,32 @@ class LambdaCallback(Callback):
 
   Example:
 
-      ```python
-      # Print the batch number at the beginning of every batch.
-      batch_print_callback = LambdaCallback(
-          on_batch_begin=lambda batch,logs: print(batch))
-
-      # Stream the epoch loss to a file in JSON format. The file content
-      # is not well-formed JSON but rather has a JSON object per line.
-      import json
-      json_log = open('loss_log.json', mode='wt', buffering=1)
-      json_logging_callback = LambdaCallback(
-          on_epoch_end=lambda epoch, logs: json_log.write(
-              json.dumps({'epoch': epoch, 'loss': logs['loss']}) + '\n'),
-          on_train_end=lambda logs: json_log.close()
-      )
-
-      # Terminate some processes after having finished model training.
-      processes = ...
-      cleanup_callback = LambdaCallback(
-          on_train_end=lambda logs: [
-              p.terminate() for p in processes if p.is_alive()])
-
-      model.fit(...,
-                callbacks=[batch_print_callback,
-                           json_logging_callback,
-                           cleanup_callback])
-      ```
+  ```python
+  # Print the batch number at the beginning of every batch.
+  batch_print_callback = LambdaCallback(
+      on_batch_begin=lambda batch,logs: print(batch))
+
+  # Stream the epoch loss to a file in JSON format. The file content
+  # is not well-formed JSON but rather has a JSON object per line.
+  import json
+  json_log = open('loss_log.json', mode='wt', buffering=1)
+  json_logging_callback = LambdaCallback(
+      on_epoch_end=lambda epoch, logs: json_log.write(
+          json.dumps({'epoch': epoch, 'loss': logs['loss']}) + '\n'),
+      on_train_end=lambda logs: json_log.close()
+  )
+
+  # Terminate some processes after having finished model training.
+  processes = ...
+  cleanup_callback = LambdaCallback(
+      on_train_end=lambda logs: [
+          p.terminate() for p in processes if p.is_alive()])
+
+  model.fit(...,
+            callbacks=[batch_print_callback,
+                       json_logging_callback,
+                       cleanup_callback])
+  ```
   """
 
   def __init__(self,
diff --git a/tensorflow/python/keras/_impl/keras/constraints.py b/tensorflow/python/keras/_impl/keras/constraints.py
index e58e3b0377b4b0fcad923095177c54d9c3ee1c0b..4b051c93f3aaca18e10df666966c66b4dce28111 100644
--- a/tensorflow/python/keras/_impl/keras/constraints.py
+++ b/tensorflow/python/keras/_impl/keras/constraints.py
@@ -12,7 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Constraints: functions that impose constraints on weights values.
+# pylint: disable=invalid-name
+"""Constraints: functions that impose constraints on weight values.
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -54,10 +55,6 @@ class MaxNorm(Constraint):
           to constrain the weights of each filter tensor of size
           `(rows, cols, input_depth)`.
 
-  References:
-      - [Dropout: A Simple Way to Prevent Neural Networks from Overfitting
-        Srivastava, Hinton, et al.
-        2014](http://www.cs.toronto.edu/~rsalakhu/papers/srivastava14a.pdf)
   """
 
   def __init__(self, max_value=2, axis=0):
@@ -79,7 +76,7 @@ class NonNeg(Constraint):
   """
 
   def __call__(self, w):
-    w *= K.cast(w >= 0., K.floatx())
+    w *= K.cast(K.greater_equal(w, 0.), K.floatx())
     return w
 
 
@@ -132,7 +129,7 @@ class MinMaxNorm(Constraint):
           has shape `(input_dim, output_dim)`,
           set `axis` to `0` to constrain each weight vector
           of length `(input_dim,)`.
-          In a `Conv2D` layer with `dim_ordering="channels_last"`,
+          In a `Conv2D` layer with `data_format="channels_last"`,
           the weight tensor has shape
           `(rows, cols, input_depth, output_depth)`,
           set `axis` to `[0, 1, 2]`
@@ -148,8 +145,9 @@ class MinMaxNorm(Constraint):
 
   def __call__(self, w):
     norms = K.sqrt(K.sum(K.square(w), axis=self.axis, keepdims=True))
-    desired = (self.rate * K.clip(norms, self.min_value, self.max_value) +
-               (1 - self.rate) * norms)
+    desired = (
+        self.rate * K.clip(norms, self.min_value, self.max_value) +
+        (1 - self.rate) * norms)
     w *= (desired / (K.epsilon() + norms))
     return w
 
@@ -164,13 +162,15 @@ class MinMaxNorm(Constraint):
 
 # Aliases.
 
-# pylint: disable=invalid-name
 max_norm = MaxNorm
 non_neg = NonNeg
 unit_norm = UnitNorm
 min_max_norm = MinMaxNorm
 
-# pylint: enable=invalid-name
+# Legacy aliases.
+maxnorm = max_norm
+nonneg = non_neg
+unitnorm = unit_norm
 
 
 def serialize(constraint):
diff --git a/tensorflow/python/keras/_impl/keras/datasets/boston_housing.py b/tensorflow/python/keras/_impl/keras/datasets/boston_housing.py
index 0570e9bc0c7344641edf44cd5ef03a4f09005061..cfd7df61d5ea47b810776ac8da1bdfcff77d177f 100644
--- a/tensorflow/python/keras/_impl/keras/datasets/boston_housing.py
+++ b/tensorflow/python/keras/_impl/keras/datasets/boston_housing.py
@@ -21,29 +21,27 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
-from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export('keras.datasets.boston_housing.load_data')
-def load_data(path='boston_housing.npz', seed=113, test_split=0.2):
+def load_data(path='boston_housing.npz', test_split=0.2, seed=113):
   """Loads the Boston Housing dataset.
 
   Arguments:
       path: path where to cache the dataset locally
           (relative to ~/.keras/datasets).
+      test_split: fraction of the data to reserve as test set.
       seed: Random seed for shuffling the data
           before computing the test split.
-      test_split: fraction of the data to reserve as test set.
 
   Returns:
       Tuple of Numpy arrays: `(x_train, y_train), (x_test, y_test)`.
   """
   assert 0 <= test_split < 1
-  fh = 'f553886a1f8d56431e820c5b82552d9d95cfcb96d1e678153f8839538947dff5'
   path = get_file(
       path,
       origin='https://s3.amazonaws.com/keras-datasets/boston_housing.npz',
-      file_hash=fh)
+      file_hash=
+      'f553886a1f8d56431e820c5b82552d9d95cfcb96d1e678153f8839538947dff5')
   f = np.load(path)
   x = f['x']
   y = f['y']
diff --git a/tensorflow/python/keras/_impl/keras/datasets/cifar.py b/tensorflow/python/keras/_impl/keras/datasets/cifar.py
index 564709c0eed6778b9809eb8c23556cac3c4702d9..7ada3340a59e114d73095068ec476da5973b67fb 100644
--- a/tensorflow/python/keras/_impl/keras/datasets/cifar.py
+++ b/tensorflow/python/keras/_impl/keras/datasets/cifar.py
@@ -12,9 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Utilities used by the CIFAR10 and CIFAR100 datasets.
+"""Utilities common to CIFAR10 and CIFAR100 datasets.
 """
-
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
diff --git a/tensorflow/python/keras/_impl/keras/datasets/cifar10.py b/tensorflow/python/keras/_impl/keras/datasets/cifar10.py
index 1971f434b9af820af287a3848ef538f5163a2a9a..fb9d98d42cca9c98d6e9ea46782d1e3d31a4d7dc 100644
--- a/tensorflow/python/keras/_impl/keras/datasets/cifar10.py
+++ b/tensorflow/python/keras/_impl/keras/datasets/cifar10.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""CIFAR10 small image classification dataset.
+"""CIFAR10 small images classification dataset.
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -25,10 +25,8 @@ import numpy as np
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.datasets.cifar import load_batch
 from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
-from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export('keras.datasets.cifar10.load_data')
 def load_data():
   """Loads CIFAR10 dataset.
 
diff --git a/tensorflow/python/keras/_impl/keras/datasets/cifar100.py b/tensorflow/python/keras/_impl/keras/datasets/cifar100.py
index f4039e935076a55baaf471ad544986082a4e4ad8..95aace599a721618eaae51c89f05de01653c682d 100644
--- a/tensorflow/python/keras/_impl/keras/datasets/cifar100.py
+++ b/tensorflow/python/keras/_impl/keras/datasets/cifar100.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""CIFAR100 small image classification dataset.
+"""CIFAR100 small images classification dataset.
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -25,10 +25,8 @@ import numpy as np
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.datasets.cifar import load_batch
 from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
-from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export('keras.datasets.cifar100.load_data')
 def load_data(label_mode='fine'):
   """Loads CIFAR100 dataset.
 
@@ -42,7 +40,7 @@ def load_data(label_mode='fine'):
       ValueError: in case of invalid `label_mode`.
   """
   if label_mode not in ['fine', 'coarse']:
-    raise ValueError('label_mode must be one of "fine" "coarse".')
+    raise ValueError('`label_mode` must be one of `"fine"`, `"coarse"`.')
 
   dirname = 'cifar-100-python'
   origin = 'https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz'
diff --git a/tensorflow/python/keras/_impl/keras/datasets/fashion_mnist.py b/tensorflow/python/keras/_impl/keras/datasets/fashion_mnist.py
index 17be684e4f8bdb800c6b0883649da25f18fa0402..b9ae41a0d4d0e8d9df70e3fc1952e81c5f57e8d9 100644
--- a/tensorflow/python/keras/_impl/keras/datasets/fashion_mnist.py
+++ b/tensorflow/python/keras/_impl/keras/datasets/fashion_mnist.py
@@ -20,7 +20,9 @@ from __future__ import print_function
 
 import gzip
 import os
+
 import numpy as np
+
 from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
 
 
@@ -38,9 +40,8 @@ def load_data():
   ]
 
   paths = []
-  for given_file in files:
-    paths.append(
-        get_file(given_file, origin=base + given_file, cache_subdir=dirname))
+  for fname in files:
+    paths.append(get_file(fname, origin=base + fname, cache_subdir=dirname))
 
   with gzip.open(paths[0], 'rb') as lbpath:
     y_train = np.frombuffer(lbpath.read(), np.uint8, offset=8)
diff --git a/tensorflow/python/keras/_impl/keras/datasets/imdb.py b/tensorflow/python/keras/_impl/keras/datasets/imdb.py
index 7946c46960ef15fdcaff6b5ad9f0bc2623a84b17..880c9c821b832caed4a3072c094d72a3171f7a63 100644
--- a/tensorflow/python/keras/_impl/keras/datasets/imdb.py
+++ b/tensorflow/python/keras/_impl/keras/datasets/imdb.py
@@ -1,4 +1,4 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""IMDB movie review sentiment classification dataset.
+"""IMDB sentiment classification dataset.
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -21,13 +21,12 @@ from __future__ import print_function
 import json
 
 import numpy as np
-from six.moves import zip  # pylint: disable=redefined-builtin
 
+from tensorflow.python.keras._impl.keras.preprocessing.sequence import _remove_long_seq
 from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
-from tensorflow.python.util.tf_export import tf_export
+from tensorflow.python.platform import tf_logging as logging
 
 
-@tf_export('keras.datasets.imdb.load_data')
 def load_data(path='imdb.npz',
               num_words=None,
               skip_top=0,
@@ -35,7 +34,8 @@ def load_data(path='imdb.npz',
               seed=113,
               start_char=1,
               oov_char=2,
-              index_from=3):
+              index_from=3,
+              **kwargs):
   """Loads the IMDB dataset.
 
   Arguments:
@@ -52,6 +52,7 @@ def load_data(path='imdb.npz',
       oov_char: words that were cut out because of the `num_words`
           or `skip_top` limit will be replaced with this character.
       index_from: index actual words with this index and higher.
+      **kwargs: Used for backwards compatibility.
 
   Returns:
       Tuple of Numpy arrays: `(x_train, y_train), (x_test, y_test)`.
@@ -66,14 +67,21 @@ def load_data(path='imdb.npz',
   Words that were not seen in the training set but are in the test set
   have simply been skipped.
   """
+  # Legacy support
+  if 'nb_words' in kwargs:
+    logging.warning('The `nb_words` argument in `load_data` '
+                    'has been renamed `num_words`.')
+    num_words = kwargs.pop('nb_words')
+  if kwargs:
+    raise TypeError('Unrecognized keyword arguments: ' + str(kwargs))
+
   path = get_file(
       path,
       origin='https://s3.amazonaws.com/text-datasets/imdb.npz',
       file_hash='599dadb1135973df5b59232a0e9a887c')
-  f = np.load(path)
-  x_train, labels_train = f['x_train'], f['y_train']
-  x_test, labels_test = f['x_test'], f['y_test']
-  f.close()
+  with np.load(path) as f:
+    x_train, labels_train = f['x_train'], f['y_train']
+    x_test, labels_test = f['x_test'], f['y_test']
 
   np.random.seed(seed)
   indices = np.arange(len(x_train))
@@ -95,14 +103,7 @@ def load_data(path='imdb.npz',
     xs = [[w + index_from for w in x] for x in xs]
 
   if maxlen:
-    new_xs = []
-    new_labels = []
-    for x, y in zip(xs, labels):
-      if len(x) < maxlen:
-        new_xs.append(x)
-        new_labels.append(y)
-    xs = new_xs
-    labels = new_labels
+    xs, labels = _remove_long_seq(maxlen, xs, labels)
     if not xs:
       raise ValueError('After filtering for sequences shorter than maxlen=' +
                        str(maxlen) + ', no sequence was kept. '
@@ -114,28 +115,19 @@ def load_data(path='imdb.npz',
   # reserve 'index_from' (=3 by default) characters:
   # 0 (padding), 1 (start), 2 (OOV)
   if oov_char is not None:
-    xs = [[oov_char if (w >= num_words or w < skip_top) else w for w in x]
-          for x in xs]
+    xs = [
+        [w if (skip_top <= w < num_words) else oov_char for w in x] for x in xs
+    ]
   else:
-    new_xs = []
-    for x in xs:
-      nx = []
-      for w in x:
-        if skip_top <= w < num_words:
-          nx.append(w)
-      new_xs.append(nx)
-    xs = new_xs
-
-  x_train = np.array(xs[:len(x_train)])
-  y_train = np.array(labels[:len(x_train)])
+    xs = [[w for w in x if skip_top <= w < num_words] for x in xs]
 
-  x_test = np.array(xs[len(x_train):])
-  y_test = np.array(labels[len(x_train):])
+  idx = len(x_train)
+  x_train, y_train = np.array(xs[:idx]), np.array(labels[:idx])
+  x_test, y_test = np.array(xs[idx:]), np.array(labels[idx:])
 
   return (x_train, y_train), (x_test, y_test)
 
 
-@tf_export('keras.datasets.imdb.get_word_index')
 def get_word_index(path='imdb_word_index.json'):
   """Retrieves the dictionary mapping word indices back to words.
 
@@ -147,7 +139,8 @@ def get_word_index(path='imdb_word_index.json'):
   """
   path = get_file(
       path,
-      origin='https://s3.amazonaws.com/text-datasets/imdb_word_index.json')
+      origin='https://s3.amazonaws.com/text-datasets/imdb_word_index.json',
+      file_hash='bfafd718b763782e994055a2d397834f')
   f = open(path)
   data = json.load(f)
   f.close()
diff --git a/tensorflow/python/keras/_impl/keras/datasets/mnist.py b/tensorflow/python/keras/_impl/keras/datasets/mnist.py
index e9f53480150034d3e83f85cfad67f63e61422f3e..ec12a31dcf07cad6ec076aad9bd1f671236cf457 100644
--- a/tensorflow/python/keras/_impl/keras/datasets/mnist.py
+++ b/tensorflow/python/keras/_impl/keras/datasets/mnist.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""MNIST handwritten digits classification dataset.
+"""MNIST handwritten digits dataset.
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -21,10 +21,8 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
-from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export('keras.datasets.mnist.load_data')
 def load_data(path='mnist.npz'):
   """Loads the MNIST dataset.
 
@@ -40,9 +38,7 @@ def load_data(path='mnist.npz'):
       origin='https://s3.amazonaws.com/img-datasets/mnist.npz',
       file_hash='8a61469f7ea1b51cbae51d4f78837e45')
   f = np.load(path)
-  x_train = f['x_train']
-  y_train = f['y_train']
-  x_test = f['x_test']
-  y_test = f['y_test']
+  x_train, y_train = f['x_train'], f['y_train']
+  x_test, y_test = f['x_test'], f['y_test']
   f.close()
   return (x_train, y_train), (x_test, y_test)
diff --git a/tensorflow/python/keras/_impl/keras/datasets/reuters.py b/tensorflow/python/keras/_impl/keras/datasets/reuters.py
index 6da5aa4b5eb8b8eb5dcd8c75c3f1f86340436601..95cf8852a9c6b1866573231a833f9a95af7b0d55 100644
--- a/tensorflow/python/keras/_impl/keras/datasets/reuters.py
+++ b/tensorflow/python/keras/_impl/keras/datasets/reuters.py
@@ -12,9 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Reuters newswire topic classification dataset.
+"""Reuters topic classification dataset.
 """
-
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -22,13 +21,12 @@ from __future__ import print_function
 import json
 
 import numpy as np
-from six.moves import zip  # pylint: disable=redefined-builtin
 
+from tensorflow.python.keras._impl.keras.preprocessing.sequence import _remove_long_seq
 from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
-from tensorflow.python.util.tf_export import tf_export
+from tensorflow.python.platform import tf_logging as logging
 
 
-@tf_export('keras.datasets.reuters.load_data')
 def load_data(path='reuters.npz',
               num_words=None,
               skip_top=0,
@@ -37,7 +35,8 @@ def load_data(path='reuters.npz',
               seed=113,
               start_char=1,
               oov_char=2,
-              index_from=3):
+              index_from=3,
+              **kwargs):
   """Loads the Reuters newswire classification dataset.
 
   Arguments:
@@ -55,6 +54,7 @@ def load_data(path='reuters.npz',
       oov_char: words that were cut out because of the `num_words`
           or `skip_top` limit will be replaced with this character.
       index_from: index actual words with this index and higher.
+      **kwargs: Used for backwards compatibility.
 
   Returns:
       Tuple of Numpy arrays: `(x_train, y_train), (x_test, y_test)`.
@@ -65,14 +65,20 @@ def load_data(path='reuters.npz',
   Words that were not seen in the training set but are in the test set
   have simply been skipped.
   """
+  # Legacy support
+  if 'nb_words' in kwargs:
+    logging.warning('The `nb_words` argument in `load_data` '
+                    'has been renamed `num_words`.')
+    num_words = kwargs.pop('nb_words')
+  if kwargs:
+    raise TypeError('Unrecognized keyword arguments: ' + str(kwargs))
+
   path = get_file(
       path,
       origin='https://s3.amazonaws.com/text-datasets/reuters.npz',
       file_hash='87aedbeb0cb229e378797a632c1997b6')
-  npzfile = np.load(path)
-  xs = npzfile['x']
-  labels = npzfile['y']
-  npzfile.close()
+  with np.load(path) as f:
+    xs, labels = f['x'], f['y']
 
   np.random.seed(seed)
   indices = np.arange(len(xs))
@@ -80,22 +86,13 @@ def load_data(path='reuters.npz',
   xs = xs[indices]
   labels = labels[indices]
 
-  np.random.shuffle(labels)
-
   if start_char is not None:
     xs = [[start_char] + [w + index_from for w in x] for x in xs]
   elif index_from:
     xs = [[w + index_from for w in x] for x in xs]
 
   if maxlen:
-    new_xs = []
-    new_labels = []
-    for x, y in zip(xs, labels):
-      if len(x) < maxlen:
-        new_xs.append(x)
-        new_labels.append(y)
-    xs = new_xs
-    labels = new_labels
+    xs, labels = _remove_long_seq(maxlen, xs, labels)
 
   if not num_words:
     num_words = max([max(x) for x in xs])
@@ -104,28 +101,17 @@ def load_data(path='reuters.npz',
   # reserve 'index_from' (=3 by default) characters:
   # 0 (padding), 1 (start), 2 (OOV)
   if oov_char is not None:
-    xs = [[oov_char if (w >= num_words or w < skip_top) else w for w in x]
-          for x in xs]
+    xs = [[w if skip_top <= w < num_words else oov_char for w in x] for x in xs]
   else:
-    new_xs = []
-    for x in xs:
-      nx = []
-      for w in x:
-        if skip_top <= w < num_words:
-          nx.append(w)
-      new_xs.append(nx)
-    xs = new_xs
-
-  x_train = np.array(xs[:int(len(xs) * (1 - test_split))])
-  y_train = np.array(labels[:int(len(xs) * (1 - test_split))])
+    xs = [[w for w in x if skip_top <= w < num_words] for x in xs]
 
-  x_test = np.array(xs[int(len(xs) * (1 - test_split)):])
-  y_test = np.array(labels[int(len(xs) * (1 - test_split)):])
+  idx = int(len(xs) * (1 - test_split))
+  x_train, y_train = np.array(xs[:idx]), np.array(labels[:idx])
+  x_test, y_test = np.array(xs[idx:]), np.array(labels[idx:])
 
   return (x_train, y_train), (x_test, y_test)
 
 
-@tf_export('keras.datasets.reuters.get_word_index')
 def get_word_index(path='reuters_word_index.json'):
   """Retrieves the dictionary mapping word indices back to words.
 
diff --git a/tensorflow/python/keras/_impl/keras/engine/topology.py b/tensorflow/python/keras/_impl/keras/engine/topology.py
index d6e0be8e432eb535a053a4c09fda35a32f6c70f3..8354a2b8fd7c0182c1daaa7e8fa8390da6038d0b 100644
--- a/tensorflow/python/keras/_impl/keras/engine/topology.py
+++ b/tensorflow/python/keras/_impl/keras/engine/topology.py
@@ -27,6 +27,7 @@ import numpy as np
 from six.moves import zip  # pylint: disable=redefined-builtin
 
 from tensorflow.python.eager import context
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras import constraints
 from tensorflow.python.keras._impl.keras import initializers
@@ -707,13 +708,15 @@ class Network(tf_network.GraphNetwork, Layer):
       self.input_names.append(layer.name)
       if layer.is_placeholder:
         self._feed_input_names.append(layer.name)
-        self._feed_inputs.append(layer.input)
         self._feed_input_shapes.append(K.int_shape(self.inputs[i]))
+        # layer.input gives an error in eager mode
+        if context.in_graph_mode():
+          self._feed_inputs.append(layer.input)
     for layer in self._output_layers:
       self.output_names.append(layer.name)
 
-    self.internal_input_shapes = [K.int_shape(x) for x in self.inputs]
-    self.internal_output_shapes = [K.int_shape(x) for x in self.outputs]
+    self._internal_input_shapes = [K.int_shape(x) for x in self.inputs]
+    self._internal_output_shapes = [K.int_shape(x) for x in self.outputs]
 
   @property
   def uses_learning_phase(self):
@@ -1303,18 +1306,17 @@ def preprocess_weights_for_loading(layer,
   Returns:
       A list of weights values (Numpy arrays).
   """
-  if original_keras_version == '1':
-    if layer.__class__.__name__ == 'Bidirectional':
-      num_weights_per_layer = len(weights) // 2
-
-      forward_weights = preprocess_weights_for_loading(
-          layer.forward_layer, weights[:num_weights_per_layer],
-          original_keras_version, original_backend)
-      backward_weights = preprocess_weights_for_loading(
-          layer.backward_layer, weights[num_weights_per_layer:],
-          original_keras_version, original_backend)
-      weights = forward_weights + backward_weights
+  if layer.__class__.__name__ == 'Bidirectional':
+    num_weights_per_layer = len(weights) // 2
+    forward_weights = preprocess_weights_for_loading(
+        layer.forward_layer, weights[:num_weights_per_layer],
+        original_keras_version, original_backend)
+    backward_weights = preprocess_weights_for_loading(
+        layer.backward_layer, weights[num_weights_per_layer:],
+        original_keras_version, original_backend)
+    weights = forward_weights + backward_weights
 
+  if original_keras_version == '1':
     if layer.__class__.__name__ == 'TimeDistributed':
       weights = preprocess_weights_for_loading(
           layer.layer, weights, original_keras_version, original_backend)
@@ -1418,7 +1420,7 @@ def preprocess_weights_for_loading(layer,
 
   conv_layers = ['Conv1D', 'Conv2D', 'Conv3D', 'Conv2DTranspose', 'ConvLSTM2D']
   if layer.__class__.__name__ in conv_layers:
-    if original_backend and K.backend() != original_backend:
+    if original_backend == 'theano':
       weights[0] = conv_utils.convert_kernel(weights[0])
       if layer.__class__.__name__ == 'ConvLSTM2D':
         weights[1] = conv_utils.convert_kernel(weights[1])
@@ -1427,10 +1429,9 @@ def preprocess_weights_for_loading(layer,
       if layer.__class__.__name__ == 'ConvLSTM2D':
         weights[1] = np.transpose(weights[1], (3, 2, 0, 1))
 
-  # convert the weights of CuDNNLSTM so that they could be loaded into LSTM
+  # Convert the weights of CuDNNLSTM so that they could be loaded into LSTM
   if layer.__class__.__name__ == 'LSTM' and len(weights) == 3:
-    # determine if we're loading a CuDNNLSTM layer from the number of bias
-    # weights:
+    # Determine if loading a CuDNNLSTM layer from the number of bias weights:
     # CuDNNLSTM has (units * 8) weights; while LSTM has (units * 4)
     # if there's no bias weight in the file, skip this conversion
     units = weights[1].shape[0]
@@ -1572,3 +1573,31 @@ def load_weights_from_hdf5_group_by_name(f, layers):
       for i in range(len(weight_values)):
         weight_value_tuples.append((symbolic_weights[i], weight_values[i]))
   K.batch_set_value(weight_value_tuples)
+
+
+def shape_type_conversion(fn):
+  """Decorator that handles tuple/TensorShape conversion.
+
+  Used in `compute_output_shape` and `build`.
+
+  Arguments:
+    fn: function to wrap.
+
+  Returns:
+    Wrapped function.
+  """
+
+  def wrapper(instance, input_shape):
+    if input_shape is not None:
+      if isinstance(input_shape, list):
+        input_shape = [
+            tuple(tensor_shape.TensorShape(x).as_list()) for x in input_shape]
+      else:
+        input_shape = tuple(tensor_shape.TensorShape(input_shape).as_list())
+    output_shape = fn(instance, input_shape)
+    if output_shape is not None:
+      if isinstance(output_shape, list):
+        return [tensor_shape.TensorShape(x) for x in output_shape]
+      return tensor_shape.TensorShape(output_shape)
+
+  return wrapper
diff --git a/tensorflow/python/keras/_impl/keras/engine/training.py b/tensorflow/python/keras/_impl/keras/engine/training.py
index debea2503ee2e440000847c0ce92185e3d230138..43d95b1f194e1d30cbdae726fdf5979bd7065d25 100644
--- a/tensorflow/python/keras/_impl/keras/engine/training.py
+++ b/tensorflow/python/keras/_impl/keras/engine/training.py
@@ -12,9 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Keras training and evaluation routines.
+"""Training-related part of the Keras engine.
 """
-
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -23,17 +22,26 @@ import copy
 
 import numpy as np
 
+from tensorflow.python.eager import context
+from tensorflow.python.framework import ops
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras import callbacks as cbks
 from tensorflow.python.keras._impl.keras import losses
 from tensorflow.python.keras._impl.keras import metrics as metrics_module
 from tensorflow.python.keras._impl.keras import optimizers
+from tensorflow.python.keras._impl.keras.engine import training_eager
 from tensorflow.python.keras._impl.keras.engine.topology import Network
 from tensorflow.python.keras._impl.keras.utils.data_utils import GeneratorEnqueuer
 from tensorflow.python.keras._impl.keras.utils.data_utils import OrderedEnqueuer
 from tensorflow.python.keras._impl.keras.utils.data_utils import Sequence
 from tensorflow.python.keras._impl.keras.utils.generic_utils import Progbar
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training import optimizer as tf_optimizer_module
+
+try:
+  from scipy.sparse import issparse  # pylint: disable=g-import-not-at-top
+except ImportError:
+  issparse = None
 
 
 def _standardize_input_data(data,
@@ -70,89 +78,75 @@ def _standardize_input_data(data,
     return []
   if data is None:
     return [None for _ in range(len(names))]
+
   if isinstance(data, dict):
-    for key, value in data.items():
-      if value.__class__.__name__ == 'DataFrame':
-        data[key] = value.values
-    arrays = []
-    for name in names:
-      if name not in data:
-        raise ValueError('No data provided for "' + name +
-                         '". Need data for each key in: ' + str(names))
-      arrays.append(data[name])
+    try:
+      data = [
+          data[x].values
+          if data[x].__class__.__name__ == 'DataFrame' else data[x]
+          for x in names
+      ]
+    except KeyError as e:
+      raise ValueError('No data provided for "' + e.args[0] + '". Need data '
+                       'for each key in: ' + str(names))
   elif isinstance(data, list):
-    for key, value in enumerate(data):
-      if value.__class__.__name__ == 'DataFrame':
-        data[key] = value.values
-    if len(data) != len(names):
-      if data and hasattr(data[0], 'shape'):
-        raise ValueError(
-            'Error when checking model ' + exception_prefix +
-            ': the list of Numpy arrays '
-            'that you are passing to your model '
-            'is not the size the model expected. '
-            'Expected to see ' + str(len(names)) + ' array(s), but instead got '
-            'the following list of ' + str(len(data)) + ' arrays: ' +
-            str(data)[:200] + '...')
-      else:
-        if len(names) == 1:
-          data = [np.asarray(data)]
-        else:
-          raise ValueError('Error when checking model ' + exception_prefix +
-                           ': you are passing a list as '
-                           'input to your model, '
-                           'but the model expects '
-                           'a list of ' + str(len(names)) +
-                           ' Numpy arrays instead. '
-                           'The list you passed was: ' + str(data)[:200])
-    arrays = data
-  elif data.__class__.__name__ == 'DataFrame':
-    # test if data is a DataFrame, without pandas installed
-    arrays = data.values
+    if isinstance(data[0], list):
+      data = [np.asarray(d) for d in data]
+    elif len(names) == 1 and isinstance(data[0], (float, int)):
+      data = [np.asarray(data)]
+    else:
+      data = [
+          x.values if x.__class__.__name__ == 'DataFrame' else x for x in data
+      ]
   else:
-    if not hasattr(data, 'shape'):
+    data = data.values if data.__class__.__name__ == 'DataFrame' else data
+    data = [data]
+  data = [
+      np.expand_dims(x, 1) if x is not None and x.ndim == 1 else x for x in data
+  ]
+
+  if len(data) != len(names):
+    if data and hasattr(data[0], 'shape'):
+      raise ValueError('Error when checking model ' + exception_prefix +
+                       ': the list of Numpy arrays that you are passing to '
+                       'your model is not the size the model expected. '
+                       'Expected to see ' + str(len(names)) + ' array(s), '
+                       'but instead got the following list of ' +
+                       str(len(data)) + ' arrays: ' + str(data)[:200] + '...')
+    elif len(names) > 1:
+      raise ValueError(
+          'Error when checking model ' + exception_prefix +
+          ': you are passing a list as input to your model, '
+          'but the model expects a list of ' + str(len(names)) +
+          ' Numpy arrays instead. The list you passed was: ' + str(data)[:200])
+    elif len(data) == 1 and not hasattr(data[0], 'shape'):
       raise TypeError('Error when checking model ' + exception_prefix +
-                      ': data should be a Numpy array, '
-                      'or list/dict of Numpy arrays. '
-                      'Found: ' + str(data)[:200] + '...')
-    if len(names) > 1:
-      # Case: model expects multiple inputs but only received
-      # a single Numpy array.
-      raise ValueError('The model expects ' + str(len(names)) + ' ' +
-                       exception_prefix +
-                       ' arrays, but only received one array. '
-                       'Found: array with shape ' + str(data.shape))
-    arrays = [data]
-
-  # Make arrays at least 2D.
-  for i in range(len(names)):
-    array = arrays[i]
-    if len(array.shape) == 1:
-      array = np.expand_dims(array, 1)
-      arrays[i] = array
+                      ': data should be a Numpy array, or list/dict of '
+                      'Numpy arrays. Found: ' + str(data)[:200] + '...')
+    elif len(names) == 1:
+      data = [np.asarray(data)]
 
   # Check shapes compatibility.
   if shapes:
     for i in range(len(names)):
-      if shapes[i] is None:
-        continue
-      array = arrays[i]
-      if len(array.shape) != len(shapes[i]):
-        raise ValueError(
-            'Error when checking ' + exception_prefix + ': expected ' + names[i]
-            + ' to have ' + str(len(shapes[i])) +
-            ' dimensions, but got array with shape ' + str(array.shape))
-      for j, (dim, ref_dim) in enumerate(zip(array.shape, shapes[i])):
-        if not j and not check_batch_axis:
-          # skip the first axis
-          continue
-        if ref_dim:
-          if ref_dim != dim:
-            raise ValueError('Error when checking ' + exception_prefix +
-                             ': expected ' + names[i] + ' to have shape ' +
-                             str(shapes[i]) + ' but got array with shape ' +
-                             str(array.shape))
-  return arrays
+      if shapes[i] is not None:
+        data_shape = data[i].shape
+        shape = shapes[i]
+        if data[i].ndim != len(shape):
+          raise ValueError('Error when checking ' + exception_prefix +
+                           ': expected ' + names[i] + ' to have ' +
+                           str(len(shape)) + ' dimensions, but got array '
+                           'with shape ' + str(data_shape))
+        if not check_batch_axis:
+          data_shape = data_shape[1:]
+          shape = shape[1:]
+        for dim, ref_dim in zip(data_shape, shape):
+          if ref_dim != dim and ref_dim:
+            raise ValueError(
+                'Error when checking ' + exception_prefix + ': expected ' +
+                names[i] + ' to have shape ' + str(shape) +
+                ' but got array with shape ' + str(data_shape))
+  return data
 
 
 def _standardize_sample_or_class_weights(x_weight, output_names, weight_type):
@@ -193,10 +187,10 @@ def _standardize_sample_or_class_weights(x_weight, output_names, weight_type):
       x_weights.append(x_weight.get(name))
     return x_weights
   else:
-    raise TypeError('The model has multiple outputs, so `' + weight_type + '` '
-                    'should be either a list or a dict. '
-                    'Provided `' + weight_type + '` type not understood: ' +
-                    str(x_weight))
+    raise TypeError(
+        'The model has multiple outputs, so `' + weight_type + '` '
+        'should be either a list or a dict. '
+        'Provided `' + weight_type + '` type not understood: ' + str(x_weight))
 
 
 def _standardize_class_weights(class_weight, output_names):
@@ -234,12 +228,12 @@ def _check_array_lengths(inputs, targets, weights=None):
   set_w = set_of_lengths(weights)
   if len(set_x) > 1:
     raise ValueError('All input arrays (x) should have '
-                     'the same number of samples. Got array shapes: ' + str(
-                         [x.shape for x in inputs]))
+                     'the same number of samples. Got array shapes: ' +
+                     str([x.shape for x in inputs]))
   if len(set_y) > 1:
     raise ValueError('All target arrays (y) should have '
-                     'the same number of samples. Got array shapes: ' + str(
-                         [y.shape for y in targets]))
+                     'the same number of samples. Got array shapes: ' +
+                     str([y.shape for y in targets]))
   if set_x and set_y and list(set_x)[0] != list(set_y)[0]:
     raise ValueError('Input arrays should have '
                      'the same number of samples as target arrays. '
@@ -247,8 +241,8 @@ def _check_array_lengths(inputs, targets, weights=None):
                      'and ' + str(list(set_y)[0]) + ' target samples.')
   if len(set_w) > 1:
     raise ValueError('All sample_weight arrays should have '
-                     'the same number of samples. Got array shapes: ' + str(
-                         [w.shape for w in weights]))
+                     'the same number of samples. Got array shapes: ' +
+                     str([w.shape for w in weights]))
   if set_y and set_w and list(set_y)[0] != list(set_w)[0]:
     raise ValueError('Sample_weight arrays should have '
                      'the same number of samples as target arrays. Got ' +
@@ -528,16 +522,16 @@ def _standardize_weights(y,
 
   if sample_weight is not None:
     if len(sample_weight.shape) > len(y.shape):
-      raise ValueError('Found a sample_weight with shape' +
-                       str(sample_weight.shape) + '.'
-                       'Expected sample_weight with rank '
-                       'less than or equal to ' + str(len(y.shape)))
+      raise ValueError(
+          'Found a sample_weight with shape' + str(sample_weight.shape) + '.'
+          'Expected sample_weight with rank '
+          'less than or equal to ' + str(len(y.shape)))
 
     if y.shape[:sample_weight.ndim] != sample_weight.shape:
-      raise ValueError('Found a sample_weight array with shape ' +
-                       str(sample_weight.shape) + ' for an input with shape ' +
-                       str(y.shape) + '. '
-                       'sample_weight cannot be broadcast.')
+      raise ValueError(
+          'Found a sample_weight array with shape ' + str(sample_weight.shape) +
+          ' for an input with shape ' + str(y.shape) + '. '
+          'sample_weight cannot be broadcast.')
     return sample_weight
   elif isinstance(class_weight, dict):
     if len(y.shape) > 2:
@@ -631,20 +625,25 @@ class Model(Network):
             `optimizer`, `loss`, `metrics` or `sample_weight_mode`.
     """
     loss = loss or {}
+    if context.in_eager_mode() and  not isinstance(
+        optimizer, tf_optimizer_module.Optimizer):
+      raise ValueError('Only TF native optimizers are supported in Eager mode.')
+
     self.optimizer = optimizers.get(optimizer)
-    self.sample_weight_mode = sample_weight_mode
     self.loss = loss
     self.loss_weights = loss_weights
+    if context.in_eager_mode() and sample_weight_mode is not None:
+      raise ValueError('sample_weight_mode is not supported in Eager mode.')
     self.sample_weight_mode = sample_weight_mode
 
     # Prepare loss functions.
     if isinstance(loss, dict):
       for name in loss:
         if name not in self.output_names:
-          raise ValueError('Unknown entry in loss '
-                           'dictionary: "' + name + '". '
-                           'Only expected the following keys: ' +
-                           str(self.output_names))
+          raise ValueError(
+              'Unknown entry in loss '
+              'dictionary: "' + name + '". '
+              'Only expected the following keys: ' + str(self.output_names))
       loss_functions = []
       for name in self.output_names:
         if name not in loss:
@@ -657,7 +656,7 @@ class Model(Network):
     elif isinstance(loss, list):
       if len(loss) != len(self.outputs):
         raise ValueError('When passing a list as loss, '
-                         'it should have one entry per model output. '
+                         'it should have one entry per model outputs. '
                          'The model has ' + str(len(self.outputs)) +
                          ' outputs, but you passed loss=' + str(loss))
       loss_functions = [losses.get(l) for l in loss]
@@ -665,6 +664,7 @@ class Model(Network):
       loss_function = losses.get(loss)
       loss_functions = [loss_function for _ in range(len(self.outputs))]
     self.loss_functions = loss_functions
+
     weighted_losses = [_weighted_masked_objective(fn) for fn in loss_functions]
     skip_target_indices = []
     skip_target_weighing_indices = []
@@ -678,11 +678,12 @@ class Model(Network):
         skip_target_weighing_indices.append(i)
 
     # Prepare output masks.
-    masks = self.compute_mask(self.inputs, mask=None)
-    if masks is None:
-      masks = [None for _ in self.outputs]
-    if not isinstance(masks, list):
-      masks = [masks]
+    if context.in_graph_mode():
+      masks = self.compute_mask(self.inputs, mask=None)
+      if masks is None:
+        masks = [None for _ in self.outputs]
+      if not isinstance(masks, list):
+        masks = [masks]
 
     # Prepare loss weights.
     if loss_weights is None:
@@ -690,24 +691,50 @@ class Model(Network):
     elif isinstance(loss_weights, dict):
       for name in loss_weights:
         if name not in self.output_names:
-          raise ValueError('Unknown entry in loss_weights '
-                           'dictionary: "' + name + '". '
-                           'Only expected the following keys: ' +
-                           str(self.output_names))
+          raise ValueError(
+              'Unknown entry in loss_weights '
+              'dictionary: "' + name + '". '
+              'Only expected the following keys: ' + str(self.output_names))
       loss_weights_list = []
       for name in self.output_names:
         loss_weights_list.append(loss_weights.get(name, 1.))
     elif isinstance(loss_weights, list):
       if len(loss_weights) != len(self.outputs):
-        raise ValueError('When passing a list as loss_weights, '
-                         'it should have one entry per model output. '
-                         'The model has ' + str(len(self.outputs)) +
-                         ' outputs, but you passed loss_weights=' +
-                         str(loss_weights))
+        raise ValueError(
+            'When passing a list as loss_weights, '
+            'it should have one entry per model output. '
+            'The model has ' + str(len(self.outputs)) +
+            ' outputs, but you passed loss_weights=' + str(loss_weights))
       loss_weights_list = loss_weights
     else:
       raise TypeError('Could not interpret loss_weights argument: ' +
                       str(loss_weights) + ' - expected a list of dicts.')
+    self.loss_weights_list = loss_weights_list
+
+    # initialization for Eager mode execution
+    if context.in_eager_mode():
+      if target_tensors is not None:
+        raise ValueError('target_tensors are not currently supported in Eager'
+                         'mode.')
+      self.total_loss = None
+      self.metrics = metrics
+      self.weighted_metrics = weighted_metrics
+      self.metrics_tensors = []
+      self.metrics_names = ['loss']
+      for i in range(len(self.outputs)):
+        if len(self.outputs) > 1:
+          self.metrics_names.append(self.output_names[i] + '_loss')
+      self.nested_metrics = _collect_metrics(metrics, self.output_names)
+      self._feed_sample_weight_modes = []
+      for i in range(len(self.outputs)):
+        self._feed_sample_weight_modes.append(None)
+      self.sample_weights = []
+      self.targets = []
+      self._collected_trainable_weights = self.trainable_weights
+      for i in range(len(self.outputs)):
+        self._feed_output_names.append(self.output_names[i])
+
+      return
 
     # Prepare targets of model.
     self.targets = []
@@ -715,30 +742,31 @@ class Model(Network):
     if target_tensors is not None:
       if isinstance(target_tensors, list):
         if len(target_tensors) != len(self.outputs):
-          raise ValueError('When passing a list as `target_tensors`, '
-                           'it should have one entry per model output. '
-                           'The model has ' + str(len(self.outputs)) +
-                           ' outputs, but you passed target_tensors=' +
-                           str(target_tensors))
+          raise ValueError(
+              'When passing a list as `target_tensors`, '
+              'it should have one entry per model output. '
+              'The model has ' + str(len(self.outputs)) +
+              ' outputs, but you passed target_tensors=' + str(target_tensors))
       elif isinstance(target_tensors, dict):
         for name in target_tensors:
           if name not in self.output_names:
-            raise ValueError('Unknown entry in `target_tensors` '
-                             'dictionary: "' + name + '". '
-                             'Only expected the following keys: ' +
-                             str(self.output_names))
-        target_tensors_ = []
+            raise ValueError(
+                'Unknown entry in `target_tensors` '
+                'dictionary: "' + name + '". '
+                'Only expected the following keys: ' + str(self.output_names))
+        tmp_target_tensors = []
         for name in self.output_names:
-          target_tensors_.append(target_tensors.get(name, None))
-        target_tensors = target_tensors_
+          tmp_target_tensors.append(target_tensors.get(name, None))
+        target_tensors = tmp_target_tensors
       else:
         raise TypeError('Expected `target_tensors` to be '
                         'a list or dict, but got:', target_tensors)
+
     for i in range(len(self.outputs)):
       if i in skip_target_indices:
         self.targets.append(None)
       else:
-        shape = self.internal_output_shapes[i]
+        shape = self._internal_output_shapes[i]
         name = self.output_names[i]
         if target_tensors is not None:
           target = target_tensors[i]
@@ -766,24 +794,24 @@ class Model(Network):
     if isinstance(sample_weight_mode, dict):
       for name in sample_weight_mode:
         if name not in self.output_names:
-          raise ValueError('Unknown entry in '
-                           'sample_weight_mode dictionary: "' + name + '". '
-                           'Only expected the following keys: ' +
-                           str(self.output_names))
+          raise ValueError(
+              'Unknown entry in '
+              'sample_weight_mode dictionary: "' + name + '". '
+              'Only expected the following keys: ' + str(self.output_names))
       for i, name in enumerate(self.output_names):
         if i in skip_target_weighing_indices:
           weight = None
           sample_weight_modes.append(None)
         else:
           if name not in sample_weight_mode:
-            raise ValueError('Output "' + name +
-                             '" missing from sample_weight_modes '
-                             'dictionary')
+            raise ValueError(
+                'Output "' + name + '" missing from sample_weight_modes '
+                'dictionary')
           if sample_weight_mode.get(name) == 'temporal':
             weight = K.placeholder(ndim=2, name=name + '_sample_weights')
             sample_weight_modes.append('temporal')
           else:
-            weight = K.placeholder(ndim=1, name=name + '_sample_weights')
+            weight = K.placeholder(ndim=1, name=name + 'sample_weights')
             sample_weight_modes.append(None)
         sample_weights.append(weight)
     elif isinstance(sample_weight_mode, list):
@@ -894,23 +922,36 @@ class Model(Network):
           metric_name_prefix = 'weighted_' if weights is not None else ''
 
           for metric in metrics:
-            if metric == 'accuracy' or metric == 'acc':
-              # custom handling of accuracy
+            if metric in ('accuracy', 'acc', 'crossentropy', 'ce'):
+              # custom handling of accuracy/crossentropy
               # (because of class mode duality)
-              output_shape = self.internal_output_shapes[i]
+              output_shape = self._internal_output_shapes[i]
               if (output_shape[-1] == 1 or
                   self.loss_functions[i] == losses.binary_crossentropy):
-                # case: binary accuracy
-                acc_fn = metrics_module.binary_accuracy
+                # case: binary accuracy/crossentropy
+                if metric in ('accuracy', 'acc'):
+                  acc_fn = metrics_module.binary_accuracy
+                elif metric in ('crossentropy', 'ce'):
+                  acc_fn = metrics_module.binary_crossentropy
               elif self.loss_functions[
                   i] == losses.sparse_categorical_crossentropy:
-                # case: categorical accuracy with sparse targets
-                acc_fn = metrics_module.sparse_categorical_accuracy
+                # case: categorical accuracy/crossentropy with sparse targets
+                if metric in ('accuracy', 'acc'):
+                  acc_fn = metrics_module.sparse_categorical_accuracy
+                elif metric in ('crossentropy', 'ce'):
+                  acc_fn = metrics_module.sparse_categorical_crossentropy
               else:
-                acc_fn = metrics_module.categorical_accuracy
-
+                # case: categorical accuracy/crossentropy
+                if metric in ('accuracy', 'acc'):
+                  acc_fn = metrics_module.categorical_accuracy
+                elif metric in ('crossentropy', 'ce'):
+                  acc_fn = metrics_module.categorical_crossentropy
+              if metric in ('accuracy', 'acc'):
+                suffix = 'acc'
+              elif metric in ('crossentropy', 'ce'):
+                suffix = 'ce'
               weighted_metric_fn = _weighted_masked_objective(acc_fn)
-              metric_name = metric_name_prefix + 'acc'
+              metric_name = metric_name_prefix + suffix
             else:
               metric_fn = metrics_module.get(metric)
               weighted_metric_fn = _weighted_masked_objective(metric_fn)
@@ -930,7 +971,7 @@ class Model(Network):
     self._feed_sample_weights = []
     for i in range(len(self.sample_weights)):
       if i not in skip_target_weighing_indices:
-        self._feed_sample_weights.append(sample_weights[i])
+        self._feed_sample_weights.append(self.sample_weights[i])
 
     # Functions for train, test and predict will
     # be compiled lazily when required.
@@ -949,7 +990,7 @@ class Model(Network):
     """Check trainable weights count consistency.
 
     This will raise a warning if `trainable_weights` and
-    `_collected_trainable_weights` are consistent (i.e. have the same
+    `_collected_trainable_weights` are inconsistent (i.e. have different
     number of parameters).
     Inconsistency will typically arise when one modifies `model.trainable`
     without calling `model.compile` again.
@@ -959,9 +1000,10 @@ class Model(Network):
 
     if len(self.trainable_weights) != len(self._collected_trainable_weights):
       logging.warning(
-          'Discrepancy between trainable weights and collected trainable'
-          ' weights, did you set `model.trainable` without calling'
-          ' `model.compile` after ?')
+          UserWarning(
+              'Discrepancy between trainable weights and collected trainable'
+              ' weights, did you set `model.trainable` without calling'
+              ' `model.compile` after ?'))
 
   def _make_train_function(self):
     if not hasattr(self, 'train_function'):
@@ -978,6 +1020,7 @@ class Model(Network):
         with K.name_scope(self.optimizer.__class__.__name__):
           training_updates = self.optimizer.get_updates(
               params=self._collected_trainable_weights, loss=self.total_loss)
+
         updates = self.updates + training_updates
         # Gets loss and metrics. Updates weights at each call.
         self.train_function = K.function(
@@ -1050,18 +1093,21 @@ class Model(Network):
         processed based on the size of the first dimension of the
         first input numpy array. When steps is not `None` and
         `batch_size` is `None`, returns `None`.
+
+    Raises:
+        ValueError: In case of invalid arguments.
     """
     if steps is not None:
       num_samples = None
       if batch_size is not None:
-        raise ValueError('If ' + steps_name +
-                         ' is set, the `batch_size` must be None.')
+        raise ValueError(
+            'If ' + steps_name + ' is set, the `batch_size` must be None.')
     elif ins and hasattr(ins[0], 'shape'):
       num_samples = ins[0].shape[0]
     else:
-      raise ValueError('Either the input data should have '
-                       'a defined shape, or ' + steps_name +
-                       ' should be specified.')
+      raise ValueError(
+          'Either the input data should have '
+          'a defined shape, or ' + steps_name + ' should be specified.')
     return num_samples
 
   def _fit_loop(self,
@@ -1104,31 +1150,33 @@ class Model(Network):
         steps_per_epoch: Total number of steps (batches of samples)
             before declaring one epoch finished and starting the
             next epoch. Ignored with the default value of `None`.
-        validation_steps: Number of steps to run validation for (only if doing
-          validation from data tensors). Ignored with default value of `None`.
+        validation_steps: Number of steps to run validation for
+            (only if doing validation from data tensors).
+            Ignored with the default value of `None`.
 
     Returns:
         `History` object.
 
     Raises:
-      ValueError: In case of invalid argument values.
+        ValueError: in case of invalid arguments.
     """
     do_validation = False
     if val_f and val_ins:
       do_validation = True
-      if (verbose and ins and
-          hasattr(ins[0], 'shape') and hasattr(val_ins[0], 'shape')):
+      if verbose and ins and hasattr(ins[0], 'shape') and hasattr(
+          val_ins[0], 'shape'):
         print('Train on %d samples, validate on %d samples' %
               (ins[0].shape[0], val_ins[0].shape[0]))
     if validation_steps:
-      if steps_per_epoch is None:
-        raise ValueError('Can only use `validation_steps` when doing step-wise '
-                         'training, i.e. `steps_per_epoch` must be set.')
       do_validation = True
+      if steps_per_epoch is None:
+        raise ValueError('Can only use `validation_steps` '
+                         'when doing step-wise '
+                         'training, i.e. `steps_per_epoch` '
+                         'must be set.')
 
     num_train_samples = self._check_num_samples(
         ins, batch_size, steps_per_epoch, 'steps_per_epoch')
-
     if num_train_samples is not None:
       index_array = np.arange(num_train_samples)
 
@@ -1151,6 +1199,7 @@ class Model(Network):
       callback_model = self
 
     callbacks.set_model(callback_model)
+
     callbacks.set_params({
         'batch_size': batch_size,
         'epochs': epochs,
@@ -1165,6 +1214,13 @@ class Model(Network):
     for cbk in callbacks:
       cbk.validation_data = val_ins
 
+    # To prevent a slowdown, we find beforehand the arrays that need conversion.
+    feed = self._feed_inputs + self._feed_targets + self._feed_sample_weights
+    indices_for_conversion_to_dense = []
+    for i in range(len(feed)):
+      if issparse is not None and issparse(ins[i]) and not K.is_sparse(feed[i]):
+        indices_for_conversion_to_dense.append(i)
+
     for epoch in range(initial_epoch, epochs):
       callbacks.on_epoch_begin(epoch)
       epoch_logs = {}
@@ -1204,6 +1260,7 @@ class Model(Network):
           np.random.shuffle(index_array)
 
         batches = _make_batches(num_train_samples, batch_size)
+
         for batch_index, (batch_start, batch_end) in enumerate(batches):
           batch_ids = index_array[batch_start:batch_end]
           try:
@@ -1220,6 +1277,9 @@ class Model(Network):
           batch_logs['batch'] = batch_index
           batch_logs['size'] = len(batch_ids)
           callbacks.on_batch_begin(batch_index, batch_logs)
+          for i in indices_for_conversion_to_dense:
+            ins_batch[i] = ins_batch[i].toarray()
+
           outs = f(ins_batch)
           if not isinstance(outs, list):
             outs = [outs]
@@ -1268,6 +1328,13 @@ class Model(Network):
         progbar = Progbar(target=steps)
       else:
         progbar = Progbar(target=num_samples)
+
+    indices_for_conversion_to_dense = []
+    for i in range(len(self._feed_inputs)):
+      if (issparse is not None and issparse(ins[i]) and
+          not K.is_sparse(self._feed_inputs[i])):
+        indices_for_conversion_to_dense.append(i)
+
     if steps is not None:
       # Step-based predictions.
       # Since we do not know how many samples
@@ -1305,6 +1372,9 @@ class Model(Network):
           ins_batch = _slice_arrays(ins[:-1], batch_ids) + [ins[-1]]
         else:
           ins_batch = _slice_arrays(ins, batch_ids)
+        for i in indices_for_conversion_to_dense:
+          ins_batch[i] = ins_batch[i].toarray()
+
         batch_outs = f(ins_batch)
         if not isinstance(batch_outs, list):
           batch_outs = [batch_outs]
@@ -1341,12 +1411,19 @@ class Model(Network):
     """
     num_samples = self._check_num_samples(ins, batch_size, steps, 'steps')
     outs = []
-
     if verbose == 1:
       if steps is not None:
         progbar = Progbar(target=steps)
       else:
         progbar = Progbar(target=num_samples)
+
+    # To prevent a slowdown, we find beforehand the arrays that need conversion.
+    feed = self._feed_inputs + self._feed_targets + self._feed_sample_weights
+    indices_for_conversion_to_dense = []
+    for i in range(len(feed)):
+      if issparse is not None and issparse(ins[i]) and not K.is_sparse(feed[i]):
+        indices_for_conversion_to_dense.append(i)
+
     if steps is not None:
       for step in range(steps):
         batch_outs = f(ins)
@@ -1365,8 +1442,6 @@ class Model(Network):
       for i in range(len(outs)):
         outs[i] /= steps
     else:
-      if verbose == 1:
-        progbar = Progbar(target=num_samples)
       batches = _make_batches(num_samples, batch_size)
       index_array = np.arange(num_samples)
       for batch_index, (batch_start, batch_end) in enumerate(batches):
@@ -1376,8 +1451,11 @@ class Model(Network):
           ins_batch = _slice_arrays(ins[:-1], batch_ids) + [ins[-1]]
         else:
           ins_batch = _slice_arrays(ins, batch_ids)
+        for i in indices_for_conversion_to_dense:
+          ins_batch[i] = ins_batch[i].toarray()
 
         batch_outs = f(ins_batch)
+
         if isinstance(batch_outs, list):
           if batch_index == 0:
             for batch_out in enumerate(batch_outs):
@@ -1388,7 +1466,6 @@ class Model(Network):
           if batch_index == 0:
             outs.append(0.)
           outs[0] += batch_outs * len(batch_ids)
-
         if verbose == 1:
           progbar.update(batch_end)
       for i in range(len(outs)):
@@ -1484,7 +1561,8 @@ class Model(Network):
           sample_weight=None,
           initial_epoch=0,
           steps_per_epoch=None,
-          validation_steps=None):
+          validation_steps=None,
+          **kwargs):
     """Trains the model for a fixed number of epochs (iterations on a dataset).
 
     Arguments:
@@ -1501,10 +1579,9 @@ class Model(Network):
             dictionary mapping output names to Numpy arrays.
             `y` can be `None` (default) if feeding from
             TensorFlow data tensors.
-            Can be `None` (default) if feeding from framework-native tensors.
         batch_size: Integer or `None`.
             Number of samples per gradient update.
-            If unspecified, it will default to 32.
+            If unspecified, `batch_size` will default to 32.
         epochs: Integer. Number of epochs to train the model.
             An epoch is an iteration over the entire `x` and `y`
             data provided.
@@ -1513,7 +1590,7 @@ class Model(Network):
             The model is not trained for a number of iterations
             given by `epochs`, but merely until the epoch
             of index `epochs` is reached.
-        verbose: 0, 1, or 2. Verbosity mode.
+        verbose: Integer. 0, 1, or 2. Verbosity mode.
             0 = silent, 1 = progress bar, 2 = one line per epoch.
         callbacks: List of `keras.callbacks.Callback` instances.
             List of callbacks to apply during training.
@@ -1530,7 +1607,7 @@ class Model(Network):
             `(x_val, y_val, val_sample_weights)` on which to evaluate
             the loss and any model metrics at the end of each epoch.
             The model will not be trained on this data.
-            This will override `validation_split`.
+            `validation_data` will override `validation_split`.
         shuffle: Boolean (whether to shuffle the training data
             before each epoch) or str (for 'batch').
             'batch' is a special option for dealing with the
@@ -1553,17 +1630,20 @@ class Model(Network):
             to apply a different weight to every timestep of every sample.
             In this case you should make sure to specify
             `sample_weight_mode="temporal"` in `compile()`.
-        initial_epoch: Epoch at which to start training
+        initial_epoch: Integer.
+            Epoch at which to start training
             (useful for resuming a previous training run).
-        steps_per_epoch: Total number of steps (batches of samples)
+        steps_per_epoch: Integer or `None`.
+            Total number of steps (batches of samples)
             before declaring one epoch finished and starting the
             next epoch. When training with input tensors such as
             TensorFlow data tensors, the default `None` is equal to
-            the number of unique samples in your dataset divided by
+            the number of samples in your dataset divided by
             the batch size, or 1 if that cannot be determined.
         validation_steps: Only relevant if `steps_per_epoch`
             is specified. Total number of steps (batches of samples)
             to validate before stopping.
+        **kwargs: Used for backwards compatibility.
 
     Returns:
         A `History` object. Its `History.history` attribute is
@@ -1572,12 +1652,21 @@ class Model(Network):
         and validation metrics values (if applicable).
 
     Raises:
+        RuntimeError: If the model was never compiled.
         ValueError: In case of mismatch between the provided input data
             and what the model expects.
     """
     # Backwards compatibility
     if batch_size is None and steps_per_epoch is None:
       batch_size = 32
+    # Legacy support
+    if 'nb_epoch' in kwargs:
+      logging.warning(
+          'The `nb_epoch` argument in `fit` '
+          'has been renamed `epochs`.')
+      epochs = kwargs.pop('nb_epoch')
+    if kwargs:
+      raise TypeError('Unrecognized keyword arguments: ' + str(kwargs))
     if x is None and y is None and steps_per_epoch is None:
       raise ValueError('If fitting from data tensors, '
                        'you should specify the `steps_per_epoch` '
@@ -1590,7 +1679,6 @@ class Model(Network):
         class_weight=class_weight,
         check_batch_axis=False,
         batch_size=batch_size)
-
     # Prepare validation data.
     do_validation = False
     val_ins = []
@@ -1644,38 +1732,65 @@ class Model(Network):
       ins = x + y + sample_weights + [1.]
     else:
       ins = x + y + sample_weights
-    self._make_train_function()
-    f = self.train_function
 
     # Prepare display labels.
     out_labels = self._get_deduped_metrics_names()
 
-    if do_validation:
-      self._make_test_function()
-      val_f = self.test_function
-      callback_metrics = copy.copy(out_labels) + [
-          'val_' + n for n in out_labels
-      ]
+    if context.in_eager_mode():
+      if do_validation:
+        callback_metrics = copy.copy(out_labels) + [
+            'val_' + n for n in out_labels
+        ]
+      else:
+        callback_metrics = copy.copy(out_labels)
+
+      return training_eager.fit_loop(
+          self,
+          ins,
+          out_labels=out_labels,
+          batch_size=batch_size,
+          epochs=epochs,
+          verbose=verbose,
+          callbacks=callbacks,
+          val_ins=val_ins,
+          shuffle=shuffle,
+          callback_metrics=callback_metrics,
+          initial_epoch=initial_epoch,
+          steps_per_epoch=steps_per_epoch,
+          validation_steps=validation_steps)
     else:
-      val_f = None
-      callback_metrics = copy.copy(out_labels)
-
-    # Delegate logic to `_fit_loop`.
-    return self._fit_loop(
-        f,
-        ins,
-        out_labels=out_labels,
-        batch_size=batch_size,
-        epochs=epochs,
-        verbose=verbose,
-        callbacks=callbacks,
-        val_f=val_f,
-        val_ins=val_ins,
-        shuffle=shuffle,
-        callback_metrics=callback_metrics,
-        initial_epoch=initial_epoch,
-        steps_per_epoch=steps_per_epoch,
-        validation_steps=validation_steps)
+      self._make_train_function()
+      f = self.train_function
+
+      if do_validation:
+        if context.in_graph_mode():
+          self._make_test_function()
+          val_f = self.test_function
+        else:
+          val_f = None
+        callback_metrics = copy.copy(out_labels) + [
+            'val_' + n for n in out_labels
+        ]
+      else:
+        val_f = None
+        callback_metrics = copy.copy(out_labels)
+
+      # Delegate logic to `_fit_loop`.
+      return self._fit_loop(
+          f,
+          ins,
+          out_labels=out_labels,
+          batch_size=batch_size,
+          epochs=epochs,
+          verbose=verbose,
+          callbacks=callbacks,
+          val_f=val_f,
+          val_ins=val_ins,
+          shuffle=shuffle,
+          callback_metrics=callback_metrics,
+          initial_epoch=initial_epoch,
+          steps_per_epoch=steps_per_epoch,
+          validation_steps=validation_steps)
 
   def evaluate(self,
                x=None,
@@ -1694,14 +1809,14 @@ class Model(Network):
             If input layers in the model are named, you can also pass a
             dictionary mapping input names to Numpy arrays.
             `x` can be `None` (default) if feeding from
-            framework-native tensors (e.g. TensorFlow data tensors).
+            TensorFlow data tensors.
         y: Numpy array of target (label) data
             (if the model has a single output),
             or list of Numpy arrays (if the model has multiple outputs).
             If output layers in the model are named, you can also pass a
             dictionary mapping output names to Numpy arrays.
             `y` can be `None` (default) if feeding from
-            framework-native tensors (e.g. TensorFlow data tensors).
+            TensorFlow data tensors.
         batch_size: Integer or `None`.
             Number of samples per evaluation step.
             If unspecified, `batch_size` will default to 32.
@@ -1721,8 +1836,7 @@ class Model(Network):
         steps: Integer or `None`.
             Total number of steps (batches of samples)
             before declaring the evaluation round finished.
-            The default `None` is equal to the number of unique samples in
-            your dataset divided by the batch size.
+            Ignored with the default value of `None`.
 
     Returns:
         Scalar test loss (if the model has a single output and no metrics)
@@ -1731,7 +1845,7 @@ class Model(Network):
         the display labels for the scalar outputs.
 
     Raises:
-      ValueError: In case of invalid arguments.
+        ValueError: in case of invalid arguments.
     """
     # Backwards compatibility.
     if batch_size is None and steps is None:
@@ -1752,10 +1866,15 @@ class Model(Network):
       ins = x + y + sample_weights + [0.]
     else:
       ins = x + y + sample_weights
-    self._make_test_function()
-    f = self.test_function
-    return self._test_loop(
-        f, ins, batch_size=batch_size, verbose=verbose, steps=steps)
+
+    if context.in_eager_mode():
+      return training_eager.test_loop(
+          self, ins, batch_size=batch_size, verbose=verbose, steps=steps)
+    else:
+      self._make_test_function()
+      f = self.test_function
+      return self._test_loop(
+          f, ins, batch_size=batch_size, verbose=verbose, steps=steps)
 
   def predict(self, x, batch_size=None, verbose=0, steps=None):
     """Generates output predictions for the input samples.
@@ -1807,10 +1926,16 @@ class Model(Network):
       ins = x + [0.]
     else:
       ins = x
-    self._make_predict_function()
-    f = self.predict_function
-    return self._predict_loop(
-        f, ins, batch_size=batch_size, verbose=verbose, steps=steps)
+
+    if context.in_eager_mode():
+      return training_eager.predict_loop(
+          self, ins, batch_size=batch_size, verbose=verbose, steps=steps)
+    else:
+      self._make_predict_function()
+      f = self.predict_function
+
+      return self._predict_loop(
+          f, ins, batch_size=batch_size, verbose=verbose, steps=steps)
 
   def train_on_batch(self, x, y, sample_weight=None, class_weight=None):
     """Runs a single gradient update on a single batch of data.
@@ -1846,6 +1971,7 @@ class Model(Network):
         or list of scalars (if the model has multiple outputs
         and/or metrics). The attribute `model.metrics_names` will give you
         the display labels for the scalar outputs.
+
     """
     x, y, sample_weights = self._standardize_user_data(
         x,
@@ -1857,11 +1983,16 @@ class Model(Network):
       ins = x + y + sample_weights + [1.]
     else:
       ins = x + y + sample_weights
-    self._make_train_function()
-    outputs = self.train_function(ins)
-    if len(outputs) == 1:
-      return outputs[0]
-    return outputs
+
+    if context.in_eager_mode():
+      return training_eager.train_on_batch(self, ins)
+
+    if context.in_graph_mode():
+      self._make_train_function()
+      outputs = self.train_function(ins)
+      if len(outputs) == 1:
+        return outputs[0]
+      return outputs
 
   def test_on_batch(self, x, y, sample_weight=None):
     """Test the model on a single batch of samples.
@@ -1890,6 +2021,9 @@ class Model(Network):
         or list of scalars (if the model has multiple outputs
         and/or metrics). The attribute `model.metrics_names` will give you
         the display labels for the scalar outputs.
+
+    Raises:
+        ValueError: in case of invalid arguments.
     """
     x, y, sample_weights = self._standardize_user_data(
         x, y, sample_weight=sample_weight, check_batch_axis=True)
@@ -1897,11 +2031,16 @@ class Model(Network):
       ins = x + y + sample_weights + [0.]
     else:
       ins = x + y + sample_weights
-    self._make_test_function()
-    outputs = self.test_function(ins)
-    if len(outputs) == 1:
-      return outputs[0]
-    return outputs
+
+    if context.in_eager_mode():
+      return training_eager.test_on_batch(self, ins)
+
+    if context.in_graph_mode():
+      self._make_test_function()
+      outputs = self.test_function(ins)
+      if len(outputs) == 1:
+        return outputs[0]
+      return outputs
 
   def predict_on_batch(self, x):
     """Returns predictions for a single batch of samples.
@@ -1911,6 +2050,7 @@ class Model(Network):
 
     Returns:
         Numpy array(s) of predictions.
+
     """
     x = _standardize_input_data(x, self._feed_input_names,
                                 self._feed_input_shapes)
@@ -1918,11 +2058,25 @@ class Model(Network):
       ins = x + [0.]
     else:
       ins = x
-    self._make_predict_function()
-    outputs = self.predict_function(ins)
-    if len(outputs) == 1:
-      return outputs[0]
-    return outputs
+
+    if context.in_eager_mode():
+      ins_batch_converted = []
+      for ib in ins:
+        ins_batch_converted.append(ops.convert_to_tensor(ib, dtype=K.floatx()))
+
+      eager_model_inputs = []
+      for i in range(len(self.inputs)):
+        eager_model_inputs.append(ins_batch_converted[i])
+
+      outs = self(eager_model_inputs)  # pylint: disable=not-callable
+      return outs
+
+    if context.in_graph_mode():
+      self._make_predict_function()
+      outputs = self.predict_function(ins)
+      if len(outputs) == 1:
+        return outputs[0]
+      return outputs
 
   def fit_generator(self,
                     generator,
@@ -1937,8 +2091,7 @@ class Model(Network):
                     workers=1,
                     use_multiprocessing=False,
                     shuffle=True,
-                    initial_epoch=0,
-                    **kwargs):
+                    initial_epoch=0):
     """Fits the model on data yielded batch-by-batch by a Python generator.
 
     The generator is run in parallel to the model, for efficiency.
@@ -1950,22 +2103,31 @@ class Model(Network):
     using `use_multiprocessing=True`.
 
     Arguments:
-        generator: A generator or an instance of Sequence (keras.utils.Sequence)
-            object in order to avoid duplicate data when using multiprocessing.
+        generator: A generator or an instance of `Sequence`
+          (`keras.utils.Sequence`)
+            object in order to avoid duplicate data
+            when using multiprocessing.
             The output of the generator must be either
-            - a tuple (inputs, targets)
-            - a tuple (inputs, targets, sample_weights).
-            All arrays should contain the same number of samples.
+            - a tuple `(inputs, targets)`
+            - a tuple `(inputs, targets, sample_weights)`.
+            This tuple (a single output of the generator) makes a single batch.
+            Therefore, all arrays in this tuple must have the same length (equal
+            to the size of this batch). Different batches may have different
+              sizes.
+            For example, the last batch of the epoch is commonly smaller than
+              the
+            others, if the size of the dataset is not divisible by the batch
+              size.
             The generator is expected to loop over its data
             indefinitely. An epoch finishes when `steps_per_epoch`
             batches have been seen by the model.
         steps_per_epoch: Total number of steps (batches of samples)
             to yield from `generator` before declaring one epoch
             finished and starting the next epoch. It should typically
-            be equal to the number of unique samples of your dataset
+            be equal to the number of samples of your dataset
             divided by the batch size.
             Optional for `Sequence`: if unspecified, will use
-            `len(generator)` as a number of steps.
+            the `len(generator)` as a number of steps.
         epochs: Integer, total number of iterations on the data.
         verbose: Verbosity mode, 0, 1, or 2.
         callbacks: List of callbacks to be called during training.
@@ -1977,27 +2139,28 @@ class Model(Network):
             is a generator. Total number of steps (batches of samples)
             to yield from `generator` before stopping.
             Optional for `Sequence`: if unspecified, will use
-            `len(generator)` as a number of steps.
+            the `len(validation_data)` as a number of steps.
         class_weight: Dictionary mapping class indices to a weight
             for the class.
-        max_queue_size: Maximum size for the generator queue.
+        max_queue_size: Integer. Maximum size for the generator queue.
+            If unspecified, `max_queue_size` will default to 10.
         workers: Integer. Maximum number of processes to spin up
             when using process based threading.
             If unspecified, `workers` will default to 1. If 0, will
             execute the generator on the main thread.
-        use_multiprocessing: If True, use process based threading.
+        use_multiprocessing: Boolean. If True, use process based threading.
+            If unspecified, `workers` will default to False.
             Note that because
             this implementation relies on multiprocessing,
             you should not pass
             non picklable arguments to the generator
             as they can't be passed
             easily to children processes.
-        shuffle: Whether to shuffle the data at the beginning of each
-            epoch. Only used with instances of `Sequence`
-            (`keras.utils.Sequence`).
+        shuffle: Whether to shuffle the order of the batches at
+            the beginning of each epoch. Only used with instances
+            of `Sequence` (keras.utils.Sequence).
         initial_epoch: Epoch at which to start training
             (useful for resuming a previous training run)
-        **kwargs: support for legacy arguments.
 
     Returns:
         A `History` object.
@@ -2018,24 +2181,10 @@ class Model(Network):
         model.fit_generator(generate_arrays_from_file('/my_file.txt'),
                             steps_per_epoch=10000, epochs=10)
     ```
-
     Raises:
         ValueError: In case the generator yields
             data in an invalid format.
     """
-    # Legacy support
-    if 'max_q_size' in kwargs:
-      max_queue_size = kwargs.pop('max_q_size')
-      logging.warning('The argument `max_q_size` has been renamed '
-                      '`max_queue_size`. Update your method calls accordingly.')
-    if 'pickle_safe' in kwargs:
-      use_multiprocessing = kwargs.pop('pickle_safe')
-      logging.warning('The argument `pickle_safe` has been renamed '
-                      '`use_multiprocessing`. '
-                      'Update your method calls accordingly.')
-    if kwargs:
-      raise ValueError('Unrecognized keyword arguments: ' + str(kwargs))
-
     wait_time = 0.01  # in seconds
     epoch = initial_epoch
 
@@ -2046,10 +2195,11 @@ class Model(Network):
 
     is_sequence = isinstance(generator, Sequence)
     if not is_sequence and use_multiprocessing and workers > 1:
-      logging.warning('Using a generator with `use_multiprocessing=True`'
+      logging.warning(
+          UserWarning('Using a generator with `use_multiprocessing=True`'
                       ' and multiple workers may duplicate your data.'
                       ' Please consider using the`keras.utils.Sequence'
-                      ' class.')
+                      ' class.'))
     if steps_per_epoch is None:
       if is_sequence:
         steps_per_epoch = len(generator)
@@ -2098,26 +2248,47 @@ class Model(Network):
     })
     callbacks.on_train_begin()
 
-    if do_validation and not val_gen:
-      if len(validation_data) == 2:
-        val_x, val_y = validation_data  # pylint: disable=unpacking-non-sequence
-        val_sample_weight = None
-      elif len(validation_data) == 3:
-        val_x, val_y, val_sample_weight = validation_data  # pylint: disable=unpacking-non-sequence
-      else:
-        raise ValueError('`validation_data` should be a tuple '
-                         '`(val_x, val_y, val_sample_weight)` '
-                         'or `(val_x, val_y)`. Found: ' + str(validation_data))
-      val_x, val_y, val_sample_weights = self._standardize_user_data(
-          val_x, val_y, val_sample_weight)
-      val_data = val_x + val_y + val_sample_weights
-      if self.uses_learning_phase and not isinstance(K.learning_phase(), int):
-        val_data += [0.]
-      for cbk in callbacks:
-        cbk.validation_data = val_data
     enqueuer = None
+    val_enqueuer = None
 
     try:
+      if do_validation:
+        if val_gen:
+          if workers > 0:
+            if isinstance(validation_data, Sequence):
+              val_enqueuer = OrderedEnqueuer(
+                  validation_data, use_multiprocessing=use_multiprocessing)
+              if validation_steps is None:
+                validation_steps = len(validation_data)
+            else:
+              val_enqueuer = GeneratorEnqueuer(
+                  validation_data,
+                  use_multiprocessing=use_multiprocessing,
+                  wait_time=wait_time)
+            val_enqueuer.start(workers=workers, max_queue_size=max_queue_size)
+            validation_generator = val_enqueuer.get()
+          else:
+            validation_generator = validation_data
+        else:
+          if len(validation_data) == 2:
+            val_x, val_y = validation_data  # pylint: disable=unpacking-non-sequence
+            val_sample_weight = None
+          elif len(validation_data) == 3:
+            val_x, val_y, val_sample_weight = validation_data  # pylint: disable=unpacking-non-sequence
+          else:
+            raise ValueError(
+                '`validation_data` should be a tuple '
+                '`(val_x, val_y, val_sample_weight)` '
+                'or `(val_x, val_y)`. Found: ' + str(validation_data))
+          val_x, val_y, val_sample_weights = self._standardize_user_data(
+              val_x, val_y, val_sample_weight)
+          val_data = val_x + val_y + val_sample_weights
+          if self.uses_learning_phase and not isinstance(
+              K.learning_phase(), int):
+            val_data += [0.]
+          for cbk in callbacks:
+            cbk.validation_data = val_data
+
       if workers > 0:
         if is_sequence:
           enqueuer = OrderedEnqueuer(
@@ -2135,6 +2306,8 @@ class Model(Network):
         output_generator = generator
 
       callback_model.stop_training = False
+      # Construct epoch logs.
+      epoch_logs = {}
       while epoch < epochs:
         callbacks.on_epoch_begin(epoch)
         steps_done = 0
@@ -2178,8 +2351,6 @@ class Model(Network):
 
           callbacks.on_batch_end(batch_index, batch_logs)
 
-          # Construct epoch logs.
-          epoch_logs = {}
           batch_index += 1
           steps_done += 1
 
@@ -2187,11 +2358,7 @@ class Model(Network):
           if steps_done >= steps_per_epoch and do_validation:
             if val_gen:
               val_outs = self.evaluate_generator(
-                  validation_data,
-                  validation_steps,
-                  max_queue_size=max_queue_size,
-                  workers=workers,
-                  use_multiprocessing=use_multiprocessing)
+                  validation_generator, validation_steps, workers=0)
             else:
               # No need for try/except because
               # data has already been validated.
@@ -2216,8 +2383,12 @@ class Model(Network):
           break
 
     finally:
-      if enqueuer is not None:
-        enqueuer.stop()
+      try:
+        if enqueuer is not None:
+          enqueuer.stop()
+      finally:
+        if val_enqueuer is not None:
+          val_enqueuer.stop()
 
     callbacks.on_train_end()
     return self.history
@@ -2227,8 +2398,7 @@ class Model(Network):
                          steps=None,
                          max_queue_size=10,
                          workers=1,
-                         use_multiprocessing=False,
-                         **kwargs):
+                         use_multiprocessing=False):
     """Evaluates the model on a data generator.
 
     The generator should return the same kind of data
@@ -2256,7 +2426,6 @@ class Model(Network):
             non picklable arguments to the generator
             as they can't be passed
             easily to children processes.
-        **kwargs: support for legacy arguments.
 
     Returns:
         Scalar test loss (if the model has a single output and no metrics)
@@ -2264,23 +2433,13 @@ class Model(Network):
         and/or metrics). The attribute `model.metrics_names` will give you
         the display labels for the scalar outputs.
 
+    Raises:
+        ValueError: in case of invalid arguments.
+
     Raises:
         ValueError: In case the generator yields
             data in an invalid format.
     """
-    # Legacy support
-    if 'max_q_size' in kwargs:
-      max_queue_size = kwargs.pop('max_q_size')
-      logging.warning('The argument `max_q_size` has been renamed '
-                      '`max_queue_size`. Update your method calls accordingly.')
-    if 'pickle_safe' in kwargs:
-      use_multiprocessing = kwargs.pop('pickle_safe')
-      logging.warning('The argument `pickle_safe` has been renamed '
-                      '`use_multiprocessing`. '
-                      'Update your method calls accordingly.')
-    if kwargs:
-      raise ValueError('Unrecognized keyword arguments: ' + str(kwargs))
-
     self._make_test_function()
 
     steps_done = 0
@@ -2289,10 +2448,11 @@ class Model(Network):
     batch_sizes = []
     is_sequence = isinstance(generator, Sequence)
     if not is_sequence and use_multiprocessing and workers > 1:
-      logging.warning('Using a generator with `use_multiprocessing=True`'
+      logging.warning(
+          UserWarning('Using a generator with `use_multiprocessing=True`'
                       ' and multiple workers may duplicate your data.'
                       ' Please consider using the`keras.utils.Sequence'
-                      ' class.')
+                      ' class.'))
     if steps is None:
       if is_sequence:
         steps = len(generator)
@@ -2368,8 +2528,7 @@ class Model(Network):
                         max_queue_size=10,
                         workers=1,
                         use_multiprocessing=False,
-                        verbose=0,
-                        **kwargs):
+                        verbose=0):
     """Generates predictions for the input samples from a data generator.
 
     The generator should return the same kind of data as accepted by
@@ -2377,9 +2536,9 @@ class Model(Network):
 
     Arguments:
         generator: Generator yielding batches of input samples
-                or an instance of Sequence (keras.utils.Sequence)
-                object in order to avoid duplicate data
-                when using multiprocessing.
+            or an instance of Sequence (keras.utils.Sequence)
+            object in order to avoid duplicate data
+            when using multiprocessing.
         steps: Total number of steps (batches of samples)
             to yield from `generator` before stopping.
             Optional for `Sequence`: if unspecified, will use
@@ -2397,7 +2556,6 @@ class Model(Network):
             as they can't be passed
             easily to children processes.
         verbose: verbosity mode, 0 or 1.
-        **kwargs: support for legacy arguments.
 
     Returns:
         Numpy array(s) of predictions.
@@ -2406,17 +2564,6 @@ class Model(Network):
         ValueError: In case the generator yields
             data in an invalid format.
     """
-    # Legacy support
-    if 'max_q_size' in kwargs:
-      max_queue_size = kwargs.pop('max_q_size')
-      logging.warning('The argument `max_q_size` has been renamed '
-                      '`max_queue_size`. Update your method calls accordingly.')
-    if 'pickle_safe' in kwargs:
-      use_multiprocessing = kwargs.pop('pickle_safe')
-      logging.warning('The argument `pickle_safe` has been renamed '
-                      '`use_multiprocessing`. '
-                      'Update your method calls accordingly.')
-
     self._make_predict_function()
 
     steps_done = 0
@@ -2424,10 +2571,11 @@ class Model(Network):
     all_outs = []
     is_sequence = isinstance(generator, Sequence)
     if not is_sequence and use_multiprocessing and workers > 1:
-      logging.warn('Using a generator with `use_multiprocessing=True`'
-                   ' and multiple workers may duplicate your data.'
-                   ' Please consider using the`keras.utils.Sequence'
-                   ' class.')
+      logging.warning(
+          UserWarning('Using a generator with `use_multiprocessing=True`'
+                      ' and multiple workers may duplicate your data.'
+                      ' Please consider using the`keras.utils.Sequence'
+                      ' class.'))
     if steps is None:
       if is_sequence:
         steps = len(generator)
@@ -2498,6 +2646,6 @@ class Model(Network):
       else:
         return np.concatenate(all_outs[0])
     if steps_done == 1:
-      return [out for out in all_outs]
+      return [out[0] for out in all_outs]
     else:
       return [np.concatenate(out) for out in all_outs]
diff --git a/tensorflow/python/keras/_impl/keras/engine/training_eager.py b/tensorflow/python/keras/_impl/keras/engine/training_eager.py
new file mode 100644
index 0000000000000000000000000000000000000000..0a115969ca614d8d50a60f8980fa49bf404cc66f
--- /dev/null
+++ b/tensorflow/python/keras/_impl/keras/engine/training_eager.py
@@ -0,0 +1,666 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Keras training and evaluation routines.
+"""
+# pylint: disable=protected-access
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import numpy as np
+from tensorflow.python.eager.backprop import GradientTape
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.keras._impl.keras import backend as K
+from tensorflow.python.keras._impl.keras import callbacks as cbks
+from tensorflow.python.keras._impl.keras import losses
+from tensorflow.python.keras._impl.keras import metrics as metrics_module
+from tensorflow.python.keras._impl.keras.utils.generic_utils import Progbar
+
+
+def _make_batches(size, batch_size):
+  """Returns a list of batch indices (tuples of indices).
+
+  Arguments:
+      size: Integer, total size of the data to slice into batches.
+      batch_size: Integer, batch size.
+
+  Returns:
+      A list of tuples of array indices.
+  """
+  num_batches = int(np.ceil(size / float(batch_size)))
+  return [(i * batch_size, min(size, (i + 1) * batch_size))
+          for i in range(0, num_batches)]
+
+
+def _slice_arrays(arrays, start=None, stop=None):
+  """Slice an array or list of arrays.
+
+  This takes an array-like, or a list of
+  array-likes, and outputs:
+      - arrays[start:stop] if `arrays` is an array-like
+      - [x[start:stop] for x in arrays] if `arrays` is a list
+
+  Can also work on list/array of indices: `_slice_arrays(x, indices)`
+
+  Arguments:
+      arrays: Single array or list of arrays.
+      start: can be an integer index (start index)
+          or a list/array of indices
+      stop: integer (stop index); should be None if
+          `start` was a list.
+
+  Returns:
+      A slice of the array(s).
+
+  Raises:
+      ValueError: If the value of start is a list and stop is not None.
+  """
+  if arrays is None:
+    return [None]
+  if isinstance(start, list) and stop is not None:
+    raise ValueError('The stop argument has to be None if the value of start is'
+                     'a list.')
+  elif isinstance(arrays, list):
+    if hasattr(start, '__len__'):
+      # hdf5 datasets only support list objects as indices
+      if hasattr(start, 'shape'):
+        start = start.tolist()
+      return [None if x is None else x[start] for x in arrays]
+    else:
+      return [None if x is None else x[start:stop] for x in arrays]
+  else:
+    if hasattr(start, '__len__'):
+      if hasattr(start, 'shape'):
+        start = start.tolist()
+      return arrays[start]
+    elif hasattr(start, '__getitem__'):
+      return arrays[start:stop]
+    else:
+      return [None]
+
+
+def _get_metrics_info(metric, internal_output_shapes=None, loss_func=None):
+  if metric == 'accuracy' or metric == 'acc':
+    # custom handling of accuracy
+    # (because of class mode duality)
+    output_shape = internal_output_shapes
+    if output_shape[-1] == 1 or loss_func == losses.binary_crossentropy:
+      # case: binary accuracy
+      acc_fn = metrics_module.binary_accuracy
+    elif loss_func == losses.sparse_categorical_crossentropy:
+      # case: categorical accuracy with sparse targets
+      acc_fn = metrics_module.sparse_categorical_accuracy
+    else:
+      acc_fn = metrics_module.categorical_accuracy
+
+    metric_name = 'acc'
+    return metric_name, acc_fn
+  else:
+    metric_fn = metrics_module.get(metric)
+    metric_name = metric_fn.__name__
+    return metric_name, metric_fn
+
+
+def _eager_loss_fn(outputs, targets, loss_fn, output_name):
+  with K.name_scope(output_name + '_loss'):
+    loss = loss_fn(targets, outputs)
+  return loss
+
+
+def _eager_metrics_fn(model, outputs, targets):
+  """Calculates the metrics for each output of the given model.
+
+  Arguments:
+      model: The model on which metrics are being calculated.
+      outputs: The outputs of the given model.
+      targets: The predictions or targets of the given model.
+
+  Returns:
+      Returns the metric names and metric results for each output of the model.
+  """
+  metric_names = []
+  metric_results = []
+  if not isinstance(outputs, list):
+    outputs = [outputs]
+
+  if not isinstance(targets, list):
+    targets = [targets]
+
+  for i in range(len(model.outputs)):
+    output_metrics = model.nested_metrics[i]
+    for nested_output_metric in output_metrics:
+      metric_name, metric_fn = _get_metrics_info(
+          nested_output_metric, model._internal_output_shapes[i],
+          model.loss_functions[i])
+
+      if len(model.output_names) > 1:
+        metric_name = model.output_names[i] + '_' + metric_name
+        if metric_name not in model.metrics_names:
+          model.metrics_names.append(metric_name)
+
+      with K.name_scope(metric_name):
+        metric_result = metric_fn(outputs[i], targets[i])
+        metric_names.append(metric_name)
+        metric_results.append(K.mean(metric_result))
+
+  return metric_names, metric_results
+
+
+def _model_loss(model, inputs, targets):
+  """Calculates the loss for a given model.
+
+  Arguments:
+     model: The model on which metrics are being calculated.
+     inputs: The inputs of the given model. This is typically the mini batch of
+              data that is fed to the model.
+     targets: The predictions or targets of the given model.
+
+  Returns:
+     Returns the model output, total loss and loss value calculated using the
+     specified loss function. The total loss includes regularization losses and
+     applies masking and sample weighting to the loss value.
+  """
+  total_loss = 0
+  outs = model(inputs)
+  if not isinstance(outs, list):
+    outs = [outs]
+
+  if not isinstance(targets, list):
+    targets = [targets]
+
+  loss_metrics = []
+  with K.name_scope('loss'):
+    for i, loss_fn in enumerate(model.loss_functions):
+      # compute the loss
+      output_loss = _eager_loss_fn(outs[i], targets[i], loss_fn,
+                                   model.output_names[i])
+      loss_metrics.append(K.mean(output_loss))
+
+      mask = outs[i]._keras_mask
+      # adapted from weighted_loss_fn
+      if mask is not None:
+        # mask should have the same shape as output_loss
+        output_loss *= mask
+        #  the loss per batch should be proportional
+        #  to the number of unmasked samples.
+        output_loss /= K.mean(mask)
+
+      # adapted from weighted_loss_fn
+      # apply sample weighting
+      if model.sample_weights:
+        # reduce score_array to same ndim as weight array
+        ndim = K.ndim(output_loss)
+        weight_ndim = K.ndim(model.sample_weights)
+        output_loss = K.mean(output_loss, axis=list(range(weight_ndim, ndim)))
+        output_loss *= model.sample_weights
+        output_loss /= K.mean(K.cast(K.not_equal(model.sample_weights, 0),
+                                     K.floatx()))
+        output_loss = K.mean(output_loss)
+
+      loss_weight = model.loss_weights_list[i]
+      if total_loss is None:
+        total_loss = loss_weight * output_loss
+      else:
+        total_loss += loss_weight * output_loss
+
+    total_loss = K.mean(total_loss)
+    # Add regularization losses
+    custom_losses = []
+    for layer in model.layers:
+      if layer.losses:
+        custom_losses += layer.losses
+
+    if custom_losses:
+      total_loss += sum(custom_losses)
+
+  return outs, total_loss, loss_metrics
+
+
+def _process_single_batch(eager_model_inputs, eager_model_outputs, model,
+                          training=True):
+  """Calculate the loss and gradient for one input batch.
+
+     The model weights are updated if training is set to True.
+
+  Arguments:
+      eager_model_inputs: Input batch data.
+      eager_model_outputs: Output batch data.
+      model: Model whose loss has to be calculated.
+      training: The boolean represents if the weights of the model are updated.
+              'fit' methods will set this to True while 'evaluate' methods will
+              set this to False.
+
+  Returns:
+      output of the model, total loss and the loss associated with each output.
+
+  Raises:
+      ValueError: If the model loss is 0 or if the trainable weights list is
+                  empty when the trainable parameter is set to True.
+  """
+  K.set_learning_phase(training)
+  with GradientTape() as tape:
+    outs, loss, loss_metrics = _model_loss(model, eager_model_inputs,
+                                           eager_model_outputs)
+    if loss is None:
+      raise ValueError('The model cannot be run '
+                       'because it has no loss to optimize.')
+  if training:
+    if not model._collected_trainable_weights:
+      raise ValueError('The list of trainable weights is empty. Make sure that '
+                       'you are not setting model.trainable to False before '
+                       'compiling the model.')
+    grads = tape.gradient(loss, model._collected_trainable_weights)
+    model.optimizer.apply_gradients(zip(grads,
+                                        model._collected_trainable_weights))
+  return outs, loss, loss_metrics
+
+
+def train_on_batch(model, ins):
+  """Calculates the loss and gradient updates for one input batch.
+
+  Arguments:
+      model: Given model on which loss and gradients are calculated.
+      ins: Input and output batch numpy arrays.
+
+  Returns:
+      total loss and the loss associated with each output.
+  """
+  ins_batch_converted = []
+  for ib in ins:
+    ins_batch_converted.append(ops.convert_to_tensor(ib, dtype=K.floatx()))
+  eager_model_inputs = []
+  eager_model_outputs = []
+  for i in range(len(model.inputs)):
+    eager_model_inputs.append(ins_batch_converted[i])
+  for i in range(len(model.inputs), len(ins_batch_converted)):
+    eager_model_outputs.append(ins_batch_converted[i])
+  outs, loss, _ = _process_single_batch(
+      eager_model_inputs, eager_model_outputs, model)
+  if not isinstance(outs, list):
+    outs = [outs]
+  _, metrics_results = _eager_metrics_fn(
+      model, outs, eager_model_outputs)
+  if not isinstance(loss, list):
+    loss = [loss]
+  return loss + metrics_results
+
+
+def test_on_batch(model, ins):
+  """Calculates the loss for one input batch.
+
+  Arguments:
+      model: Given model on which loss is calculated.
+      ins: Input and output batch numpy arrays.
+
+  Returns:
+      total loss, loss and metrics associated with each output.
+  """
+  ins_batch_converted = []
+  for ib in ins:
+    ins_batch_converted.append(ops.convert_to_tensor(ib, dtype=K.floatx()))
+  eager_model_inputs = []
+  eager_model_outputs = []
+  for i in range(len(model.inputs)):
+    eager_model_inputs.append(ins_batch_converted[i])
+  for i in range(len(model.inputs), len(ins_batch_converted)):
+    eager_model_outputs.append(ins_batch_converted[i])
+  outs, loss, loss_metrics = _process_single_batch(
+      eager_model_inputs, eager_model_outputs, model, training=False)
+  if not isinstance(outs, list):
+    outs = [outs]
+  metric_names, metrics_results = _eager_metrics_fn(
+      model, outs, eager_model_outputs)
+  model.metrics_names.append(metric_names)
+  if not isinstance(loss, list):
+    loss = [loss]
+  return loss + loss_metrics + metrics_results
+
+
+def fit_loop(
+    model,
+    ins,
+    out_labels=None,
+    batch_size=None,
+    epochs=100,
+    verbose=1,
+    callbacks=None,
+    val_ins=None,
+    shuffle=True,
+    callback_metrics=None,
+    initial_epoch=0,
+    steps_per_epoch=None,
+    validation_steps=None):
+  """Abstract fit function for `f(ins)`.
+
+  Assume that f returns a list, labeled by out_labels.
+
+  Arguments:
+      model: Instance of the model that is being executed in Eager mode.
+      ins: List of tensors to be fed to `f`
+      out_labels: List of strings, display names of
+          the outputs of `f`
+      batch_size: Integer batch size or None if unknown.
+      epochs: Number of times to iterate over the data
+      verbose: Verbosity mode, 0, 1 or 2
+      callbacks: List of callbacks to be called during training
+      val_ins: List of tensors to be fed to `val_f`
+      shuffle: Whether to shuffle the data at the beginning of each epoch
+      callback_metrics: List of strings, the display names of the metrics
+          passed to the callbacks. They should be the
+          concatenation of list the display names of the outputs of
+           `f` and the list of display names of the outputs of `f_val`.
+      initial_epoch: Epoch at which to start training
+          (useful for resuming a previous training run)
+      steps_per_epoch: Total number of steps (batches of samples)
+          before declaring one epoch finished and starting the
+          next epoch. Ignored with the default value of `None`.
+      validation_steps: Number of steps to run validation for (only if doing
+        validation from data tensors). Ignored with default value of `None`.
+
+  Returns:
+      `History` object.
+
+  Raises:
+    ValueError: In case of invalid argument values.
+  """
+  # Required for Eager mode
+  K.set_learning_phase(True)
+
+  do_validation = False
+  if val_ins:
+    do_validation = True
+    if (verbose and ins and hasattr(ins[0], 'shape') and
+        hasattr(val_ins[0], 'shape')):
+      print('Train on %d samples, validate on %d samples' %
+            (ins[0].shape[0], val_ins[0].shape[0]))
+  if validation_steps:
+    if steps_per_epoch is None:
+      raise ValueError('Can only use `validation_steps` when doing step-wise '
+                       'training, i.e. `steps_per_epoch` must be set.')
+    do_validation = True
+
+  num_train_samples = model._check_num_samples(
+      ins, batch_size, steps_per_epoch, 'steps_per_epoch')
+
+  if num_train_samples is not None:
+    index_array = np.arange(num_train_samples)
+
+  model.history = cbks.History()
+  callbacks = [cbks.BaseLogger()] + (callbacks or []) + [model.history]
+  if verbose:
+    if steps_per_epoch is not None:
+      count_mode = 'steps'
+    else:
+      count_mode = 'samples'
+    callbacks += [cbks.ProgbarLogger(count_mode)]
+  callbacks = cbks.CallbackList(callbacks)
+  out_labels = out_labels or []
+
+  # it's possible to callback a different model than self
+  # (used by Sequential models)
+  if hasattr(model, 'callback_model') and model.callback_model:
+    callback_model = model.callback_model
+  else:
+    callback_model = model
+
+  callbacks.set_model(callback_model)
+
+  callbacks.set_params({
+      'batch_size': batch_size,
+      'epochs': epochs,
+      'steps': steps_per_epoch,
+      'samples': num_train_samples,
+      'verbose': verbose,
+      'do_validation': do_validation,
+      'metrics': callback_metrics or [],
+  })
+  callbacks.on_train_begin()
+  callback_model.stop_training = False
+  for cbk in callbacks:
+    cbk.validation_data = val_ins
+
+  for epoch in range(initial_epoch, epochs):
+    callbacks.on_epoch_begin(epoch)
+    epoch_logs = {}
+    if shuffle == 'batch':
+      index_array = model._batch_shuffle(index_array, batch_size)
+    elif shuffle:
+      np.random.shuffle(index_array)
+
+    batches = _make_batches(num_train_samples, batch_size)
+
+    for batch_index, (batch_start, batch_end) in enumerate(batches):
+      batch_ids = index_array[batch_start:batch_end]
+      try:
+        if isinstance(ins[-1], float):
+          # Do not slice the training phase flag.
+          ins_batch = _slice_arrays(ins[:-1], batch_ids) + [ins[-1]]
+        else:
+          ins_batch = _slice_arrays(ins, batch_ids)
+      except TypeError:
+        raise TypeError('TypeError while preparing batch. '
+                        'If using HDF5 input data, '
+                        'pass shuffle="batch".')
+      batch_logs = {}
+      batch_logs['batch'] = batch_index
+      batch_logs['size'] = len(batch_ids)
+
+      callbacks.on_batch_begin(batch_index, batch_logs)
+
+      ins_batch_converted = []
+      for ib in ins_batch:
+        ins_batch_converted.append(ops.convert_to_tensor(ib, dtype=K.floatx()))
+      eager_model_inputs = []
+      eager_model_outputs = []
+      for i in range(len(model.inputs)):
+        eager_model_inputs.append(ins_batch_converted[i])
+
+      for i in range(len(model.inputs), len(ins_batch_converted)):
+        eager_model_outputs.append(ins_batch_converted[i])
+
+      outs, loss, loss_metrics = _process_single_batch(eager_model_inputs,
+                                                       eager_model_outputs,
+                                                       model)
+
+      if not isinstance(outs, list):
+        outs = [outs]
+
+      for l, o in zip(out_labels, outs):
+        batch_logs[l] = o
+      # Required for Eager mode
+      metrics_names, metrics_results = _eager_metrics_fn(model, outs,
+                                                         eager_model_outputs)
+      batch_logs['loss'] = tensor_util.constant_value(K.mean(loss))
+
+      # TODO(anjalisridhar): Move this to compile to avoid duplicate code.
+      # In graph mode we set the metric names in compile. However in
+      # Eager mode we calculate the metrics for each batch in fit_loop.
+      # We could calculate the metric names and functions in compile.
+      # This would avoid setting the callback parameters separately.
+      # We need to do this for the first iteration alone
+      for m in metrics_names:
+        if m not in callback_metrics:
+          callback_metrics.append(m)
+
+      callbacks.set_params({
+          'batch_size': batch_size,
+          'epochs': epochs,
+          'steps': steps_per_epoch,
+          'samples': num_train_samples,
+          'verbose': verbose,
+          'do_validation': do_validation,
+          'metrics': callback_metrics or [],
+      })
+
+      for k, v in zip(model.metrics_names,
+                      [K.mean(loss)] + loss_metrics + metrics_results):
+        batch_logs[k] = tensor_util.constant_value(v)
+
+      callbacks.on_batch_end(batch_index, batch_logs)
+      if callback_model.stop_training:
+        break
+
+      if batch_index == len(batches) - 1:  # Last batch.
+        if do_validation:
+          val_outs = test_loop(
+              model, val_ins, batch_size=batch_size, verbose=0)
+          if not isinstance(val_outs, list):
+            val_outs = [val_outs]
+          # Same labels assumed.
+          for l, o in zip(out_labels, val_outs):
+            epoch_logs['val_' + l] = o
+    callbacks.on_epoch_end(epoch, epoch_logs)
+    if callback_model.stop_training:
+      break
+  callbacks.on_train_end()
+  return model.history
+
+
+def test_loop(model, ins, batch_size=None, verbose=0, steps=None):
+  """Abstract method to loop over some data in batches.
+
+  Arguments:
+      model: Model instance that is being evaluated in Eager mode.
+      ins: list of tensors to be fed to `f`.
+      batch_size: integer batch size or `None`.
+      verbose: verbosity mode.
+      steps: Total number of steps (batches of samples)
+          before declaring predictions finished.
+          Ignored with the default value of `None`.
+
+  Returns:
+      Scalar loss (if the model has a single output and no metrics)
+      or list of scalars (if the model has multiple outputs
+      and/or metrics). The attribute `model.metrics_names` will give you
+      the display labels for the scalar outputs.
+  """
+  K.set_learning_phase(False)
+  num_samples = model._check_num_samples(ins, batch_size, steps, 'steps')
+  outs = []
+  if verbose == 1:
+    progbar = Progbar(target=num_samples)
+  batches = _make_batches(num_samples, batch_size)
+  index_array = np.arange(num_samples)
+  for batch_index, (batch_start, batch_end) in enumerate(batches):
+    batch_ids = index_array[batch_start:batch_end]
+    if isinstance(ins[-1], float):
+      # Do not slice the training phase flag.
+      ins_batch = _slice_arrays(ins[:-1], batch_ids) + [ins[-1]]
+    else:
+      ins_batch = _slice_arrays(ins, batch_ids)
+
+    ins_batch_converted = []
+    for ib in ins_batch:
+      ins_batch_converted.append(ops.convert_to_tensor(ib, dtype=K.floatx()))
+
+    eager_model_inputs = []
+    eager_model_outputs = []
+    for i in range(len(model.inputs)):
+      eager_model_inputs.append(ins_batch_converted[i])
+
+    for i in range(len(model.inputs), len(ins_batch_converted)):
+      eager_model_outputs.append(ins_batch_converted[i])
+
+    loss_outs, loss, loss_metrics = _model_loss(model, eager_model_inputs,
+                                                eager_model_outputs)
+    _, metrics_results = _eager_metrics_fn(model, loss_outs,
+                                           eager_model_outputs)
+    batch_outs = []
+    for _, v in zip(model.metrics_names,
+                    [K.mean(loss)] + loss_metrics + metrics_results):
+      batch_outs.append(tensor_util.constant_value(v))
+
+    if isinstance(batch_outs, list):
+      if batch_index == 0:
+        for batch_out in enumerate(batch_outs):
+          outs.append(0.)
+      for i, batch_out in enumerate(batch_outs):
+        outs[i] += batch_out * len(batch_ids)
+    else:
+      if batch_index == 0:
+        outs.append(0.)
+      outs[0] += batch_outs * len(batch_ids)
+
+    if verbose == 1:
+      progbar.update(batch_end)
+  for i in range(len(outs)):
+    outs[i] /= num_samples
+  if len(outs) == 1:
+    return outs[0]
+  return outs
+
+
+def predict_loop(model, ins, batch_size=32, verbose=0, steps=None):
+  """Abstract method to loop over some data in batches.
+
+  Arguments:
+      model:
+      ins: list of tensors to be fed to `f`.
+      batch_size: integer batch size.
+      verbose: verbosity mode.
+      steps: Total number of steps (batches of samples)
+          before declaring `_predict_loop` finished.
+          Ignored with the default value of `None`.
+
+  Returns:
+      Array of predictions (if the model has a single output)
+      or list of arrays of predictions
+      (if the model has multiple outputs).
+  """
+  K.set_learning_phase(False)
+  num_samples = model._check_num_samples(ins, batch_size, steps, 'steps')
+  if verbose == 1:
+    if steps is not None:
+      progbar = Progbar(target=steps)
+    else:
+      progbar = Progbar(target=num_samples)
+
+  outs = []
+  batches = _make_batches(num_samples, batch_size)
+  index_array = np.arange(num_samples)
+  for batch_index, (batch_start, batch_end) in enumerate(batches):
+    batch_ids = index_array[batch_start:batch_end]
+    if ins and isinstance(ins[-1], float):
+      # Do not slice the training phase flag.
+      ins_batch = _slice_arrays(ins[:-1], batch_ids) + [ins[-1]]
+    else:
+      ins_batch = _slice_arrays(ins, batch_ids)
+
+    ins_batch_converted = []
+    for ib in ins_batch:
+      ins_batch_converted.append(ops.convert_to_tensor(ib, dtype=K.floatx()))
+
+    eager_model_inputs = []
+    for i in range(len(model.inputs)):
+      eager_model_inputs.append(ins_batch_converted[i])
+
+    batch_outs = model(eager_model_inputs)
+
+    if not isinstance(batch_outs, list):
+      batch_outs = [batch_outs]
+    if batch_index == 0:
+      # Pre-allocate the results arrays.
+      for batch_out in batch_outs:
+        dims = batch_out.shape[1:].dims
+        dims_list = [d.value for d in dims]
+        shape = (num_samples,) + tuple(dims_list)
+        outs.append(np.zeros(shape, dtype=batch_out.dtype.as_numpy_dtype))
+    for i, batch_out in enumerate(batch_outs):
+      outs[i][batch_start:batch_end] = batch_out
+    if verbose == 1:
+      progbar.update(batch_end)
+  if len(outs) == 1:
+    return outs[0]
+  return outs
diff --git a/tensorflow/python/keras/_impl/keras/engine/training_eager_test.py b/tensorflow/python/keras/_impl/keras/engine/training_eager_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..81e2f7a5145a586f6a4cc34f54033723fae6a6e9
--- /dev/null
+++ b/tensorflow/python/keras/_impl/keras/engine/training_eager_test.py
@@ -0,0 +1,755 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for training routines."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import numpy as np
+
+from tensorflow.python.framework import ops
+from tensorflow.python.keras._impl import keras
+from tensorflow.python.keras._impl.keras import testing_utils
+from tensorflow.python.platform import test
+from tensorflow.python.training.rmsprop import RMSPropOptimizer
+
+
+class TrainingTest(test.TestCase):
+
+  def test_fit_on_arrays(self):
+    a = keras.layers.Input(shape=(3,), name='input_a')
+    b = keras.layers.Input(shape=(3,), name='input_b')
+
+    dense = keras.layers.Dense(4, name='dense')
+    c = dense(a)
+    d = dense(b)
+    e = keras.layers.Dropout(0.5, name='dropout')(c)
+
+    model = keras.models.Model([a, b], [d, e])
+
+    optimizer = RMSPropOptimizer(learning_rate=0.001)
+    loss = 'mse'
+    loss_weights = [1., 0.5]
+    metrics = ['mae']
+    model.compile(optimizer, loss, metrics=metrics, loss_weights=loss_weights)
+
+    input_a_np = np.random.random((10, 3))
+    input_b_np = np.random.random((10, 3))
+
+    output_d_np = np.random.random((10, 4))
+    output_e_np = np.random.random((10, 4))
+
+    # Test fit at different verbosity
+    model.fit(
+        [input_a_np, input_b_np], [output_d_np, output_e_np],
+        epochs=1,
+        batch_size=5,
+        verbose=0)
+    model.fit(
+        [input_a_np, input_b_np], [output_d_np, output_e_np],
+        epochs=1,
+        batch_size=5,
+        verbose=1)
+    model.fit(
+        [input_a_np, input_b_np], [output_d_np, output_e_np],
+        epochs=2,
+        batch_size=5,
+        verbose=2)
+
+    # Test with validation data
+    model.fit(
+        [input_a_np, input_b_np], [output_d_np, output_e_np],
+        validation_data=([input_a_np, input_b_np], [output_d_np,
+                                                    output_e_np]),
+        epochs=1,
+        batch_size=5,
+        verbose=0)
+    model.fit(
+        [input_a_np, input_b_np], [output_d_np, output_e_np],
+        validation_data=([input_a_np, input_b_np], [output_d_np,
+                                                    output_e_np]),
+        epochs=2,
+        batch_size=5,
+        verbose=1)
+    model.fit(
+        [input_a_np, input_b_np], [output_d_np, output_e_np],
+        validation_data=([input_a_np, input_b_np], [output_d_np,
+                                                    output_e_np]),
+        epochs=2,
+        batch_size=5,
+        verbose=2)
+    model.train_on_batch([input_a_np, input_b_np], [output_d_np, output_e_np])
+
+  # Test with validation split
+    model.fit(
+        [input_a_np, input_b_np], [output_d_np, output_e_np],
+        epochs=2,
+        batch_size=5,
+        verbose=0,
+        validation_split=0.2)
+
+    # Test with dictionary inputs
+    model.fit(
+        {
+            'input_a': input_a_np,
+            'input_b': input_b_np
+        }, {'dense': output_d_np,
+            'dropout': output_e_np},
+        epochs=1,
+        batch_size=5,
+        verbose=0)
+    model.fit(
+        {
+            'input_a': input_a_np,
+            'input_b': input_b_np
+        }, {'dense': output_d_np,
+            'dropout': output_e_np},
+        epochs=1,
+        batch_size=5,
+        verbose=1)
+    model.fit(
+        {
+            'input_a': input_a_np,
+            'input_b': input_b_np
+        }, {'dense': output_d_np,
+            'dropout': output_e_np},
+        validation_data=({'input_a': input_a_np,
+                          'input_b': input_b_np
+                         },
+                         {
+                             'dense': output_d_np,
+                             'dropout': output_e_np
+                         }),
+        epochs=1,
+        batch_size=5,
+        verbose=0)
+    model.train_on_batch({
+        'input_a': input_a_np,
+        'input_b': input_b_np
+    }, {'dense': output_d_np,
+        'dropout': output_e_np})
+    # Test with lists for loss, metrics
+    loss = ['mae', 'mse']
+    metrics = ['acc', 'mae']
+    model.compile(optimizer, loss, metrics=metrics)
+    model.fit(
+        [input_a_np, input_b_np], [output_d_np, output_e_np],
+        epochs=1,
+        batch_size=5,
+        verbose=0)
+
+    # Test with dictionaries for loss, metrics, loss weights
+    loss = {'dense': 'mse', 'dropout': 'mae'}
+    loss_weights = {'dense': 1., 'dropout': 0.5}
+    metrics = {'dense': 'mse', 'dropout': 'mae'}
+    model.compile(optimizer, loss, metrics=metrics, loss_weights=loss_weights)
+    model.fit(
+        [input_a_np, input_b_np], [output_d_np, output_e_np],
+        epochs=1,
+        batch_size=5,
+        verbose=0)
+
+    # Invalid use cases
+    with self.assertRaises(AttributeError):
+      model.fit(
+          [input_a_np, input_b_np], [output_d_np, output_e_np],
+          epochs=1,
+          validation_data=([input_a_np, input_b_np], 0, 0),
+          verbose=0)
+    with self.assertRaises(ValueError):
+      model.train_on_batch({'input_a': input_a_np},
+                           [output_d_np, output_e_np])
+    with self.assertRaises(ValueError):
+      model.train_on_batch([input_a_np], [output_d_np, output_e_np])
+    with self.assertRaises(AttributeError):
+      model.train_on_batch(1, [output_d_np, output_e_np])
+    with self.assertRaises(ValueError):
+      model.train_on_batch(input_a_np, [output_d_np, output_e_np])
+    with self.assertRaises(ValueError):
+      bad_input = np.random.random((11, 3))
+      model.train_on_batch([bad_input, input_b_np],
+                           [output_d_np, output_e_np])
+    with self.assertRaises(ValueError):
+      bad_target = np.random.random((11, 4))
+      model.train_on_batch([input_a_np, input_b_np],
+                           [bad_target, output_e_np])
+
+    # Build single-input model
+    x = keras.layers.Input(shape=(3,), name='input_a')
+    y = keras.layers.Dense(4)(x)
+    model = keras.models.Model(x, y)
+    model.compile(optimizer=RMSPropOptimizer(learning_rate=0.001), loss='mse')
+    # This will work
+    model.fit([input_a_np], output_d_np, epochs=1)
+    with self.assertRaises(ValueError):
+      model.fit([input_a_np, input_a_np], output_d_np, epochs=1)
+
+  def test_evaluate_predict_on_arrays(self):
+    a = keras.layers.Input(shape=(3,), name='input_a')
+    b = keras.layers.Input(shape=(3,), name='input_b')
+
+    dense = keras.layers.Dense(4, name='dense')
+    c = dense(a)
+    d = dense(b)
+    e = keras.layers.Dropout(0.5, name='dropout')(c)
+
+    model = keras.models.Model([a, b], [d, e])
+
+    optimizer = RMSPropOptimizer(learning_rate=0.001)
+    loss = 'mse'
+    loss_weights = [1., 0.5]
+    metrics = ['mae']
+    model.compile(
+        optimizer,
+        loss,
+        metrics=metrics,
+        loss_weights=loss_weights,
+        sample_weight_mode=None)
+
+    input_a_np = np.random.random((10, 3))
+    input_b_np = np.random.random((10, 3))
+
+    output_d_np = np.random.random((10, 4))
+    output_e_np = np.random.random((10, 4))
+
+    # Test evaluate at different verbosity
+    out = model.evaluate(
+        [input_a_np, input_b_np], [output_d_np, output_e_np],
+        batch_size=5,
+        verbose=0)
+    self.assertEqual(len(out), 5)
+    out = model.evaluate(
+        [input_a_np, input_b_np], [output_d_np, output_e_np],
+        batch_size=5,
+        verbose=1)
+    self.assertEqual(len(out), 5)
+    out = model.evaluate(
+        [input_a_np, input_b_np], [output_d_np, output_e_np],
+        batch_size=5,
+        verbose=2)
+    self.assertEqual(len(out), 5)
+    out = model.test_on_batch([input_a_np, input_b_np],
+                              [output_d_np, output_e_np])
+    self.assertEqual(len(out), 5)
+
+    # Test evaluate with dictionary inputs
+    model.evaluate(
+        {
+            'input_a': input_a_np,
+            'input_b': input_b_np
+        }, {'dense': output_d_np,
+            'dropout': output_e_np},
+        batch_size=5,
+        verbose=0)
+    model.evaluate(
+        {
+            'input_a': input_a_np,
+            'input_b': input_b_np
+        }, {'dense': output_d_np,
+            'dropout': output_e_np},
+        batch_size=5,
+        verbose=1)
+
+    # Test predict
+    out = model.predict([input_a_np, input_b_np], batch_size=5)
+    self.assertEqual(len(out), 2)
+    out = model.predict({'input_a': input_a_np, 'input_b': input_b_np})
+    self.assertEqual(len(out), 2)
+    out = model.predict_on_batch({
+        'input_a': input_a_np,
+        'input_b': input_b_np
+    })
+    self.assertEqual(len(out), 2)
+
+  def test_invalid_loss_or_metrics(self):
+    num_classes = 5
+    train_samples = 1000
+    test_samples = 1000
+    input_dim = 5
+
+    model = keras.models.Sequential()
+    model.add(keras.layers.Dense(10, input_shape=(input_dim,)))
+    model.add(keras.layers.Activation('relu'))
+    model.add(keras.layers.Dense(num_classes))
+    model.add(keras.layers.Activation('softmax'))
+    model.compile(loss='categorical_crossentropy',
+                  optimizer=RMSPropOptimizer(learning_rate=0.001))
+    np.random.seed(1337)
+
+    (x_train, y_train), (_, _) = testing_utils.get_test_data(
+        train_samples=train_samples,
+        test_samples=test_samples,
+        input_shape=(input_dim,),
+        num_classes=num_classes)
+
+    with self.assertRaises(ValueError):
+      model.fit(x_train, np.concatenate([y_train, y_train], axis=-1))
+
+    with self.assertRaises(TypeError):
+      model.compile(loss='categorical_crossentropy',
+                    optimizer=RMSPropOptimizer(learning_rate=0.001),
+                    metrics=set(0))
+
+    with self.assertRaises(ValueError):
+      model.compile(loss=None,
+                    optimizer='rms')
+
+
+class LossWeightingTest(test.TestCase):
+
+  def test_class_weights(self):
+    num_classes = 5
+    batch_size = 5
+    epochs = 5
+    weighted_class = 3
+    train_samples = 3000
+    test_samples = 3000
+    input_dim = 5
+
+    model = keras.models.Sequential()
+    model.add(keras.layers.Dense(10, input_shape=(input_dim,)))
+    model.add(keras.layers.Activation('relu'))
+    model.add(keras.layers.Dense(num_classes))
+    model.add(keras.layers.Activation('softmax'))
+    model.compile(loss='categorical_crossentropy',
+                  optimizer=RMSPropOptimizer(learning_rate=0.001))
+
+    np.random.seed(1337)
+    (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
+        train_samples=train_samples,
+        test_samples=test_samples,
+        input_shape=(input_dim,),
+        num_classes=num_classes)
+    int_y_test = y_test.copy()
+    int_y_train = y_train.copy()
+    # convert class vectors to binary class matrices
+    y_train = keras.utils.to_categorical(y_train, num_classes)
+    y_test = keras.utils.to_categorical(y_test, num_classes)
+    test_ids = np.where(int_y_test == np.array(weighted_class))[0]
+
+    class_weight = dict([(i, 1.) for i in range(num_classes)])
+    class_weight[weighted_class] = 2.
+
+    sample_weight = np.ones((y_train.shape[0]))
+    sample_weight[int_y_train == weighted_class] = 2.
+
+    model.fit(
+        x_train,
+        y_train,
+        batch_size=batch_size,
+        epochs=epochs // 3,
+        verbose=0,
+        class_weight=class_weight,
+        validation_data=(x_train, y_train, sample_weight))
+    model.fit(
+        x_train,
+        y_train,
+        batch_size=batch_size,
+        epochs=epochs // 2,
+        verbose=0,
+        class_weight=class_weight)
+    model.fit(
+        x_train,
+        y_train,
+        batch_size=batch_size,
+        epochs=epochs // 2,
+        verbose=0,
+        class_weight=class_weight,
+        validation_split=0.1)
+
+    model.train_on_batch(
+        x_train[:batch_size], y_train[:batch_size], class_weight=class_weight)
+    ref_score = model.evaluate(x_test, y_test, verbose=0)
+    score = model.evaluate(
+        x_test[test_ids, :], y_test[test_ids, :], verbose=0)
+    self.assertLess(score, ref_score)
+
+  def test_sample_weights(self):
+    num_classes = 5
+    batch_size = 5
+    epochs = 5
+    weighted_class = 3
+    train_samples = 3000
+    test_samples = 3000
+    input_dim = 5
+
+    model = keras.models.Sequential()
+    model.add(keras.layers.Dense(10, input_shape=(input_dim,)))
+    model.add(keras.layers.Activation('relu'))
+    model.add(keras.layers.Dense(num_classes))
+    model.add(keras.layers.Activation('softmax'))
+    model.compile(loss='categorical_crossentropy',
+                  optimizer=RMSPropOptimizer(learning_rate=0.001))
+
+    np.random.seed(43)
+    (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
+        train_samples=train_samples,
+        test_samples=test_samples,
+        input_shape=(input_dim,),
+        num_classes=num_classes)
+    int_y_test = y_test.copy()
+    int_y_train = y_train.copy()
+    # convert class vectors to binary class matrices
+    y_train = keras.utils.to_categorical(y_train, num_classes)
+    y_test = keras.utils.to_categorical(y_test, num_classes)
+    test_ids = np.where(int_y_test == np.array(weighted_class))[0]
+
+    class_weight = dict([(i, 1.) for i in range(num_classes)])
+    class_weight[weighted_class] = 2.
+
+    sample_weight = np.ones((y_train.shape[0]))
+    sample_weight[int_y_train == weighted_class] = 2.
+
+    model.fit(
+        x_train,
+        y_train,
+        batch_size=batch_size,
+        epochs=epochs // 3,
+        verbose=0,
+        sample_weight=sample_weight)
+    model.fit(
+        x_train,
+        y_train,
+        batch_size=batch_size,
+        epochs=epochs // 3,
+        verbose=0,
+        sample_weight=sample_weight,
+        validation_split=0.1)
+    model.train_on_batch(
+        x_train[:batch_size],
+        y_train[:batch_size],
+        sample_weight=sample_weight[:batch_size])
+    model.test_on_batch(
+        x_train[:batch_size],
+        y_train[:batch_size],
+        sample_weight=sample_weight[:batch_size])
+
+  def test_temporal_sample_weights(self):
+    num_classes = 5
+    weighted_class = 3
+    train_samples = 1000
+    test_samples = 1000
+    input_dim = 5
+    timesteps = 3
+
+    model = keras.models.Sequential()
+    model.add(
+        keras.layers.TimeDistributed(
+            keras.layers.Dense(num_classes),
+            input_shape=(timesteps, input_dim)))
+    model.add(keras.layers.Activation('softmax'))
+
+    np.random.seed(1337)
+    (_, y_train), _ = testing_utils.get_test_data(
+        train_samples=train_samples,
+        test_samples=test_samples,
+        input_shape=(input_dim,),
+        num_classes=num_classes)
+    int_y_train = y_train.copy()
+    # convert class vectors to binary class matrices
+    y_train = keras.utils.to_categorical(y_train, num_classes)
+
+    class_weight = dict([(i, 1.) for i in range(num_classes)])
+    class_weight[weighted_class] = 2.
+
+    sample_weight = np.ones((y_train.shape[0]))
+    sample_weight[int_y_train == weighted_class] = 2.
+    with self.assertRaises(ValueError):
+      model.compile(
+          loss='binary_crossentropy',
+          optimizer=RMSPropOptimizer(learning_rate=0.001),
+          sample_weight_mode='temporal')
+
+  def test_class_weight_invalid_use_case(self):
+    num_classes = 5
+    train_samples = 1000
+    test_samples = 1000
+    input_dim = 5
+    timesteps = 3
+
+    model = keras.models.Sequential()
+    model.add(
+        keras.layers.TimeDistributed(
+            keras.layers.Dense(num_classes),
+            input_shape=(timesteps, input_dim)))
+    model.add(keras.layers.Activation('softmax'))
+    model.compile(
+        loss='binary_crossentropy',
+        optimizer=RMSPropOptimizer(learning_rate=0.001))
+
+    (x_train, y_train), _ = testing_utils.get_test_data(
+        train_samples=train_samples,
+        test_samples=test_samples,
+        input_shape=(input_dim,),
+        num_classes=num_classes)
+    # convert class vectors to binary class matrices
+    y_train = keras.utils.to_categorical(y_train, num_classes)
+    class_weight = dict([(i, 1.) for i in range(num_classes)])
+
+    del class_weight[1]
+    with self.assertRaises(ValueError):
+      model.fit(x_train, y_train,
+                epochs=0, verbose=0, class_weight=class_weight)
+
+    with self.assertRaises(ValueError):
+      model.compile(
+          loss='binary_crossentropy',
+          optimizer=RMSPropOptimizer(learning_rate=0.001),
+          sample_weight_mode=[])
+
+    # Build multi-output model
+    x = keras.Input((3,))
+    y1 = keras.layers.Dense(4, name='1')(x)
+    y2 = keras.layers.Dense(4, name='2')(x)
+    model = keras.models.Model(x, [y1, y2])
+    model.compile(optimizer=RMSPropOptimizer(learning_rate=0.001), loss='mse')
+    x_np = np.random.random((10, 3))
+    y_np = np.random.random((10, 4))
+    w_np = np.random.random((10,))
+    # This will work
+    model.fit(x_np, [y_np, y_np], epochs=1, sample_weight={'1': w_np})
+    # These will not
+    with self.assertRaises(ValueError):
+      model.fit(x_np, [y_np, y_np], epochs=1, sample_weight=[w_np])
+    with self.assertRaises(TypeError):
+      model.fit(x_np, [y_np, y_np], epochs=1, sample_weight=w_np)
+    with self.assertRaises(ValueError):
+      bad_w_np = np.random.random((11,))
+      model.fit(x_np, [y_np, y_np], epochs=1, sample_weight={'1': bad_w_np})
+    with self.assertRaises(ValueError):
+      bad_w_np = np.random.random((10, 2))
+      model.fit(x_np, [y_np, y_np], epochs=1, sample_weight={'1': bad_w_np})
+    with self.assertRaises(ValueError):
+      bad_w_np = np.random.random((10, 2, 2))
+      model.fit(x_np, [y_np, y_np], epochs=1, sample_weight={'1': bad_w_np})
+
+
+class TestDynamicTrainability(test.TestCase):
+
+  def test_trainable_warning(self):
+    x = np.random.random((5, 3))
+    y = np.random.random((5, 2))
+    model = keras.models.Sequential()
+    model.add(keras.layers.Dense(2, input_dim=3))
+    model.trainable = False
+    model.compile(RMSPropOptimizer(learning_rate=0.001), 'mse')
+    model.trainable = True
+    with self.assertRaises(ValueError):
+      model.train_on_batch(x, y)
+
+  def test_trainable_argument(self):
+    x = np.random.random((5, 3))
+    y = np.random.random((5, 2))
+
+    model = keras.models.Sequential()
+    model.add(keras.layers.Dense(2, input_dim=3, trainable=False))
+    model.compile(RMSPropOptimizer(learning_rate=0.001), 'mse')
+    out = model.predict(x)
+    with self.assertRaises(ValueError):
+      model.train_on_batch(x, y)
+    out_2 = model.predict(x)
+    self.assertAllClose(out, out_2)
+
+    # test with nesting
+    inputs = keras.layers.Input(shape=(3,))
+    output = model(inputs)
+    model = keras.models.Model(inputs, output)
+    model.compile(RMSPropOptimizer(learning_rate=0.001), 'mse')
+    out = model.predict(x)
+    with self.assertRaises(ValueError):
+      model.train_on_batch(x, y)
+    out_2 = model.predict(x)
+    self.assertAllClose(out, out_2)
+
+  def test_layer_trainability_switch(self):
+    # with constructor argument, in Sequential
+    model = keras.models.Sequential()
+    model.add(keras.layers.Dense(2, trainable=False, input_dim=1))
+    self.assertListEqual(model.trainable_weights, [])
+
+    # by setting the `trainable` argument, in Sequential
+    model = keras.models.Sequential()
+    layer = keras.layers.Dense(2, input_dim=1)
+    model.add(layer)
+    self.assertListEqual(model.trainable_weights, layer.trainable_weights)
+    layer.trainable = False
+    self.assertListEqual(model.trainable_weights, [])
+
+    # with constructor argument, in Model
+    x = keras.layers.Input(shape=(1,))
+    y = keras.layers.Dense(2, trainable=False)(x)
+    model = keras.models.Model(x, y)
+    self.assertListEqual(model.trainable_weights, [])
+
+    # by setting the `trainable` argument, in Model
+    x = keras.layers.Input(shape=(1,))
+    layer = keras.layers.Dense(2)
+    y = layer(x)
+    model = keras.models.Model(x, y)
+    self.assertListEqual(model.trainable_weights, layer.trainable_weights)
+    layer.trainable = False
+    self.assertListEqual(model.trainable_weights, [])
+
+  def test_model_trainability_switch(self):
+    # a non-trainable model has no trainable weights
+    x = keras.layers.Input(shape=(1,))
+    y = keras.layers.Dense(2)(x)
+    model = keras.models.Model(x, y)
+    model.trainable = False
+    self.assertListEqual(model.trainable_weights, [])
+
+    # same for Sequential
+    model = keras.models.Sequential()
+    model.add(keras.layers.Dense(2, input_dim=1))
+    model.trainable = False
+    self.assertListEqual(model.trainable_weights, [])
+
+  def test_nested_model_trainability(self):
+
+    # a Sequential inside a Model
+    inner_model = keras.models.Sequential()
+    inner_model.add(keras.layers.Dense(2, input_dim=1))
+
+    x = keras.layers.Input(shape=(1,))
+    y = inner_model(x)
+    outer_model = keras.models.Model(x, y)
+    self.assertListEqual(outer_model.trainable_weights,
+                         inner_model.trainable_weights)
+    inner_model.trainable = False
+    self.assertListEqual(outer_model.trainable_weights, [])
+    inner_model.trainable = True
+    inner_model.layers[-1].trainable = False
+    self.assertListEqual(outer_model.trainable_weights, [])
+
+    # a Sequential inside a Sequential
+    inner_model = keras.models.Sequential()
+    inner_model.add(keras.layers.Dense(2, input_dim=1))
+    outer_model = keras.models.Sequential()
+    outer_model.add(inner_model)
+    self.assertListEqual(outer_model.trainable_weights,
+                         inner_model.trainable_weights)
+    inner_model.trainable = False
+    self.assertListEqual(outer_model.trainable_weights, [])
+    inner_model.trainable = True
+    inner_model.layers[-1].trainable = False
+    self.assertListEqual(outer_model.trainable_weights, [])
+
+    # a Model inside a Model
+    x = keras.layers.Input(shape=(1,))
+    y = keras.layers.Dense(2)(x)
+    inner_model = keras.models.Model(x, y)
+    x = keras.layers.Input(shape=(1,))
+    y = inner_model(x)
+    outer_model = keras.models.Model(x, y)
+    self.assertListEqual(outer_model.trainable_weights,
+                         inner_model.trainable_weights)
+    inner_model.trainable = False
+    self.assertListEqual(outer_model.trainable_weights, [])
+    inner_model.trainable = True
+    inner_model.layers[-1].trainable = False
+    self.assertListEqual(outer_model.trainable_weights, [])
+
+    # a Model inside a Sequential
+    x = keras.layers.Input(shape=(1,))
+    y = keras.layers.Dense(2)(x)
+    inner_model = keras.models.Model(x, y)
+    outer_model = keras.models.Sequential()
+    outer_model.add(inner_model)
+    self.assertListEqual(outer_model.trainable_weights,
+                         inner_model.trainable_weights)
+    inner_model.trainable = False
+    self.assertListEqual(outer_model.trainable_weights, [])
+    inner_model.trainable = True
+    inner_model.layers[-1].trainable = False
+    self.assertListEqual(outer_model.trainable_weights, [])
+
+
+class TestTrainingUtils(test.TestCase):
+
+  def test_check_array_lengths(self):
+    keras.engine.training._check_array_lengths(None, None, None)
+    a_np = np.random.random((4, 3, 3))
+    keras.engine.training._check_array_lengths(a_np, a_np, a_np)
+    keras.engine.training._check_array_lengths(
+        [a_np, a_np], [a_np, a_np], [a_np, a_np])
+    keras.engine.training._check_array_lengths([None], [None], [None])
+
+    b_np = np.random.random((3, 4))
+    with self.assertRaises(ValueError):
+      keras.engine.training._check_array_lengths(a_np, None, None)
+    with self.assertRaises(ValueError):
+      keras.engine.training._check_array_lengths(a_np, a_np, None)
+    with self.assertRaises(ValueError):
+      keras.engine.training._check_array_lengths([a_np], [None], None)
+    with self.assertRaises(ValueError):
+      keras.engine.training._check_array_lengths([a_np], [b_np], None)
+    with self.assertRaises(ValueError):
+      keras.engine.training._check_array_lengths([a_np], None, [b_np])
+
+  def test_slice_arrays(self):
+    input_a = np.random.random((10, 3))
+    keras.engine.training._slice_arrays(None)
+    keras.engine.training._slice_arrays(input_a, 0)
+    keras.engine.training._slice_arrays(input_a, 0, 1)
+    keras.engine.training._slice_arrays(input_a, stop=2)
+    input_a = [None, [1, 1], None, [1, 1]]
+    keras.engine.training._slice_arrays(input_a, 0)
+    keras.engine.training._slice_arrays(input_a, 0, 1)
+    keras.engine.training._slice_arrays(input_a, stop=2)
+    input_a = [None]
+    keras.engine.training._slice_arrays(input_a, 0)
+    keras.engine.training._slice_arrays(input_a, 0, 1)
+    keras.engine.training._slice_arrays(input_a, stop=2)
+    input_a = None
+    keras.engine.training._slice_arrays(input_a, 0)
+    keras.engine.training._slice_arrays(input_a, 0, 1)
+    keras.engine.training._slice_arrays(input_a, stop=2)
+
+  def test_fit_with_BatchNorm(self):
+    model = keras.models.Sequential()
+    model.add(keras.layers.Dense(10, input_dim=4))
+    model.add(keras.layers.BatchNormalization())
+    model.add(keras.layers.Activation('tanh'))
+    model.add(keras.layers.Dropout(0.2))
+
+    input_a_np = np.random.random((10, 4))
+    output_b_np = np.random.random((10, 10))
+
+    model.compile(loss='binary_crossentropy', optimizer=RMSPropOptimizer(0.001))
+    model.fit(input_a_np, output_b_np, epochs=1, batch_size=5, verbose=0)
+
+  def test_fit_with_regularization(self):
+    model = keras.models.Sequential()
+    with self.assertRaises(ValueError):
+      model.add(
+          keras.layers.Dense(4, input_dim=3,
+                             kernel_regularizer=keras.regularizers.l2(0.01),
+                             activity_regularizer=keras.regularizers.l1(0.01)))
+
+
+if __name__ == '__main__':
+  # Bazel sets these environment variables to very long paths.
+  # Tempfile uses them to create long paths, and in turn multiprocessing
+  # library tries to create sockets named after paths. Delete whatever bazel
+  # writes to these to avoid tests failing due to socket addresses being too
+  # long.
+  for var in ('TMPDIR', 'TMP', 'TEMP'):
+    if var in os.environ:
+      del os.environ[var]
+
+  ops.enable_eager_execution()
+  test.main()
diff --git a/tensorflow/python/keras/_impl/keras/engine/training_test.py b/tensorflow/python/keras/_impl/keras/engine/training_test.py
index 7650bfb6e80aa581f7c14f3c693106bcd6e73740..b380238e4e2bb3bccbfc5efdc0db213d86910fe5 100644
--- a/tensorflow/python/keras/_impl/keras/engine/training_test.py
+++ b/tensorflow/python/keras/_impl/keras/engine/training_test.py
@@ -28,6 +28,11 @@ from tensorflow.python.keras._impl.keras import testing_utils
 from tensorflow.python.keras._impl.keras.engine.training import _weighted_masked_objective
 from tensorflow.python.platform import test
 
+try:
+  import scipy.sparse as scipy_sparse  # pylint: disable=g-import-not-at-top
+except ImportError:
+  scipy_sparse = None
+
 
 class TrainingTest(test.TestCase):
 
@@ -73,6 +78,14 @@ class TrainingTest(test.TestCase):
           verbose=2)
       model.train_on_batch([input_a_np, input_b_np], [output_d_np, output_e_np])
 
+      # Test model with input data as a list of lists
+      model.fit(
+          [np.ndarray.tolist(input_a_np), np.ndarray.tolist(input_b_np)],
+          [output_d_np, output_e_np],
+          epochs=2,
+          batch_size=5,
+          verbose=2)
+
       # Test with validation data
       model.fit(
           [input_a_np, input_b_np], [output_d_np, output_e_np],
@@ -169,7 +182,7 @@ class TrainingTest(test.TestCase):
       with self.assertRaises(ValueError):
         model.train_on_batch({'input_a': input_a_np},
                              [output_d_np, output_e_np])
-      with self.assertRaises(TypeError):
+      with self.assertRaises(AttributeError):
         model.fit(
             [input_a_np, input_b_np], [output_d_np, output_e_np],
             epochs=1,
@@ -177,7 +190,7 @@ class TrainingTest(test.TestCase):
             verbose=0)
       with self.assertRaises(ValueError):
         model.train_on_batch([input_a_np], [output_d_np, output_e_np])
-      with self.assertRaises(TypeError):
+      with self.assertRaises(AttributeError):
         model.train_on_batch(1, [output_d_np, output_e_np])
       with self.assertRaises(ValueError):
         model.train_on_batch(input_a_np, [output_d_np, output_e_np])
@@ -200,6 +213,16 @@ class TrainingTest(test.TestCase):
       with self.assertRaises(ValueError):
         model.fit([input_a_np, input_a_np], output_d_np, epochs=1)
 
+      # Test model on a list of floats
+      input_a_np = np.random.random((10, 3))
+      input_b_np = np.random.random((10, 4))
+
+      model.fit([np.ndarray.tolist(input_a_np)],
+                [np.ndarray.tolist(input_b_np)],
+                epochs=2,
+                batch_size=5,
+                verbose=2)
+
   def test_evaluate_predict_on_arrays(self):
     with self.test_session():
       a = keras.layers.Input(shape=(3,), name='input_a')
@@ -312,6 +335,63 @@ class TrainingTest(test.TestCase):
         model.compile(loss=None,
                       optimizer='rmsprop')
 
+  def test_training_on_sparse_data_with_dense_placeholders(self):
+    if scipy_sparse is None:
+      return
+
+    test_inputs = [
+        scipy_sparse.random(6, 3, density=0.25).tocsr() for _ in range(2)]
+    test_outputs = [
+        scipy_sparse.random(6, i, density=0.25).tocsr() for i in range(3, 5)]
+    in1 = keras.layers.Input(shape=(3,))
+    in2 = keras.layers.Input(shape=(3,))
+    out1 = keras.layers.Dropout(0.5, name='dropout')(in1)
+    out2 = keras.layers.Dense(4, name='dense_1')(in2)
+    model = keras.Model([in1, in2], [out1, out2])
+    model.predict(test_inputs, batch_size=2)
+    model.compile('rmsprop', 'mse')
+    model.fit(test_inputs, test_outputs,
+              epochs=1, batch_size=2, validation_split=0.5)
+    model.evaluate(test_inputs, test_outputs, batch_size=2)
+
+  def test_that_trainable_disables_updates(self):
+    val_a = np.random.random((10, 4))
+    val_out = np.random.random((10, 4))
+
+    with self.test_session():
+      a = keras.layers.Input(shape=(4,))
+      layer = keras.layers.BatchNormalization(input_shape=(4,))
+      b = layer(a)
+      model = keras.Model(a, b)
+
+      model.trainable = False
+      assert not model.updates
+
+      model.compile('sgd', 'mse')
+      assert not model.updates
+
+      x1 = model.predict(val_a)
+      model.train_on_batch(val_a, val_out)
+      x2 = model.predict(val_a)
+      self.assertAllClose(x1, x2, atol=1e-7)
+
+      model.trainable = True
+      model.compile('sgd', 'mse')
+      assert model.updates
+
+      model.train_on_batch(val_a, val_out)
+      x2 = model.predict(val_a)
+      assert np.abs(np.sum(x1 - x2)) > 1e-5
+
+      layer.trainable = False
+      model.compile('sgd', 'mse')
+      assert not model.updates
+
+      x1 = model.predict(val_a)
+      model.train_on_batch(val_a, val_out)
+      x2 = model.predict(val_a)
+      self.assertAllClose(x1, x2, atol=1e-7)
+
 
 class LossWeightingTest(test.TestCase):
 
@@ -869,25 +949,6 @@ class TestGeneratorMethods(test.TestCase):
                                  use_multiprocessing=False,
                                  workers=0)
 
-        # Test legacy API
-        model.fit_generator(custom_generator(),
-                            steps_per_epoch=5,
-                            epochs=1,
-                            verbose=1,
-                            max_q_size=10,
-                            workers=4,
-                            pickle_safe=True)
-        model.predict_generator(custom_generator(),
-                                steps=5,
-                                max_q_size=10,
-                                workers=2,
-                                pickle_safe=True)
-        model.evaluate_generator(custom_generator(),
-                                 steps=5,
-                                 max_q_size=10,
-                                 workers=2,
-                                 pickle_safe=True)
-
   def test_generator_methods_with_sample_weights(self):
     arr_data = np.random.random((50, 2))
     arr_labels = np.random.random((50,))
@@ -960,7 +1021,7 @@ class TestGeneratorMethods(test.TestCase):
                             use_multiprocessing=False,
                             validation_data=custom_generator(),
                             validation_steps=10)
-      with self.assertRaises(TypeError):
+      with self.assertRaises(AttributeError):
         model.predict_generator(custom_generator(),
                                 steps=5,
                                 max_queue_size=10,
diff --git a/tensorflow/python/keras/_impl/keras/layers/advanced_activations.py b/tensorflow/python/keras/_impl/keras/layers/advanced_activations.py
index e4b9afd38aa21924693f32b5d0fdf64a97019bce..ffbf77c4b8e4fa4454bfa82e473522ee7a316222 100644
--- a/tensorflow/python/keras/_impl/keras/layers/advanced_activations.py
+++ b/tensorflow/python/keras/_impl/keras/layers/advanced_activations.py
@@ -14,18 +14,18 @@
 # ==============================================================================
 """Layers that act as activation functions.
 """
-
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.framework import tensor_shape
+from tensorflow.python.keras._impl.keras import activations
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras import constraints
 from tensorflow.python.keras._impl.keras import initializers
 from tensorflow.python.keras._impl.keras import regularizers
 from tensorflow.python.keras._impl.keras.engine import InputSpec
 from tensorflow.python.keras._impl.keras.engine import Layer
+from tensorflow.python.keras._impl.keras.engine.topology import shape_type_conversion
 
 
 class LeakyReLU(Layer):
@@ -61,6 +61,7 @@ class LeakyReLU(Layer):
     base_config = super(LeakyReLU, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
 
+  @shape_type_conversion
   def compute_output_shape(self, input_shape):
     return input_shape
 
@@ -114,9 +115,9 @@ class PReLU(Layer):
     else:
       self.shared_axes = list(shared_axes)
 
+  @shape_type_conversion
   def build(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape).as_list()
-    param_shape = input_shape[1:]
+    param_shape = list(input_shape[1:])
     self.param_broadcast = [False] * len(param_shape)
     if self.shared_axes is not None:
       for i in self.shared_axes:
@@ -140,15 +141,13 @@ class PReLU(Layer):
   def call(self, inputs, mask=None):
     pos = K.relu(inputs)
     if K.backend() == 'theano':
-      neg = (K.pattern_broadcast(self.alpha, self.param_broadcast) *
-             (inputs - K.abs(inputs)) * 0.5)
+      neg = (
+          K.pattern_broadcast(self.alpha, self.param_broadcast) *
+          (inputs - K.abs(inputs)) * 0.5)
     else:
       neg = -self.alpha * K.relu(-inputs)
     return pos + neg
 
-  def compute_output_shape(self, input_shape):
-    return input_shape
-
   def get_config(self):
     config = {
         'alpha_initializer': initializers.serialize(self.alpha_initializer),
@@ -159,6 +158,10 @@ class PReLU(Layer):
     base_config = super(PReLU, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
 
+  @shape_type_conversion
+  def compute_output_shape(self, input_shape):
+    return input_shape
+
 
 class ELU(Layer):
   """Exponential Linear Unit.
@@ -188,14 +191,15 @@ class ELU(Layer):
   def call(self, inputs):
     return K.elu(inputs, self.alpha)
 
-  def compute_output_shape(self, input_shape):
-    return input_shape
-
   def get_config(self):
     config = {'alpha': float(self.alpha)}
     base_config = super(ELU, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
 
+  @shape_type_conversion
+  def compute_output_shape(self, input_shape):
+    return input_shape
+
 
 class ThresholdedReLU(Layer):
   """Thresholded Rectified Linear Unit.
@@ -223,12 +227,46 @@ class ThresholdedReLU(Layer):
     self.theta = K.cast_to_floatx(theta)
 
   def call(self, inputs, mask=None):
-    return inputs * K.cast(inputs > self.theta, K.floatx())
+    return inputs * K.cast(K.greater(inputs, self.theta), K.floatx())
+
+  def get_config(self):
+    config = {'theta': float(self.theta)}
+    base_config = super(ThresholdedReLU, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
 
+  @shape_type_conversion
   def compute_output_shape(self, input_shape):
     return input_shape
 
+
+class Softmax(Layer):
+  """Softmax activation function.
+
+  Input shape:
+      Arbitrary. Use the keyword argument `input_shape`
+      (tuple of integers, does not include the samples axis)
+      when using this layer as the first layer in a model.
+
+  Output shape:
+      Same shape as the input.
+
+  Arguments:
+      axis: Integer, axis along which the softmax normalization is applied.
+  """
+
+  def __init__(self, axis=-1, **kwargs):
+    super(Softmax, self).__init__(**kwargs)
+    self.supports_masking = True
+    self.axis = axis
+
+  def call(self, inputs):
+    return activations.softmax(inputs, axis=self.axis)
+
   def get_config(self):
-    config = {'theta': float(self.theta)}
-    base_config = super(ThresholdedReLU, self).get_config()
+    config = {'axis': self.axis}
+    base_config = super(Softmax, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
+
+  @shape_type_conversion
+  def compute_output_shape(self, input_shape):
+    return input_shape
diff --git a/tensorflow/python/keras/_impl/keras/layers/advanced_activations_test.py b/tensorflow/python/keras/_impl/keras/layers/advanced_activations_test.py
index 91efab30edf99901b25dc0085b7d49e70d1b6d6d..343b7949accf3f0c9ddc5245910aa5faad8335c6 100644
--- a/tensorflow/python/keras/_impl/keras/layers/advanced_activations_test.py
+++ b/tensorflow/python/keras/_impl/keras/layers/advanced_activations_test.py
@@ -56,6 +56,12 @@ class AdvancedActivationsTest(test.TestCase):
                                kwargs={'theta': 0.5},
                                input_shape=(2, 3, 4))
 
+  def test_softmax(self):
+    with self.test_session():
+      testing_utils.layer_test(keras.layers.Softmax,
+                               kwargs={'axis': 1},
+                               input_shape=(2, 3, 4))
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/_impl/keras/layers/convolutional.py b/tensorflow/python/keras/_impl/keras/layers/convolutional.py
index 22496e8a765d4e86e7ef7ac5a25e6f4af94a28ce..2ee07327751f9f7dcc87aaa83e18bcc1b5991d5c 100644
--- a/tensorflow/python/keras/_impl/keras/layers/convolutional.py
+++ b/tensorflow/python/keras/_impl/keras/layers/convolutional.py
@@ -563,7 +563,7 @@ class Conv2DTranspose(tf_convolutional_layers.Conv2DTranspose, Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
-class Conv3DTranspose(tf_convolutional_layers.Conv3D, Layer):
+class Conv3DTranspose(tf_convolutional_layers.Conv3DTranspose, Layer):
   """Transposed convolution layer (sometimes called Deconvolution).
 
   The need for transposed convolutions generally arises
@@ -711,6 +711,144 @@ class Conv3DTranspose(tf_convolutional_layers.Conv3D, Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
+class SeparableConv1D(tf_convolutional_layers.SeparableConv1D, Layer):
+  """Depthwise separable 1D convolution.
+
+  This layer performs a depthwise convolution that acts separately on
+  channels, followed by a pointwise convolution that mixes channels.
+  If `use_bias` is True and a bias initializer is provided,
+  it adds a bias vector to the output.
+  It then optionally applies an activation function to produce the final output.
+
+  Arguments:
+    filters: Integer, the dimensionality of the output space (i.e. the number
+      of filters in the convolution).
+    kernel_size: A single integer specifying the spatial
+      dimensions of the filters.
+    strides: A single integer specifying the strides
+      of the convolution.
+      Specifying any `stride` value != 1 is incompatible with specifying
+      any `dilation_rate` value != 1.
+    padding: One of `"valid"` or `"same"` (case-insensitive).
+    data_format: A string, one of `channels_last` (default) or `channels_first`.
+      The ordering of the dimensions in the inputs.
+      `channels_last` corresponds to inputs with shape
+      `(batch, length, channels)` while `channels_first` corresponds to
+      inputs with shape `(batch, channels, length)`.
+    dilation_rate: A single integer, specifying
+      the dilation rate to use for dilated convolution.
+      Currently, specifying any `dilation_rate` value != 1 is
+      incompatible with specifying any stride value != 1.
+    depth_multiplier: The number of depthwise convolution output channels for
+      each input channel. The total number of depthwise convolution output
+      channels will be equal to `num_filters_in * depth_multiplier`.
+    activation: Activation function. Set it to None to maintain a
+      linear activation.
+    use_bias: Boolean, whether the layer uses a bias.
+    depthwise_initializer: An initializer for the depthwise convolution kernel.
+    pointwise_initializer: An initializer for the pointwise convolution kernel.
+    bias_initializer: An initializer for the bias vector. If None, the default
+      initializer will be used.
+    depthwise_regularizer: Optional regularizer for the depthwise
+      convolution kernel.
+    pointwise_regularizer: Optional regularizer for the pointwise
+      convolution kernel.
+    bias_regularizer: Optional regularizer for the bias vector.
+    activity_regularizer: Optional regularizer function for the output.
+    depthwise_constraint: Optional projection function to be applied to the
+        depthwise kernel after being updated by an `Optimizer` (e.g. used for
+        norm constraints or value constraints for layer weights). The function
+        must take as input the unprojected variable and must return the
+        projected variable (which must have the same shape). Constraints are
+        not safe to use when doing asynchronous distributed training.
+    pointwise_constraint: Optional projection function to be applied to the
+        pointwise kernel after being updated by an `Optimizer`.
+    bias_constraint: Optional projection function to be applied to the
+        bias after being updated by an `Optimizer`.
+    trainable: Boolean, if `True` also add variables to the graph collection
+      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
+    name: A string, the name of the layer.
+  """
+
+  def __init__(self,
+               filters,
+               kernel_size,
+               strides=1,
+               padding='valid',
+               data_format=None,
+               dilation_rate=1,
+               depth_multiplier=1,
+               activation=None,
+               use_bias=True,
+               depthwise_initializer='glorot_uniform',
+               pointwise_initializer='glorot_uniform',
+               bias_initializer='zeros',
+               depthwise_regularizer=None,
+               pointwise_regularizer=None,
+               bias_regularizer=None,
+               activity_regularizer=None,
+               depthwise_constraint=None,
+               pointwise_constraint=None,
+               bias_constraint=None,
+               **kwargs):
+    if data_format is None:
+      data_format = K.image_data_format()
+    super(SeparableConv1D, self).__init__(
+        filters=filters,
+        kernel_size=kernel_size,
+        strides=strides,
+        padding=padding,
+        data_format=data_format,
+        dilation_rate=dilation_rate,
+        activation=activations.get(activation),
+        use_bias=use_bias,
+        depthwise_initializer=initializers.get(depthwise_initializer),
+        pointwise_initializer=initializers.get(pointwise_initializer),
+        bias_initializer=initializers.get(bias_initializer),
+        depthwise_regularizer=regularizers.get(depthwise_regularizer),
+        pointwise_regularizer=regularizers.get(pointwise_regularizer),
+        bias_regularizer=regularizers.get(bias_regularizer),
+        activity_regularizer=regularizers.get(activity_regularizer),
+        depthwise_constraint=constraints.get(depthwise_constraint),
+        pointwise_constraint=constraints.get(pointwise_constraint),
+        bias_constraint=constraints.get(bias_constraint),
+        **kwargs)
+
+  def get_config(self):
+    config = {
+        'filters': self.filters,
+        'kernel_size': self.kernel_size,
+        'strides': self.strides,
+        'padding': self.padding,
+        'data_format': self.data_format,
+        'dilation_rate': self.dilation_rate,
+        'activation': activations.serialize(self.activation),
+        'use_bias': self.use_bias,
+        'depthwise_initializer':
+            initializers.serialize(self.depthwise_initializer),
+        'pointwise_initializer':
+            initializers.serialize(self.pointwise_initializer),
+        'bias_initializer':
+            initializers.serialize(self.bias_initializer),
+        'depthwise_regularizer':
+            regularizers.serialize(self.depthwise_regularizer),
+        'pointwise_regularizer':
+            regularizers.serialize(self.pointwise_regularizer),
+        'bias_regularizer':
+            regularizers.serialize(self.bias_regularizer),
+        'activity_regularizer':
+            regularizers.serialize(self.activity_regularizer),
+        'depthwise_constraint':
+            constraints.serialize(self.depthwise_constraint),
+        'pointwise_constraint':
+            constraints.serialize(self.pointwise_constraint),
+        'bias_constraint':
+            constraints.serialize(self.bias_constraint)
+    }
+    base_config = super(SeparableConv1D, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+
 class SeparableConv2D(tf_convolutional_layers.SeparableConv2D, Layer):
   """Depthwise separable 2D convolution.
 
@@ -1663,6 +1801,7 @@ class Cropping3D(Layer):
 Convolution1D = Conv1D
 Convolution2D = Conv2D
 Convolution3D = Conv3D
+SeparableConvolution1D = SeparableConv1D
 SeparableConvolution2D = SeparableConv2D
 Convolution2DTranspose = Conv2DTranspose
 Convolution3DTranspose = Conv3DTranspose
diff --git a/tensorflow/python/keras/_impl/keras/layers/convolutional_recurrent.py b/tensorflow/python/keras/_impl/keras/layers/convolutional_recurrent.py
index 4f0e9fc691133ae7f9a7834e17379cb8e25a8a2c..565db19e41fd4ebf334ea950566c8edf6896deaf 100644
--- a/tensorflow/python/keras/_impl/keras/layers/convolutional_recurrent.py
+++ b/tensorflow/python/keras/_impl/keras/layers/convolutional_recurrent.py
@@ -20,13 +20,13 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.python.framework import tensor_shape
 from tensorflow.python.keras._impl.keras import activations
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras import constraints
 from tensorflow.python.keras._impl.keras import initializers
 from tensorflow.python.keras._impl.keras import regularizers
 from tensorflow.python.keras._impl.keras.engine import InputSpec
+from tensorflow.python.keras._impl.keras.engine.topology import shape_type_conversion
 from tensorflow.python.keras._impl.keras.layers.recurrent import Recurrent
 from tensorflow.python.keras._impl.keras.utils import conv_utils
 
@@ -127,10 +127,10 @@ class ConvRecurrent2D(Recurrent):
     self.input_spec = [InputSpec(ndim=5)]
     self.state_spec = None
 
+  @shape_type_conversion
   def compute_output_shape(self, input_shape):
     if isinstance(input_shape, list):
       input_shape = input_shape[0]
-    input_shape = tensor_shape.TensorShape(input_shape).as_list()
     if self.data_format == 'channels_first':
       rows = input_shape[3]
       cols = input_shape[4]
@@ -151,30 +151,28 @@ class ConvRecurrent2D(Recurrent):
         dilation=self.dilation_rate[1])
     if self.return_sequences:
       if self.data_format == 'channels_first':
-        output_shape = [input_shape[0], input_shape[1],
-                        self.filters, rows, cols]
+        output_shape = (input_shape[0], input_shape[1], self.filters, rows,
+                        cols)
       elif self.data_format == 'channels_last':
-        output_shape = [input_shape[0], input_shape[1],
-                        rows, cols, self.filters]
+        output_shape = (input_shape[0], input_shape[1], rows, cols,
+                        self.filters)
     else:
       if self.data_format == 'channels_first':
-        output_shape = [input_shape[0], self.filters, rows, cols]
+        output_shape = (input_shape[0], self.filters, rows, cols)
       elif self.data_format == 'channels_last':
-        output_shape = [input_shape[0], rows, cols, self.filters]
+        output_shape = (input_shape[0], rows, cols, self.filters)
 
     if self.return_state:
       if self.data_format == 'channels_first':
-        output_shapes = [output_shape] + [(input_shape[0],
-                                           self.filters,
-                                           rows,
-                                           cols) for _ in range(2)]
+        output_shape = [output_shape] + [
+            (input_shape[0], self.filters, rows, cols) for _ in range(2)
+        ]
       elif self.data_format == 'channels_last':
-        output_shapes = [output_shape] + [(input_shape[0],
-                                           rows,
-                                           cols,
-                                           self.filters) for _ in range(2)]
-      return [tensor_shape.TensorShape(shape) for shape in output_shapes]
-    return tensor_shape.TensorShape(output_shape)
+        output_shape = [output_shape] + [
+            (input_shape[0], rows, cols, self.filters) for _ in range(2)
+        ]
+
+    return output_shape
 
   def get_config(self):
     config = {
@@ -294,11 +292,6 @@ class ConvLSTM2D(ConvRecurrent2D):
   Raises:
       ValueError: in case of invalid constructor arguments.
 
-  References:
-      - [Convolutional LSTM Network: A Machine Learning Approach for
-      Precipitation Nowcasting](http://arxiv.org/abs/1506.04214v1)
-      The current implementation does not include the feedback loop on the
-      cells output
   """
 
   def __init__(self,
@@ -338,7 +331,6 @@ class ConvLSTM2D(ConvRecurrent2D):
         return_sequences=return_sequences,
         go_backwards=go_backwards,
         stateful=stateful,
-        activity_regularizer=regularizers.get(activity_regularizer),
         **kwargs)
     self.activation = activations.get(activation)
     self.recurrent_activation = activations.get(recurrent_activation)
@@ -352,6 +344,7 @@ class ConvLSTM2D(ConvRecurrent2D):
     self.kernel_regularizer = regularizers.get(kernel_regularizer)
     self.recurrent_regularizer = regularizers.get(recurrent_regularizer)
     self.bias_regularizer = regularizers.get(bias_regularizer)
+    self.activity_regularizer = regularizers.get(activity_regularizer)
 
     self.kernel_constraint = constraints.get(kernel_constraint)
     self.recurrent_constraint = constraints.get(recurrent_constraint)
@@ -361,13 +354,12 @@ class ConvLSTM2D(ConvRecurrent2D):
     self.recurrent_dropout = min(1., max(0., recurrent_dropout))
     self.state_spec = [InputSpec(ndim=4), InputSpec(ndim=4)]
 
+  @shape_type_conversion
   def build(self, input_shape):
     if isinstance(input_shape, list):
       input_shape = input_shape[0]
-    input_shape = tuple(tensor_shape.TensorShape(input_shape).as_list())
     batch_size = input_shape[0] if self.stateful else None
     self.input_spec[0] = InputSpec(shape=(batch_size, None) + input_shape[2:])
-
     if self.stateful:
       self.reset_states()
     else:
diff --git a/tensorflow/python/keras/_impl/keras/layers/convolutional_test.py b/tensorflow/python/keras/_impl/keras/layers/convolutional_test.py
index be7da6f2b409aa57e3f1328441f0e37ede924c11..39c9d4f0fb2751b0eef3b28f6d5b8cb0a93e22e5 100644
--- a/tensorflow/python/keras/_impl/keras/layers/convolutional_test.py
+++ b/tensorflow/python/keras/_impl/keras/layers/convolutional_test.py
@@ -311,6 +311,72 @@ class Conv3DTransposeTest(test.TestCase):
       self.assertEqual(layer.bias.constraint, b_constraint)
 
 
+class SeparableConv1DTest(test.TestCase):
+
+  def test_separable_conv_1d(self):
+    num_samples = 2
+    filters = 6
+    stack_size = 3
+    length = 7
+    strides = 1
+
+    for padding in ['valid', 'same']:
+      for multiplier in [1, 2]:
+        if padding == 'same' and strides != 1:
+          continue
+
+        with self.test_session(use_gpu=True):
+          testing_utils.layer_test(
+              keras.layers.SeparableConv1D,
+              kwargs={
+                  'filters': filters,
+                  'kernel_size': 3,
+                  'padding': padding,
+                  'strides': strides,
+                  'depth_multiplier': multiplier
+              },
+              input_shape=(num_samples, length, stack_size))
+
+  def test_separable_conv1d_regularizers(self):
+    kwargs = {
+        'filters': 3,
+        'kernel_size': 3,
+        'padding': 'valid',
+        'depthwise_regularizer': 'l2',
+        'pointwise_regularizer': 'l2',
+        'bias_regularizer': 'l2',
+        'activity_regularizer': 'l2',
+        'strides': 1
+    }
+    with self.test_session(use_gpu=True):
+      layer = keras.layers.SeparableConv1D(**kwargs)
+      layer.build((None, 5, 2))
+      self.assertEqual(len(layer.losses), 3)
+      layer(keras.backend.variable(np.ones((1, 5, 2))))
+      self.assertEqual(len(layer.losses), 4)
+
+  def test_separable_conv1d_constraints(self):
+    d_constraint = lambda x: x
+    p_constraint = lambda x: x
+    b_constraint = lambda x: x
+
+    kwargs = {
+        'filters': 3,
+        'kernel_size': 3,
+        'padding': 'valid',
+        'pointwise_constraint': p_constraint,
+        'depthwise_constraint': d_constraint,
+        'bias_constraint': b_constraint,
+        'strides': 1
+    }
+    with self.test_session(use_gpu=True):
+      layer = keras.layers.SeparableConv1D(**kwargs)
+      layer.build((None, 5, 2))
+      self.assertEqual(layer.depthwise_kernel.constraint, d_constraint)
+      self.assertEqual(layer.pointwise_kernel.constraint, p_constraint)
+      self.assertEqual(layer.bias.constraint, b_constraint)
+
+
 class SeparableConv2DTest(test.TestCase):
 
   def test_separable_conv_2d(self):
diff --git a/tensorflow/python/keras/_impl/keras/layers/core.py b/tensorflow/python/keras/_impl/keras/layers/core.py
index 6ee3fb48b2f1426b87c5c1947e90d0797e9b9ff7..ea2d3f2f04a591ab97f09dd0a43829fe9f75fc9e 100644
--- a/tensorflow/python/keras/_impl/keras/layers/core.py
+++ b/tensorflow/python/keras/_impl/keras/layers/core.py
@@ -23,6 +23,7 @@ import types as python_types
 
 import numpy as np
 
+from tensorflow.python.eager import context
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.keras._impl.keras import activations
 from tensorflow.python.keras._impl.keras import backend as K
@@ -119,7 +120,8 @@ class Dropout(tf_core_layers.Dropout, Layer):
     if training is None:
       training = K.learning_phase()
     output = super(Dropout, self).call(inputs, training=training)
-    if training is K.learning_phase():
+    # EagerTensor object has no attribute _uses_learning_phase
+    if not context.in_eager_mode() and training is K.learning_phase():
       output._uses_learning_phase = True  # pylint: disable=protected-access
     return output
 
diff --git a/tensorflow/python/keras/_impl/keras/layers/embeddings.py b/tensorflow/python/keras/_impl/keras/layers/embeddings.py
index 51c520be38f5fac32fec9e4a13c380a2e477c709..f8e31068f8910ff44889963c8c301c39844d85cb 100644
--- a/tensorflow/python/keras/_impl/keras/layers/embeddings.py
+++ b/tensorflow/python/keras/_impl/keras/layers/embeddings.py
@@ -18,12 +18,12 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.framework import tensor_shape
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras import constraints
 from tensorflow.python.keras._impl.keras import initializers
 from tensorflow.python.keras._impl.keras import regularizers
 from tensorflow.python.keras._impl.keras.engine import Layer
+from tensorflow.python.keras._impl.keras.engine.topology import shape_type_conversion
 
 
 class Embedding(Layer):
@@ -58,13 +58,13 @@ class Embedding(Layer):
     output_dim: int >= 0. Dimension of the dense embedding.
     embeddings_initializer: Initializer for the `embeddings` matrix.
     embeddings_regularizer: Regularizer function applied to
-          the `embeddings` matrix.
+        the `embeddings` matrix.
     embeddings_constraint: Constraint function applied to
-          the `embeddings` matrix.
+        the `embeddings` matrix.
     mask_zero: Whether or not the input value 0 is a special "padding"
         value that should be masked out.
-        This is useful when using recurrent layers,
-        which may take variable length inputs.
+        This is useful when using recurrent layers
+        which may take variable length input.
         If this is `True` then all subsequent layers
         in the model need to support masking or an exception will be raised.
         If mask_zero is set to True, as a consequence, index 0 cannot be
@@ -81,9 +81,6 @@ class Embedding(Layer):
   Output shape:
       3D tensor with shape: `(batch_size, sequence_length, output_dim)`.
 
-  References:
-      - [A Theoretically Grounded Application of Dropout in Recurrent Neural
-        Networks](http://arxiv.org/abs/1512.05287)
   """
 
   def __init__(self,
@@ -101,19 +98,19 @@ class Embedding(Layer):
         kwargs['input_shape'] = (input_length,)
       else:
         kwargs['input_shape'] = (None,)
-    super(Embedding, self).__init__(
-        activity_regularizer=regularizers.get(activity_regularizer), **kwargs)
+    super(Embedding, self).__init__(**kwargs)
 
     self.input_dim = input_dim
     self.output_dim = output_dim
     self.embeddings_initializer = initializers.get(embeddings_initializer)
     self.embeddings_regularizer = regularizers.get(embeddings_regularizer)
+    self.activity_regularizer = regularizers.get(activity_regularizer)
     self.embeddings_constraint = constraints.get(embeddings_constraint)
     self.mask_zero = mask_zero
     self.input_length = input_length
 
+  @shape_type_conversion
   def build(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape).as_list()
     self.embeddings = self.add_weight(
         shape=(self.input_dim, self.output_dim),
         initializer=self.embeddings_initializer,
@@ -129,10 +126,10 @@ class Embedding(Layer):
     else:
       return K.not_equal(inputs, 0)
 
+  @shape_type_conversion
   def compute_output_shape(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape).as_list()
     if self.input_length is None:
-      return tensor_shape.TensorShape(input_shape + [self.output_dim])
+      return input_shape + (self.output_dim,)
     else:
       # input_length can be tuple if input is 3D or higher
       if isinstance(self.input_length, (list, tuple)):
@@ -149,8 +146,7 @@ class Embedding(Layer):
                        (str(self.input_length), str(input_shape)))
           elif s1 is None:
             in_lens[i] = s2
-      return tensor_shape.TensorShape(
-          (input_shape[0],) + tuple(in_lens) + (self.output_dim,))
+      return (input_shape[0],) + tuple(in_lens) + (self.output_dim,)
 
   def call(self, inputs):
     if K.dtype(inputs) != 'int32':
diff --git a/tensorflow/python/keras/_impl/keras/layers/local.py b/tensorflow/python/keras/_impl/keras/layers/local.py
index 0a31b87fb564b2833c0dea1ebb3a977b07f13a24..b844b071e02b4f8f217a09a7d412609f6e2cadeb 100644
--- a/tensorflow/python/keras/_impl/keras/layers/local.py
+++ b/tensorflow/python/keras/_impl/keras/layers/local.py
@@ -18,7 +18,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.framework import tensor_shape
 from tensorflow.python.keras._impl.keras import activations
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras import constraints
@@ -26,6 +25,7 @@ from tensorflow.python.keras._impl.keras import initializers
 from tensorflow.python.keras._impl.keras import regularizers
 from tensorflow.python.keras._impl.keras.engine import InputSpec
 from tensorflow.python.keras._impl.keras.engine import Layer
+from tensorflow.python.keras._impl.keras.engine.topology import shape_type_conversion
 from tensorflow.python.keras._impl.keras.utils import conv_utils
 
 
@@ -98,8 +98,7 @@ class LocallyConnected1D(Layer):
                kernel_constraint=None,
                bias_constraint=None,
                **kwargs):
-    super(LocallyConnected1D, self).__init__(
-        activity_regularizer=regularizers.get(activity_regularizer), **kwargs)
+    super(LocallyConnected1D, self).__init__(**kwargs)
     self.filters = filters
     self.kernel_size = conv_utils.normalize_tuple(kernel_size, 1, 'kernel_size')
     self.strides = conv_utils.normalize_tuple(strides, 1, 'strides')
@@ -114,12 +113,13 @@ class LocallyConnected1D(Layer):
     self.bias_initializer = initializers.get(bias_initializer)
     self.kernel_regularizer = regularizers.get(kernel_regularizer)
     self.bias_regularizer = regularizers.get(bias_regularizer)
+    self.activity_regularizer = regularizers.get(activity_regularizer)
     self.kernel_constraint = constraints.get(kernel_constraint)
     self.bias_constraint = constraints.get(bias_constraint)
     self.input_spec = InputSpec(ndim=3)
 
+  @shape_type_conversion
   def build(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape).as_list()
     input_dim = input_shape[2]
     if input_dim is None:
       raise ValueError('Axis 2 of input should be fully-defined. '
@@ -146,15 +146,14 @@ class LocallyConnected1D(Layer):
     self.input_spec = InputSpec(ndim=3, axes={2: input_dim})
     self.built = True
 
+  @shape_type_conversion
   def compute_output_shape(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape).as_list()
     length = conv_utils.conv_output_length(input_shape[1], self.kernel_size[0],
                                            self.padding, self.strides[0])
-    return tensor_shape.TensorShape([input_shape[0], length, self.filters])
+    return (input_shape[0], length, self.filters)
 
   def call(self, inputs):
     output = K.local_conv1d(inputs, self.kernel, self.kernel_size, self.strides)
-
     if self.use_bias:
       output = K.bias_add(output, self.bias)
     if self.activation is not None:
@@ -163,20 +162,32 @@ class LocallyConnected1D(Layer):
 
   def get_config(self):
     config = {
-        'filters': self.filters,
-        'kernel_size': self.kernel_size,
-        'strides': self.strides,
-        'padding': self.padding,
-        'activation': activations.serialize(self.activation),
-        'use_bias': self.use_bias,
-        'kernel_initializer': initializers.serialize(self.kernel_initializer),
-        'bias_initializer': initializers.serialize(self.bias_initializer),
-        'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
-        'bias_regularizer': regularizers.serialize(self.bias_regularizer),
+        'filters':
+            self.filters,
+        'kernel_size':
+            self.kernel_size,
+        'strides':
+            self.strides,
+        'padding':
+            self.padding,
+        'activation':
+            activations.serialize(self.activation),
+        'use_bias':
+            self.use_bias,
+        'kernel_initializer':
+            initializers.serialize(self.kernel_initializer),
+        'bias_initializer':
+            initializers.serialize(self.bias_initializer),
+        'kernel_regularizer':
+            regularizers.serialize(self.kernel_regularizer),
+        'bias_regularizer':
+            regularizers.serialize(self.bias_regularizer),
         'activity_regularizer':
             regularizers.serialize(self.activity_regularizer),
-        'kernel_constraint': constraints.serialize(self.kernel_constraint),
-        'bias_constraint': constraints.serialize(self.bias_constraint)
+        'kernel_constraint':
+            constraints.serialize(self.kernel_constraint),
+        'bias_constraint':
+            constraints.serialize(self.bias_constraint)
     }
     base_config = super(LocallyConnected1D, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
@@ -273,8 +284,7 @@ class LocallyConnected2D(Layer):
                kernel_constraint=None,
                bias_constraint=None,
                **kwargs):
-    super(LocallyConnected2D, self).__init__(
-        activity_regularizer=regularizers.get(activity_regularizer), **kwargs)
+    super(LocallyConnected2D, self).__init__(**kwargs)
     self.filters = filters
     self.kernel_size = conv_utils.normalize_tuple(kernel_size, 2, 'kernel_size')
     self.strides = conv_utils.normalize_tuple(strides, 2, 'strides')
@@ -289,12 +299,13 @@ class LocallyConnected2D(Layer):
     self.bias_initializer = initializers.get(bias_initializer)
     self.kernel_regularizer = regularizers.get(kernel_regularizer)
     self.bias_regularizer = regularizers.get(bias_regularizer)
+    self.activity_regularizer = regularizers.get(activity_regularizer)
     self.kernel_constraint = constraints.get(kernel_constraint)
     self.bias_constraint = constraints.get(bias_constraint)
     self.input_spec = InputSpec(ndim=4)
 
+  @shape_type_conversion
   def build(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape).as_list()
     if self.data_format == 'channels_last':
       input_row, input_col = input_shape[1:-1]
       input_filter = input_shape[3]
@@ -306,7 +317,6 @@ class LocallyConnected2D(Layer):
                        ' a LocallyConnected2D layer '
                        'should be fully-defined, but layer received '
                        'the inputs shape ' + str(input_shape))
-
     output_row = conv_utils.conv_output_length(input_row, self.kernel_size[0],
                                                self.padding, self.strides[0])
     output_col = conv_utils.conv_output_length(input_col, self.kernel_size[1],
@@ -337,33 +347,30 @@ class LocallyConnected2D(Layer):
       self.input_spec = InputSpec(ndim=4, axes={-1: input_filter})
     self.built = True
 
+  @shape_type_conversion
   def compute_output_shape(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape).as_list()
     if self.data_format == 'channels_first':
       rows = input_shape[2]
       cols = input_shape[3]
     elif self.data_format == 'channels_last':
       rows = input_shape[1]
       cols = input_shape[2]
+
     rows = conv_utils.conv_output_length(rows, self.kernel_size[0],
                                          self.padding, self.strides[0])
     cols = conv_utils.conv_output_length(cols, self.kernel_size[1],
                                          self.padding, self.strides[1])
 
     if self.data_format == 'channels_first':
-      return tensor_shape.TensorShape(
-          [input_shape[0], self.filters, rows, cols])
+      return (input_shape[0], self.filters, rows, cols)
     elif self.data_format == 'channels_last':
-      return tensor_shape.TensorShape(
-          [input_shape[0], rows, cols, self.filters])
+      return (input_shape[0], rows, cols, self.filters)
 
   def call(self, inputs):
-    output = K.local_conv2d(inputs,
-                            self.kernel,
-                            self.kernel_size,
-                            self.strides,
+    output = K.local_conv2d(inputs, self.kernel, self.kernel_size, self.strides,
                             (self.output_row, self.output_col),
                             self.data_format)
+
     if self.use_bias:
       output = K.bias_add(output, self.bias, data_format=self.data_format)
 
@@ -372,21 +379,34 @@ class LocallyConnected2D(Layer):
 
   def get_config(self):
     config = {
-        'filters': self.filters,
-        'kernel_size': self.kernel_size,
-        'strides': self.strides,
-        'padding': self.padding,
-        'data_format': self.data_format,
-        'activation': activations.serialize(self.activation),
-        'use_bias': self.use_bias,
-        'kernel_initializer': initializers.serialize(self.kernel_initializer),
-        'bias_initializer': initializers.serialize(self.bias_initializer),
-        'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
-        'bias_regularizer': regularizers.serialize(self.bias_regularizer),
+        'filters':
+            self.filters,
+        'kernel_size':
+            self.kernel_size,
+        'strides':
+            self.strides,
+        'padding':
+            self.padding,
+        'data_format':
+            self.data_format,
+        'activation':
+            activations.serialize(self.activation),
+        'use_bias':
+            self.use_bias,
+        'kernel_initializer':
+            initializers.serialize(self.kernel_initializer),
+        'bias_initializer':
+            initializers.serialize(self.bias_initializer),
+        'kernel_regularizer':
+            regularizers.serialize(self.kernel_regularizer),
+        'bias_regularizer':
+            regularizers.serialize(self.bias_regularizer),
         'activity_regularizer':
             regularizers.serialize(self.activity_regularizer),
-        'kernel_constraint': constraints.serialize(self.kernel_constraint),
-        'bias_constraint': constraints.serialize(self.bias_constraint)
+        'kernel_constraint':
+            constraints.serialize(self.kernel_constraint),
+        'bias_constraint':
+            constraints.serialize(self.bias_constraint)
     }
     base_config = super(LocallyConnected2D, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
diff --git a/tensorflow/python/keras/_impl/keras/layers/merge.py b/tensorflow/python/keras/_impl/keras/layers/merge.py
index 76eb03cf274a648da127b9d3e0c911096d361812..38b0b302972a31fb29b516680470c587d75a0781 100644
--- a/tensorflow/python/keras/_impl/keras/layers/merge.py
+++ b/tensorflow/python/keras/_impl/keras/layers/merge.py
@@ -14,15 +14,15 @@
 # ==============================================================================
 # pylint: disable=not-callable
 # pylint: disable=redefined-builtin
-"""Layers can merge several input tensors into a single output tensor.
+"""Layers that can merge several inputs into one.
 """
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.framework import tensor_shape
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.engine.topology import Layer
+from tensorflow.python.keras._impl.keras.engine.topology import shape_type_conversion
 
 
 class _Merge(Layer):
@@ -73,12 +73,13 @@ class _Merge(Layer):
         output_shape.append(i)
       else:
         if i != j:
-          raise ValueError('Operands could not be broadcast '
-                           'together with shapes ' + str(shape1) + ' ' +
-                           str(shape2))
+          raise ValueError(
+              'Operands could not be broadcast '
+              'together with shapes ' + str(shape1) + ' ' + str(shape2))
         output_shape.append(i)
     return tuple(output_shape)
 
+  @shape_type_conversion
   def build(self, input_shape):
     # Used purely for shape validation.
     if not isinstance(input_shape, list):
@@ -87,14 +88,13 @@ class _Merge(Layer):
       raise ValueError('A merge layer should be called '
                        'on a list of at least 2 inputs. '
                        'Got ' + str(len(input_shape)) + ' inputs.')
-    input_shape = [tensor_shape.TensorShape(s).as_list() for s in input_shape]
     batch_sizes = [s[0] for s in input_shape if s is not None]
     batch_sizes = set(batch_sizes)
     batch_sizes -= set([None])
     if len(batch_sizes) > 1:
-      raise ValueError('Can not merge tensors with different '
-                       'batch sizes. Got tensors with shapes : ' +
-                       str(input_shape))
+      raise ValueError(
+          'Can not merge tensors with different '
+          'batch sizes. Got tensors with shapes : ' + str(input_shape))
     if input_shape[0] is None:
       output_shape = None
     else:
@@ -111,9 +111,10 @@ class _Merge(Layer):
       self._reshape_required = False
     else:
       self._reshape_required = True
-    self.built = True
 
   def call(self, inputs):
+    if not isinstance(inputs, list):
+      raise ValueError('A merge layer should be called ' 'on a list of inputs.')
     if self._reshape_required:
       reshaped_inputs = []
       input_ndims = list(map(K.ndim, inputs))
@@ -172,6 +173,7 @@ class _Merge(Layer):
     else:
       return self._merge_function(inputs)
 
+  @shape_type_conversion
   def compute_output_shape(self, input_shape):
     if input_shape[0] is None:
       output_shape = None
@@ -214,6 +216,22 @@ class Add(_Merge):
   It takes as input a list of tensors,
   all of the same shape, and returns
   a single tensor (also of the same shape).
+
+  Examples:
+
+  ```python
+      import keras
+
+      input1 = keras.layers.Input(shape=(16,))
+      x1 = keras.layers.Dense(8, activation='relu')(input1)
+      input2 = keras.layers.Input(shape=(32,))
+      x2 = keras.layers.Dense(8, activation='relu')(input2)
+      added = keras.layers.Add()([x1, x2])  # equivalent to added =
+      keras.layers.add([x1, x2])
+
+      out = keras.layers.Dense(4)(added)
+      model = keras.models.Model(inputs=[input1, input2], outputs=out)
+  ```
   """
 
   def _merge_function(self, inputs):
@@ -247,10 +265,17 @@ class Subtract(_Merge):
   ```
   """
 
+  @shape_type_conversion
+  def build(self, input_shape):
+    super(Subtract, self).build(input_shape)
+    if len(input_shape) != 2:
+      raise ValueError('A `Subtract` layer should be called '
+                       'on exactly 2 inputs')
+
   def _merge_function(self, inputs):
     if len(inputs) != 2:
-      raise ValueError('`Subtract` layer should be called '
-                       'on exactly 2 inputs. Received: %s' % inputs)
+      raise ValueError('A `Subtract` layer should be called '
+                       'on exactly 2 inputs')
     return inputs[0] - inputs[1]
 
 
@@ -330,47 +355,43 @@ class Concatenate(_Merge):
     super(Concatenate, self).__init__(**kwargs)
     self.axis = axis
     self.supports_masking = True
+    self._reshape_required = False
 
+  @shape_type_conversion
   def build(self, input_shape):
     # Used purely for shape validation.
-    if not (isinstance(input_shape, list) and len(input_shape) > 1):
-      raise ValueError('`Concatenate` layer should be called '
-                       'on a list containing at least two inputs')
+    if not isinstance(input_shape, list) or len(input_shape) < 2:
+      raise ValueError('A `Concatenate` layer should be called '
+                       'on a list of at least 2 inputs')
     if all([shape is None for shape in input_shape]):
       return
-    reduced_inputs_shapes = [
-        tensor_shape.TensorShape(shape).as_list() for shape in input_shape
-    ]
+    reduced_inputs_shapes = [list(shape) for shape in input_shape]
     shape_set = set()
     for i in range(len(reduced_inputs_shapes)):
       del reduced_inputs_shapes[i][self.axis]
       shape_set.add(tuple(reduced_inputs_shapes[i]))
     if len(shape_set) > 1:
-      raise ValueError('`Concatenate` layer requires '
+      raise ValueError('A `Concatenate` layer requires '
                        'inputs with matching shapes '
                        'except for the concat axis. '
                        'Got inputs shapes: %s' % (input_shape))
-    self.built = True
 
-  def call(self, inputs):
-    if not isinstance(inputs, list):
-      raise ValueError('A `Concatenate` layer should be called '
-                       'on a list of inputs.')
+  def _merge_function(self, inputs):
     return K.concatenate(inputs, axis=self.axis)
 
+  @shape_type_conversion
   def compute_output_shape(self, input_shape):
     if not isinstance(input_shape, list):
       raise ValueError('A `Concatenate` layer should be called '
                        'on a list of inputs.')
     input_shapes = input_shape
-    output_shape = tensor_shape.TensorShape(input_shapes[0]).as_list()
+    output_shape = list(input_shapes[0])
     for shape in input_shapes[1:]:
-      shape = tensor_shape.TensorShape(shape).as_list()
       if output_shape[self.axis] is None or shape[self.axis] is None:
         output_shape[self.axis] = None
         break
       output_shape[self.axis] += shape[self.axis]
-    return tensor_shape.TensorShape(output_shape)
+    return tuple(output_shape)
 
   def compute_mask(self, inputs, mask=None):
     if mask is None:
@@ -390,7 +411,7 @@ class Concatenate(_Merge):
     masks = []
     for input_i, mask_i in zip(inputs, mask):
       if mask_i is None:
-        # Input is unmasked. Append all 1s to masks
+        # Input is unmasked. Append all 1s to masks,
         masks.append(K.ones_like(input_i, dtype='bool'))
       elif K.ndim(mask_i) < K.ndim(input_i):
         # Mask is smaller than the input, expand it
@@ -441,14 +462,16 @@ class Dot(_Merge):
     self.axes = axes
     self.normalize = normalize
     self.supports_masking = True
+    self._reshape_required = False
 
+  @shape_type_conversion
   def build(self, input_shape):
     # Used purely for shape validation.
     if not isinstance(input_shape, list) or len(input_shape) != 2:
       raise ValueError('A `Dot` layer should be called '
                        'on a list of 2 inputs.')
-    shape1 = tensor_shape.TensorShape(input_shape[0]).as_list()
-    shape2 = tensor_shape.TensorShape(input_shape[1]).as_list()
+    shape1 = input_shape[0]
+    shape2 = input_shape[1]
     if shape1 is None or shape2 is None:
       return
     if isinstance(self.axes, int):
@@ -462,9 +485,10 @@ class Dot(_Merge):
       raise ValueError('Dimension incompatibility '
                        '%s != %s. ' % (shape1[axes[0]], shape2[axes[1]]) +
                        'Layer shapes: %s, %s' % (shape1, shape2))
-    self.built = True
 
-  def call(self, inputs):
+  def _merge_function(self, inputs):
+    if len(inputs) != 2:
+      raise ValueError('A `Dot` layer should be called ' 'on exactly 2 inputs')
     x1 = inputs[0]
     x2 = inputs[1]
     if isinstance(self.axes, int):
@@ -485,12 +509,13 @@ class Dot(_Merge):
     output = K.batch_dot(x1, x2, axes)
     return output
 
+  @shape_type_conversion
   def compute_output_shape(self, input_shape):
     if not isinstance(input_shape, list) or len(input_shape) != 2:
       raise ValueError('A `Dot` layer should be called '
                        'on a list of 2 inputs.')
-    shape1 = tensor_shape.TensorShape(input_shape[0]).as_list()
-    shape2 = tensor_shape.TensorShape(input_shape[1]).as_list()
+    shape1 = list(input_shape[0])
+    shape2 = list(input_shape[1])
     if isinstance(self.axes, int):
       if self.axes < 0:
         axes = [self.axes % len(shape1), self.axes % len(shape2)]
@@ -504,7 +529,7 @@ class Dot(_Merge):
     output_shape = shape1 + shape2
     if len(output_shape) == 1:
       output_shape += [1]
-    return tensor_shape.TensorShape(output_shape)
+    return tuple(output_shape)
 
   def compute_mask(self, inputs, mask=None):
     return None
@@ -527,6 +552,21 @@ def add(inputs, **kwargs):
 
   Returns:
       A tensor, the sum of the inputs.
+
+  Examples:
+
+  ```python
+      import keras
+
+      input1 = keras.layers.Input(shape=(16,))
+      x1 = keras.layers.Dense(8, activation='relu')(input1)
+      input2 = keras.layers.Input(shape=(32,))
+      x2 = keras.layers.Dense(8, activation='relu')(input2)
+      added = keras.layers.add([x1, x2])
+
+      out = keras.layers.Dense(4)(added)
+      model = keras.models.Model(inputs=[input1, input2], outputs=out)
+  ```
   """
   return Add(**kwargs)(inputs)
 
diff --git a/tensorflow/python/keras/_impl/keras/layers/noise.py b/tensorflow/python/keras/_impl/keras/layers/noise.py
index 459f13145f090f1942543ec2f5da4e9b8cd71509..04fffcc384cc4d868937d92bbd5a8f6505ca1770 100644
--- a/tensorflow/python/keras/_impl/keras/layers/noise.py
+++ b/tensorflow/python/keras/_impl/keras/layers/noise.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Layers for regularization models via the addition of noise.
+"""Layers that operate regularization via the addition of noise.
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -22,6 +22,7 @@ import numpy as np
 
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.engine import Layer
+from tensorflow.python.keras._impl.keras.engine.topology import shape_type_conversion
 
 
 class GaussianNoise(Layer):
@@ -59,14 +60,15 @@ class GaussianNoise(Layer):
 
     return K.in_train_phase(noised, inputs, training=training)
 
-  def compute_output_shape(self, input_shape):
-    return input_shape
-
   def get_config(self):
     config = {'stddev': self.stddev}
     base_config = super(GaussianNoise, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
 
+  @shape_type_conversion
+  def compute_output_shape(self, input_shape):
+    return input_shape
+
 
 class GaussianDropout(Layer):
   """Apply multiplicative 1-centered Gaussian noise.
@@ -86,10 +88,6 @@ class GaussianDropout(Layer):
   Output shape:
       Same shape as input.
 
-  References:
-      - [Dropout: A Simple Way to Prevent Neural Networks from Overfitting
-        Srivastava, Hinton, et al.
-        2014](http://www.cs.toronto.edu/~rsalakhu/papers/srivastava14a.pdf)
   """
 
   def __init__(self, rate, **kwargs):
@@ -108,14 +106,15 @@ class GaussianDropout(Layer):
       return K.in_train_phase(noised, inputs, training=training)
     return inputs
 
-  def compute_output_shape(self, input_shape):
-    return input_shape
-
   def get_config(self):
     config = {'rate': self.rate}
     base_config = super(GaussianDropout, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
 
+  @shape_type_conversion
+  def compute_output_shape(self, input_shape):
+    return input_shape
+
 
 class AlphaDropout(Layer):
   """Applies Alpha Dropout to the input.
@@ -140,8 +139,6 @@ class AlphaDropout(Layer):
   Output shape:
       Same shape as input.
 
-  References:
-      - [Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515)
   """
 
   def __init__(self, rate, noise_shape=None, seed=None, **kwargs):
@@ -157,26 +154,34 @@ class AlphaDropout(Layer):
   def call(self, inputs, training=None):
     if 0. < self.rate < 1.:
       noise_shape = self._get_noise_shape(inputs)
-      alpha = 1.6732632423543772848170429916717
-      scale = 1.0507009873554804934193349852946
 
-      def dropped_inputs(inputs=inputs, rate=self.rate, seed=self.seed):
+      def dropped_inputs(inputs=inputs, rate=self.rate, seed=self.seed):  # pylint: disable=missing-docstring
+        alpha = 1.6732632423543772848170429916717
+        scale = 1.0507009873554804934193349852946
         alpha_p = -alpha * scale
-        kept_idx = K.greater_equal(K.random_uniform(noise_shape, seed=seed),
-                                   rate)
+
+        kept_idx = K.greater_equal(
+            K.random_uniform(noise_shape, seed=seed), rate)
         kept_idx = K.cast(kept_idx, K.floatx())
-        a = ((1 - rate) * (1 + rate * alpha_p ** 2)) ** -0.5
+
+        # Get affine transformation params
+        a = ((1 - rate) * (1 + rate * alpha_p**2))**-0.5
         b = -a * alpha_p * rate
+
+        # Apply mask
         x = inputs * kept_idx + alpha_p * (1 - kept_idx)
+
+        # Do affine transformation
         return a * x + b
 
       return K.in_train_phase(dropped_inputs, inputs, training=training)
     return inputs
 
-  def compute_output_shape(self, input_shape):
-    return input_shape
-
   def get_config(self):
     config = {'rate': self.rate}
     base_config = super(AlphaDropout, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
+
+  @shape_type_conversion
+  def compute_output_shape(self, input_shape):
+    return input_shape
diff --git a/tensorflow/python/keras/_impl/keras/layers/normalization.py b/tensorflow/python/keras/_impl/keras/layers/normalization.py
index 965ef70e6e6cb488aa4832462da4a2cb43e964a6..eecb14ceaa38968d54ea6702e534ee29b6e180d5 100644
--- a/tensorflow/python/keras/_impl/keras/layers/normalization.py
+++ b/tensorflow/python/keras/_impl/keras/layers/normalization.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.eager import context
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras import constraints
 from tensorflow.python.keras._impl.keras import initializers
@@ -108,7 +109,7 @@ class BatchNormalization(tf_normalization_layers.BatchNormalization, Layer):
     if training is None:
       training = K.learning_phase()
     output = super(BatchNormalization, self).call(inputs, training=training)
-    if training is K.learning_phase():
+    if context.in_graph_mode() and training is K.learning_phase():
       output._uses_learning_phase = True  # pylint: disable=protected-access
     return output
 
diff --git a/tensorflow/python/keras/_impl/keras/layers/recurrent.py b/tensorflow/python/keras/_impl/keras/layers/recurrent.py
index 9ea21c9c363455d693cc4d766b5f94ade56838d9..1b0f6cb6cf20fc55d3edefc9684d423cf25b7e0e 100644
--- a/tensorflow/python/keras/_impl/keras/layers/recurrent.py
+++ b/tensorflow/python/keras/_impl/keras/layers/recurrent.py
@@ -1,4 +1,4 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,7 +13,7 @@
 # limitations under the License.
 # ==============================================================================
 # pylint: disable=protected-access
-"""Recurrent layers.
+"""Recurrent layers and their base classes.
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -29,6 +29,7 @@ from tensorflow.python.keras._impl.keras import initializers
 from tensorflow.python.keras._impl.keras import regularizers
 from tensorflow.python.keras._impl.keras.engine import InputSpec
 from tensorflow.python.keras._impl.keras.engine import Layer
+from tensorflow.python.keras._impl.keras.engine.topology import shape_type_conversion
 from tensorflow.python.keras._impl.keras.utils.generic_utils import has_arg
 from tensorflow.python.platform import tf_logging as logging
 
@@ -109,6 +110,7 @@ class StackedRNNCells(Layer):
       states += cell_states
     return inputs, states
 
+  @shape_type_conversion
   def build(self, input_shape):
     for cell in self.cells:
       if isinstance(cell, Layer):
@@ -117,7 +119,7 @@ class StackedRNNCells(Layer):
         output_dim = cell.state_size[0]
       else:
         output_dim = cell.state_size
-      input_shape = (input_shape[0], input_shape[1], output_dim)
+      input_shape = (input_shape[0], output_dim)
     self.built = True
 
   def get_config(self):
@@ -262,8 +264,7 @@ class RNN(Layer):
           (e.g. via the `input_shape` argument)
 
   Input shape:
-      3D tensor with shape `(batch_size, timesteps, input_dim)`,
-      (Optional) 2D tensors with shape `(batch_size, output_dim)`.
+      3D tensor with shape `(batch_size, timesteps, input_dim)`.
 
   Output shape:
       - if `return_state`: a list of tensors. The first tensor is
@@ -370,7 +371,6 @@ class RNN(Layer):
                go_backwards=False,
                stateful=False,
                unroll=False,
-               activity_regularizer=None,
                **kwargs):
     if isinstance(cell, (list, tuple)):
       cell = StackedRNNCells(cell)
@@ -382,8 +382,7 @@ class RNN(Layer):
                        'an attribute `state_size` '
                        '(tuple of integers, '
                        'one integer per RNN state).')
-    super(RNN, self).__init__(
-        activity_regularizer=regularizers.get(activity_regularizer), **kwargs)
+    super(RNN, self).__init__(**kwargs)
     self.cell = cell
     self.return_sequences = return_sequences
     self.return_state = return_state
@@ -412,15 +411,16 @@ class RNN(Layer):
   def states(self, states):
     self._states = states
 
+  @shape_type_conversion
   def compute_output_shape(self, input_shape):
     if isinstance(input_shape, list):
       input_shape = input_shape[0]
-    input_shape = tensor_shape.TensorShape(input_shape).as_list()
 
     if hasattr(self.cell.state_size, '__len__'):
-      output_dim = self.cell.state_size[0]
+      state_size = self.cell.state_size
     else:
-      output_dim = self.cell.state_size
+      state_size = [self.cell.state_size]
+    output_dim = state_size[0]
 
     if self.return_sequences:
       output_shape = (input_shape[0], input_shape[1], output_dim)
@@ -428,11 +428,10 @@ class RNN(Layer):
       output_shape = (input_shape[0], output_dim)
 
     if self.return_state:
-      state_shape = [(input_shape[0], output_dim) for _ in self.states]
-      output_shape = [output_shape] + state_shape
+      state_shape = [(input_shape[0], dim) for dim in state_size]
+      return [output_shape] + state_shape
     else:
-      output_shape = output_shape
-    return tensor_shape.TensorShape(output_shape)
+      return output_shape
 
   def compute_mask(self, inputs, mask):
     if isinstance(mask, list):
@@ -444,6 +443,7 @@ class RNN(Layer):
     else:
       return output_mask
 
+  @shape_type_conversion
   def build(self, input_shape):
     # Note input_shape will be list of shapes of initial states and
     # constants if these are passed in __call__.
@@ -454,7 +454,6 @@ class RNN(Layer):
 
     if isinstance(input_shape, list):
       input_shape = input_shape[0]
-    input_shape = tuple(tensor_shape.TensorShape(input_shape).as_list())
 
     batch_size = input_shape[0] if self.stateful else None
     input_dim = input_shape[-1]
@@ -478,9 +477,9 @@ class RNN(Layer):
       # initial_state was passed in call, check compatibility
       if [spec.shape[-1] for spec in self.state_spec] != state_size:
         raise ValueError(
-            'An initial_state was passed that is not compatible with '
+            'An `initial_state` was passed that is not compatible with '
             '`cell.state_size`. Received `state_spec`={}; '
-            'However `cell.state_size` is '
+            'however `cell.state_size` is '
             '{}'.format(self.state_spec, self.cell.state_size))
     else:
       self.state_spec = [InputSpec(shape=(None, dim)) for dim in state_size]
@@ -610,7 +609,8 @@ class RNN(Layer):
         constants=constants,
         go_backwards=self.go_backwards,
         mask=mask,
-        unroll=self.unroll)
+        unroll=self.unroll,
+        input_length=timesteps)
     if self.stateful:
       updates = []
       for i in range(len(states)):
@@ -625,6 +625,8 @@ class RNN(Layer):
     # Properly set learning phase
     if getattr(last_output, '_uses_learning_phase', False):
       output._uses_learning_phase = True
+      for state in states:
+        state._uses_learning_phase = True
 
     if self.return_state:
       if not isinstance(states, (list, tuple)):
@@ -636,7 +638,7 @@ class RNN(Layer):
       return output
 
   def _standardize_args(self, inputs, initial_state, constants):
-    """Standardize `__call__` arguments to a single list of tensor inputs.
+    """Standardize `__call__` to a single list of tensor inputs.
 
     When running a model loaded from file, the input tensors
     `initial_state` and `constants` can be passed to `RNN.__call__` as part
@@ -688,7 +690,7 @@ class RNN(Layer):
                        'a `batch_input_shape` '
                        'argument to your first layer.\n'
                        '- If using the functional API, specify '
-                       'the time dimension by passing a '
+                       'the batch size by passing a '
                        '`batch_shape` argument to your Input layer.')
     # initialize state if None
     if self.states[0] is None:
@@ -788,37 +790,26 @@ class SimpleRNNCell(Layer):
 
   Arguments:
       units: Positive integer, dimensionality of the output space.
-      activation: Activation function to use
-          (see [activations](../activations.md)).
-          Default: hyperbolic tangent (`tanh`).
-          If you pass `None`, no activation is applied
+      activation: Activation function to use.
+          If you pass None, no activation is applied
           (ie. "linear" activation: `a(x) = x`).
       use_bias: Boolean, whether the layer uses a bias vector.
       kernel_initializer: Initializer for the `kernel` weights matrix,
           used for the linear transformation of the inputs.
-          (see [initializers](../initializers.md)).
       recurrent_initializer: Initializer for the `recurrent_kernel`
           weights matrix,
           used for the linear transformation of the recurrent state.
-          (see [initializers](../initializers.md)).
-      bias_initializer: Initializer for the bias vector
-          (see [initializers](../initializers.md)).
+      bias_initializer: Initializer for the bias vector.
       kernel_regularizer: Regularizer function applied to
-          the `kernel` weights matrix
-          (see [regularizer](../regularizers.md)).
+          the `kernel` weights matrix.
       recurrent_regularizer: Regularizer function applied to
-          the `recurrent_kernel` weights matrix
-          (see [regularizer](../regularizers.md)).
-      bias_regularizer: Regularizer function applied to the bias vector
-          (see [regularizer](../regularizers.md)).
+          the `recurrent_kernel` weights matrix.
+      bias_regularizer: Regularizer function applied to the bias vector.
       kernel_constraint: Constraint function applied to
-          the `kernel` weights matrix
-          (see [constraints](../constraints.md)).
+          the `kernel` weights matrix.
       recurrent_constraint: Constraint function applied to
-          the `recurrent_kernel` weights matrix
-          (see [constraints](../constraints.md)).
-      bias_constraint: Constraint function applied to the bias vector
-          (see [constraints](../constraints.md)).
+          the `recurrent_kernel` weights matrix.
+      bias_constraint: Constraint function applied to the bias vector.
       dropout: Float between 0 and 1.
           Fraction of the units to drop for
           the linear transformation of the inputs.
@@ -866,6 +857,7 @@ class SimpleRNNCell(Layer):
     self._dropout_mask = None
     self._recurrent_dropout_mask = None
 
+  @shape_type_conversion
   def build(self, input_shape):
     self.kernel = self.add_weight(
         shape=(input_shape[-1], self.units),
@@ -890,33 +882,21 @@ class SimpleRNNCell(Layer):
       self.bias = None
     self.built = True
 
-  def _generate_dropout_mask(self, inputs, training=None):
-    if 0 < self.dropout < 1:
-      ones = K.ones_like(K.squeeze(inputs[:, 0:1, :], axis=1))
-
-      def dropped_inputs():
-        return K.dropout(ones, self.dropout)
-
-      self._dropout_mask = K.in_train_phase(
-          dropped_inputs, ones, training=training)
-    else:
-      self._dropout_mask = None
-
-  def _generate_recurrent_dropout_mask(self, inputs, training=None):
-    if 0 < self.recurrent_dropout < 1:
-      ones = K.ones_like(K.reshape(inputs[:, 0, 0], (-1, 1)))
-      ones = K.tile(ones, (1, self.units))
-
-      def dropped_inputs():
-        return K.dropout(ones, self.dropout)
-
-      self._recurrent_dropout_mask = K.in_train_phase(
-          dropped_inputs, ones, training=training)
-    else:
-      self._recurrent_dropout_mask = None
-
   def call(self, inputs, states, training=None):
     prev_output = states[0]
+    if 0 < self.dropout < 1 and self._dropout_mask is None:
+      self._dropout_mask = _generate_dropout_mask(
+          _generate_dropout_ones(inputs,
+                                 K.shape(inputs)[-1]),
+          self.dropout,
+          training=training)
+    if (0 < self.recurrent_dropout < 1 and
+        self._recurrent_dropout_mask is None):
+      self._recurrent_dropout_mask = _generate_dropout_mask(
+          _generate_dropout_ones(inputs, self.units),
+          self.recurrent_dropout,
+          training=training)
+
     dp_mask = self._dropout_mask
     rec_dp_mask = self._recurrent_dropout_mask
 
@@ -939,46 +919,68 @@ class SimpleRNNCell(Layer):
         output._uses_learning_phase = True
     return output, [output]
 
+  def get_config(self):
+    config = {
+        'units':
+            self.units,
+        'activation':
+            activations.serialize(self.activation),
+        'use_bias':
+            self.use_bias,
+        'kernel_initializer':
+            initializers.serialize(self.kernel_initializer),
+        'recurrent_initializer':
+            initializers.serialize(self.recurrent_initializer),
+        'bias_initializer':
+            initializers.serialize(self.bias_initializer),
+        'kernel_regularizer':
+            regularizers.serialize(self.kernel_regularizer),
+        'recurrent_regularizer':
+            regularizers.serialize(self.recurrent_regularizer),
+        'bias_regularizer':
+            regularizers.serialize(self.bias_regularizer),
+        'kernel_constraint':
+            constraints.serialize(self.kernel_constraint),
+        'recurrent_constraint':
+            constraints.serialize(self.recurrent_constraint),
+        'bias_constraint':
+            constraints.serialize(self.bias_constraint),
+        'dropout':
+            self.dropout,
+        'recurrent_dropout':
+            self.recurrent_dropout
+    }
+    base_config = super(SimpleRNNCell, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
 
 class SimpleRNN(RNN):
   """Fully-connected RNN where the output is to be fed back to input.
 
   Arguments:
       units: Positive integer, dimensionality of the output space.
-      activation: Activation function to use
-          (see [activations](../activations.md)).
-          Default: hyperbolic tangent (`tanh`).
-          If you pass `None`, no activation is applied
+      activation: Activation function to use.
+          If you pass None, no activation is applied
           (ie. "linear" activation: `a(x) = x`).
       use_bias: Boolean, whether the layer uses a bias vector.
       kernel_initializer: Initializer for the `kernel` weights matrix,
           used for the linear transformation of the inputs.
-          (see [initializers](../initializers.md)).
       recurrent_initializer: Initializer for the `recurrent_kernel`
           weights matrix,
           used for the linear transformation of the recurrent state.
-          (see [initializers](../initializers.md)).
-      bias_initializer: Initializer for the bias vector
-          (see [initializers](../initializers.md)).
+      bias_initializer: Initializer for the bias vector.
       kernel_regularizer: Regularizer function applied to
-          the `kernel` weights matrix
-          (see [regularizer](../regularizers.md)).
+          the `kernel` weights matrix.
       recurrent_regularizer: Regularizer function applied to
-          the `recurrent_kernel` weights matrix
-          (see [regularizer](../regularizers.md)).
-      bias_regularizer: Regularizer function applied to the bias vector
-          (see [regularizer](../regularizers.md)).
+          the `recurrent_kernel` weights matrix.
+      bias_regularizer: Regularizer function applied to the bias vector.
       activity_regularizer: Regularizer function applied to
-          the output of the layer (its "activation").
-          (see [regularizer](../regularizers.md)).
+          the output of the layer (its "activation")..
       kernel_constraint: Constraint function applied to
-          the `kernel` weights matrix
-          (see [constraints](../constraints.md)).
+          the `kernel` weights matrix.
       recurrent_constraint: Constraint function applied to
-          the `recurrent_kernel` weights matrix
-          (see [constraints](../constraints.md)).
-      bias_constraint: Constraint function applied to the bias vector
-          (see [constraints](../constraints.md)).
+          the `recurrent_kernel` weights matrix.
+      bias_constraint: Constraint function applied to the bias vector.
       dropout: Float between 0 and 1.
           Fraction of the units to drop for
           the linear transformation of the inputs.
@@ -1052,12 +1054,12 @@ class SimpleRNN(RNN):
         go_backwards=go_backwards,
         stateful=stateful,
         unroll=unroll,
-        activity_regularizer=regularizers.get(activity_regularizer),
         **kwargs)
+    self.activity_regularizer = regularizers.get(activity_regularizer)
 
   def call(self, inputs, mask=None, training=None, initial_state=None):
-    self.cell._generate_dropout_mask(inputs, training=training)
-    self.cell._generate_recurrent_dropout_mask(inputs, training=training)
+    self.cell._dropout_mask = None
+    self.cell._recurrent_dropout_mask = None
     return super(SimpleRNN, self).call(
         inputs, mask=mask, training=training, initial_state=initial_state)
 
@@ -1119,25 +1121,36 @@ class SimpleRNN(RNN):
 
   def get_config(self):
     config = {
-        'units': self.units,
-        'activation': activations.serialize(self.activation),
-        'use_bias': self.use_bias,
-        'kernel_initializer': initializers.serialize(self.kernel_initializer),
+        'units':
+            self.units,
+        'activation':
+            activations.serialize(self.activation),
+        'use_bias':
+            self.use_bias,
+        'kernel_initializer':
+            initializers.serialize(self.kernel_initializer),
         'recurrent_initializer':
             initializers.serialize(self.recurrent_initializer),
-        'bias_initializer': initializers.serialize(self.bias_initializer),
-        'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
+        'bias_initializer':
+            initializers.serialize(self.bias_initializer),
+        'kernel_regularizer':
+            regularizers.serialize(self.kernel_regularizer),
         'recurrent_regularizer':
             regularizers.serialize(self.recurrent_regularizer),
-        'bias_regularizer': regularizers.serialize(self.bias_regularizer),
+        'bias_regularizer':
+            regularizers.serialize(self.bias_regularizer),
         'activity_regularizer':
             regularizers.serialize(self.activity_regularizer),
-        'kernel_constraint': constraints.serialize(self.kernel_constraint),
+        'kernel_constraint':
+            constraints.serialize(self.kernel_constraint),
         'recurrent_constraint':
             constraints.serialize(self.recurrent_constraint),
-        'bias_constraint': constraints.serialize(self.bias_constraint),
-        'dropout': self.dropout,
-        'recurrent_dropout': self.recurrent_dropout
+        'bias_constraint':
+            constraints.serialize(self.bias_constraint),
+        'dropout':
+            self.dropout,
+        'recurrent_dropout':
+            self.recurrent_dropout
     }
     base_config = super(SimpleRNN, self).get_config()
     del base_config['cell']
@@ -1155,43 +1168,28 @@ class GRUCell(Layer):
 
   Arguments:
       units: Positive integer, dimensionality of the output space.
-      activation: Activation function to use
-          (see [activations](../activations.md)).
-          Default: hyperbolic tangent (`tanh`).
-          If you pass `None`, no activation is applied
+      activation: Activation function to use.
+          If you pass None, no activation is applied
           (ie. "linear" activation: `a(x) = x`).
       recurrent_activation: Activation function to use
-          for the recurrent step
-          (see [activations](../activations.md)).
-          Default: hard sigmoid (`hard_sigmoid`).
-          If you pass `None`, no activation is applied
-          (ie. "linear" activation: `a(x) = x`).
+          for the recurrent step.
       use_bias: Boolean, whether the layer uses a bias vector.
       kernel_initializer: Initializer for the `kernel` weights matrix,
           used for the linear transformation of the inputs.
-          (see [initializers](../initializers.md)).
       recurrent_initializer: Initializer for the `recurrent_kernel`
           weights matrix,
           used for the linear transformation of the recurrent state.
-          (see [initializers](../initializers.md)).
-      bias_initializer: Initializer for the bias vector
-          (see [initializers](../initializers.md)).
+      bias_initializer: Initializer for the bias vector.
       kernel_regularizer: Regularizer function applied to
-          the `kernel` weights matrix
-          (see [regularizer](../regularizers.md)).
+          the `kernel` weights matrix.
       recurrent_regularizer: Regularizer function applied to
-          the `recurrent_kernel` weights matrix
-          (see [regularizer](../regularizers.md)).
-      bias_regularizer: Regularizer function applied to the bias vector
-          (see [regularizer](../regularizers.md)).
+          the `recurrent_kernel` weights matrix.
+      bias_regularizer: Regularizer function applied to the bias vector.
       kernel_constraint: Constraint function applied to
-          the `kernel` weights matrix
-          (see [constraints](../constraints.md)).
+          the `kernel` weights matrix.
       recurrent_constraint: Constraint function applied to
-          the `recurrent_kernel` weights matrix
-          (see [constraints](../constraints.md)).
-      bias_constraint: Constraint function applied to the bias vector
-          (see [constraints](../constraints.md)).
+          the `recurrent_kernel` weights matrix.
+      bias_constraint: Constraint function applied to the bias vector.
       dropout: Float between 0 and 1.
           Fraction of the units to drop for
           the linear transformation of the inputs.
@@ -1249,6 +1247,7 @@ class GRUCell(Layer):
     self._dropout_mask = None
     self._recurrent_dropout_mask = None
 
+  @shape_type_conversion
   def build(self, input_shape):
     input_dim = input_shape[-1]
     self.kernel = self.add_weight(
@@ -1292,38 +1291,24 @@ class GRUCell(Layer):
       self.bias_h = None
     self.built = True
 
-  def _generate_dropout_mask(self, inputs, training=None):
-    if 0 < self.dropout < 1:
-      ones = K.ones_like(K.squeeze(inputs[:, 0:1, :], axis=1))
-
-      def dropped_inputs():
-        return K.dropout(ones, self.dropout)
-
-      self._dropout_mask = [
-          K.in_train_phase(dropped_inputs, ones, training=training)
-          for _ in range(3)
-      ]
-    else:
-      self._dropout_mask = None
-
-  def _generate_recurrent_dropout_mask(self, inputs, training=None):
-    if 0 < self.recurrent_dropout < 1:
-      ones = K.ones_like(K.reshape(inputs[:, 0, 0], (-1, 1)))
-      ones = K.tile(ones, (1, self.units))
-
-      def dropped_inputs():
-        return K.dropout(ones, self.dropout)
-
-      self._recurrent_dropout_mask = [
-          K.in_train_phase(dropped_inputs, ones, training=training)
-          for _ in range(3)
-      ]
-    else:
-      self._recurrent_dropout_mask = None
-
   def call(self, inputs, states, training=None):
     h_tm1 = states[0]  # previous memory
 
+    if 0 < self.dropout < 1 and self._dropout_mask is None:
+      self._dropout_mask = _generate_dropout_mask(
+          _generate_dropout_ones(inputs,
+                                 K.shape(inputs)[-1]),
+          self.dropout,
+          training=training,
+          count=3)
+    if (0 < self.recurrent_dropout < 1 and
+        self._recurrent_dropout_mask is None):
+      self._recurrent_dropout_mask = _generate_dropout_mask(
+          _generate_dropout_ones(inputs, self.units),
+          self.recurrent_dropout,
+          training=training,
+          count=3)
+
     # dropout matrices for input units
     dp_mask = self._dropout_mask
     # dropout matrices for recurrent units
@@ -1387,55 +1372,76 @@ class GRUCell(Layer):
         h._uses_learning_phase = True
     return h, [h]
 
+  def get_config(self):
+    config = {
+        'units':
+            self.units,
+        'activation':
+            activations.serialize(self.activation),
+        'recurrent_activation':
+            activations.serialize(self.recurrent_activation),
+        'use_bias':
+            self.use_bias,
+        'kernel_initializer':
+            initializers.serialize(self.kernel_initializer),
+        'recurrent_initializer':
+            initializers.serialize(self.recurrent_initializer),
+        'bias_initializer':
+            initializers.serialize(self.bias_initializer),
+        'kernel_regularizer':
+            regularizers.serialize(self.kernel_regularizer),
+        'recurrent_regularizer':
+            regularizers.serialize(self.recurrent_regularizer),
+        'bias_regularizer':
+            regularizers.serialize(self.bias_regularizer),
+        'kernel_constraint':
+            constraints.serialize(self.kernel_constraint),
+        'recurrent_constraint':
+            constraints.serialize(self.recurrent_constraint),
+        'bias_constraint':
+            constraints.serialize(self.bias_constraint),
+        'dropout':
+            self.dropout,
+        'recurrent_dropout':
+            self.recurrent_dropout,
+        'implementation':
+            self.implementation
+    }
+    base_config = super(GRUCell, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
 
 class GRU(RNN):
-  # pylint: disable=line-too-long
   """Gated Recurrent Unit - Cho et al.
 
   2014.
 
   Arguments:
       units: Positive integer, dimensionality of the output space.
-      activation: Activation function to use
-          (see [activations](../activations.md)).
-          Default: hyperbolic tangent (`tanh`).
-          If you pass `None`, no activation is applied
+      activation: Activation function to use.
+          If you pass None, no activation is applied
           (ie. "linear" activation: `a(x) = x`).
       recurrent_activation: Activation function to use
-          for the recurrent step
-          (see [activations](../activations.md)).
-          Default: hard sigmoid (`hard_sigmoid`).
-          If you pass `None`, no activation is applied
-          (ie. "linear" activation: `a(x) = x`).
+          for the recurrent step.
       use_bias: Boolean, whether the layer uses a bias vector.
       kernel_initializer: Initializer for the `kernel` weights matrix,
           used for the linear transformation of the inputs.
-          (see [initializers](../initializers.md)).
       recurrent_initializer: Initializer for the `recurrent_kernel`
           weights matrix,
           used for the linear transformation of the recurrent state.
-          (see [initializers](../initializers.md)).
-      bias_initializer: Initializer for the bias vector
-          (see [initializers](../initializers.md)).
+      bias_initializer: Initializer for the bias vector.
       kernel_regularizer: Regularizer function applied to
-          the `kernel` weights matrix
-          (see [regularizer](../regularizers.md)).
+          the `kernel` weights matrix.
       recurrent_regularizer: Regularizer function applied to
-          the `recurrent_kernel` weights matrix
-          (see [regularizer](../regularizers.md)).
-      bias_regularizer: Regularizer function applied to the bias vector
-          (see [regularizer](../regularizers.md)).
+          the `recurrent_kernel` weights matrix.
+      bias_regularizer: Regularizer function applied to the bias vector.
       activity_regularizer: Regularizer function applied to
-          the output of the layer (its "activation").
-          (see [regularizer](../regularizers.md)).
+          the output of the layer (its "activation")..
       kernel_constraint: Constraint function applied to
-          the `kernel` weights matrix
-          (see [constraints](../constraints.md)).
+          the `kernel` weights matrix.
       recurrent_constraint: Constraint function applied to
-          the `recurrent_kernel` weights matrix
-          (see [constraints](../constraints.md)).
-      bias_constraint: Constraint function applied to the bias vector
-          (see [constraints](../constraints.md)).
+          the `recurrent_kernel` weights matrix.
+      bias_constraint: Constraint function applied to the bias vector.
       dropout: Float between 0 and 1.
           Fraction of the units to drop for
           the linear transformation of the inputs.
@@ -1465,12 +1471,7 @@ class GRU(RNN):
           although it tends to be more memory-intensive.
           Unrolling is only suitable for short sequences.
 
-  References:
-      - [On the Properties of Neural Machine Translation: Encoder-Decoder Approaches](https://arxiv.org/abs/1409.1259)
-      - [Empirical Evaluation of Gated Recurrent Neural Networks on Sequence Modeling](http://arxiv.org/abs/1412.3555v1)
-      - [A Theoretically Grounded Application of Dropout in Recurrent Neural Networks](http://arxiv.org/abs/1512.05287)
   """
-  # pylint: enable=line-too-long
 
   def __init__(self,
                units,
@@ -1528,8 +1529,8 @@ class GRU(RNN):
     self.activity_regularizer = regularizers.get(activity_regularizer)
 
   def call(self, inputs, mask=None, training=None, initial_state=None):
-    self.cell._generate_dropout_mask(inputs, training=training)
-    self.cell._generate_recurrent_dropout_mask(inputs, training=training)
+    self.cell._dropout_mask = None
+    self.cell._recurrent_dropout_mask = None
     return super(GRU, self).call(
         inputs, mask=mask, training=training, initial_state=initial_state)
 
@@ -1599,28 +1600,40 @@ class GRU(RNN):
 
   def get_config(self):
     config = {
-        'units': self.units,
-        'activation': activations.serialize(self.activation),
+        'units':
+            self.units,
+        'activation':
+            activations.serialize(self.activation),
         'recurrent_activation':
             activations.serialize(self.recurrent_activation),
-        'use_bias': self.use_bias,
-        'kernel_initializer': initializers.serialize(self.kernel_initializer),
+        'use_bias':
+            self.use_bias,
+        'kernel_initializer':
+            initializers.serialize(self.kernel_initializer),
         'recurrent_initializer':
             initializers.serialize(self.recurrent_initializer),
-        'bias_initializer': initializers.serialize(self.bias_initializer),
-        'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
+        'bias_initializer':
+            initializers.serialize(self.bias_initializer),
+        'kernel_regularizer':
+            regularizers.serialize(self.kernel_regularizer),
         'recurrent_regularizer':
             regularizers.serialize(self.recurrent_regularizer),
-        'bias_regularizer': regularizers.serialize(self.bias_regularizer),
+        'bias_regularizer':
+            regularizers.serialize(self.bias_regularizer),
         'activity_regularizer':
             regularizers.serialize(self.activity_regularizer),
-        'kernel_constraint': constraints.serialize(self.kernel_constraint),
+        'kernel_constraint':
+            constraints.serialize(self.kernel_constraint),
         'recurrent_constraint':
             constraints.serialize(self.recurrent_constraint),
-        'bias_constraint': constraints.serialize(self.bias_constraint),
-        'dropout': self.dropout,
-        'recurrent_dropout': self.recurrent_dropout,
-        'implementation': self.implementation
+        'bias_constraint':
+            constraints.serialize(self.bias_constraint),
+        'dropout':
+            self.dropout,
+        'recurrent_dropout':
+            self.recurrent_dropout,
+        'implementation':
+            self.implementation
     }
     base_config = super(GRU, self).get_config()
     del base_config['cell']
@@ -1638,48 +1651,33 @@ class LSTMCell(Layer):
 
   Arguments:
       units: Positive integer, dimensionality of the output space.
-      activation: Activation function to use
-          (see [activations](../activations.md)).
-          Default: hyperbolic tangent (`tanh`).
-          If you pass `None`, no activation is applied
+      activation: Activation function to use.
+          If you pass None, no activation is applied
           (ie. "linear" activation: `a(x) = x`).
       recurrent_activation: Activation function to use
-          for the recurrent step
-          (see [activations](../activations.md)).
-          Default: hard sigmoid (`hard_sigmoid`).
-          If you pass `None`, no activation is applied
-          (ie. "linear" activation: `a(x) = x`).
+          for the recurrent step.
       use_bias: Boolean, whether the layer uses a bias vector.
       kernel_initializer: Initializer for the `kernel` weights matrix,
           used for the linear transformation of the inputs.
-          (see [initializers](../initializers.md)).
       recurrent_initializer: Initializer for the `recurrent_kernel`
           weights matrix,
           used for the linear transformation of the recurrent state.
-          (see [initializers](../initializers.md)).
-      bias_initializer: Initializer for the bias vector
-          (see [initializers](../initializers.md)).
+      bias_initializer: Initializer for the bias vector.
       unit_forget_bias: Boolean.
           If True, add 1 to the bias of the forget gate at initialization.
           Setting it to true will also force `bias_initializer="zeros"`.
           This is recommended in [Jozefowicz et
             al.](http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf)
       kernel_regularizer: Regularizer function applied to
-          the `kernel` weights matrix
-          (see [regularizer](../regularizers.md)).
+          the `kernel` weights matrix.
       recurrent_regularizer: Regularizer function applied to
-          the `recurrent_kernel` weights matrix
-          (see [regularizer](../regularizers.md)).
-      bias_regularizer: Regularizer function applied to the bias vector
-          (see [regularizer](../regularizers.md)).
+          the `recurrent_kernel` weights matrix.
+      bias_regularizer: Regularizer function applied to the bias vector.
       kernel_constraint: Constraint function applied to
-          the `kernel` weights matrix
-          (see [constraints](../constraints.md)).
+          the `kernel` weights matrix.
       recurrent_constraint: Constraint function applied to
-          the `recurrent_kernel` weights matrix
-          (see [constraints](../constraints.md)).
-      bias_constraint: Constraint function applied to the bias vector
-          (see [constraints](../constraints.md)).
+          the `recurrent_kernel` weights matrix.
+      bias_constraint: Constraint function applied to the bias vector.
       dropout: Float between 0 and 1.
           Fraction of the units to drop for
           the linear transformation of the inputs.
@@ -1739,6 +1737,7 @@ class LSTMCell(Layer):
     self._dropout_mask = None
     self._recurrent_dropout_mask = None
 
+  @shape_type_conversion
   def build(self, input_shape):
     input_dim = input_shape[-1]
     self.kernel = self.add_weight(
@@ -1798,36 +1797,22 @@ class LSTMCell(Layer):
       self.bias_o = None
     self.built = True
 
-  def _generate_dropout_mask(self, inputs, training=None):
-    if 0 < self.dropout < 1:
-      ones = K.ones_like(K.squeeze(inputs[:, 0:1, :], axis=1))
-
-      def dropped_inputs():
-        return K.dropout(ones, self.dropout)
-
-      self._dropout_mask = [
-          K.in_train_phase(dropped_inputs, ones, training=training)
-          for _ in range(4)
-      ]
-    else:
-      self._dropout_mask = None
-
-  def _generate_recurrent_dropout_mask(self, inputs, training=None):
-    if 0 < self.recurrent_dropout < 1:
-      ones = K.ones_like(K.reshape(inputs[:, 0, 0], (-1, 1)))
-      ones = K.tile(ones, (1, self.units))
-
-      def dropped_inputs():
-        return K.dropout(ones, self.dropout)
-
-      self._recurrent_dropout_mask = [
-          K.in_train_phase(dropped_inputs, ones, training=training)
-          for _ in range(4)
-      ]
-    else:
-      self._recurrent_dropout_mask = None
-
   def call(self, inputs, states, training=None):
+    if 0 < self.dropout < 1 and self._dropout_mask is None:
+      self._dropout_mask = _generate_dropout_mask(
+          _generate_dropout_ones(inputs,
+                                 K.shape(inputs)[-1]),
+          self.dropout,
+          training=training,
+          count=4)
+    if (0 < self.recurrent_dropout < 1 and
+        self._recurrent_dropout_mask is None):
+      self._recurrent_dropout_mask = _generate_dropout_mask(
+          _generate_dropout_ones(inputs, self.units),
+          self.recurrent_dropout,
+          training=training,
+          count=4)
+
     # dropout matrices for input units
     dp_mask = self._dropout_mask
     # dropout matrices for recurrent units
@@ -1901,59 +1886,81 @@ class LSTMCell(Layer):
         h._uses_learning_phase = True
     return h, [h, c]
 
+  def get_config(self):
+    config = {
+        'units':
+            self.units,
+        'activation':
+            activations.serialize(self.activation),
+        'recurrent_activation':
+            activations.serialize(self.recurrent_activation),
+        'use_bias':
+            self.use_bias,
+        'kernel_initializer':
+            initializers.serialize(self.kernel_initializer),
+        'recurrent_initializer':
+            initializers.serialize(self.recurrent_initializer),
+        'bias_initializer':
+            initializers.serialize(self.bias_initializer),
+        'unit_forget_bias':
+            self.unit_forget_bias,
+        'kernel_regularizer':
+            regularizers.serialize(self.kernel_regularizer),
+        'recurrent_regularizer':
+            regularizers.serialize(self.recurrent_regularizer),
+        'bias_regularizer':
+            regularizers.serialize(self.bias_regularizer),
+        'kernel_constraint':
+            constraints.serialize(self.kernel_constraint),
+        'recurrent_constraint':
+            constraints.serialize(self.recurrent_constraint),
+        'bias_constraint':
+            constraints.serialize(self.bias_constraint),
+        'dropout':
+            self.dropout,
+        'recurrent_dropout':
+            self.recurrent_dropout,
+        'implementation':
+            self.implementation
+    }
+    base_config = super(LSTMCell, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
 
 class LSTM(RNN):
-  # pylint: disable=line-too-long
   """Long-Short Term Memory layer - Hochreiter 1997.
 
   Arguments:
       units: Positive integer, dimensionality of the output space.
-      activation: Activation function to use
-          (see [activations](../activations.md)).
-          Default: hyperbolic tangent (`tanh`).
-          If you pass `None`, no activation is applied
+      activation: Activation function to use.
+          If you pass None, no activation is applied
           (ie. "linear" activation: `a(x) = x`).
       recurrent_activation: Activation function to use
-          for the recurrent step
-          (see [activations](../activations.md)).
-          Default: hyperbolic tangent (`tanh`).
-          Default: hard sigmoid (`hard_sigmoid`).
-          If you pass `None`, no activation is applied
-          (ie. "linear" activation: `a(x) = x`).
+          for the recurrent step.
       use_bias: Boolean, whether the layer uses a bias vector.
       kernel_initializer: Initializer for the `kernel` weights matrix,
-          used for the linear transformation of the inputs.
-          (see [initializers](../initializers.md)).
+          used for the linear transformation of the inputs..
       recurrent_initializer: Initializer for the `recurrent_kernel`
           weights matrix,
-          used for the linear transformation of the recurrent state.
-          (see [initializers](../initializers.md)).
-      bias_initializer: Initializer for the bias vector
-          (see [initializers](../initializers.md)).
+          used for the linear transformation of the recurrent state..
+      bias_initializer: Initializer for the bias vector.
       unit_forget_bias: Boolean.
           If True, add 1 to the bias of the forget gate at initialization.
           Setting it to true will also force `bias_initializer="zeros"`.
           This is recommended in [Jozefowicz et
             al.](http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf)
       kernel_regularizer: Regularizer function applied to
-          the `kernel` weights matrix
-          (see [regularizer](../regularizers.md)).
+          the `kernel` weights matrix.
       recurrent_regularizer: Regularizer function applied to
-          the `recurrent_kernel` weights matrix
-          (see [regularizer](../regularizers.md)).
-      bias_regularizer: Regularizer function applied to the bias vector
-          (see [regularizer](../regularizers.md)).
+          the `recurrent_kernel` weights matrix.
+      bias_regularizer: Regularizer function applied to the bias vector.
       activity_regularizer: Regularizer function applied to
-          the output of the layer (its "activation").
-          (see [regularizer](../regularizers.md)).
+          the output of the layer (its "activation")..
       kernel_constraint: Constraint function applied to
-          the `kernel` weights matrix
-          (see [constraints](../constraints.md)).
+          the `kernel` weights matrix.
       recurrent_constraint: Constraint function applied to
-          the `recurrent_kernel` weights matrix
-          (see [constraints](../constraints.md)).
-      bias_constraint: Constraint function applied to the bias vector
-          (see [constraints](../constraints.md)).
+          the `recurrent_kernel` weights matrix.
+      bias_constraint: Constraint function applied to the bias vector.
       dropout: Float between 0 and 1.
           Fraction of the units to drop for
           the linear transformation of the inputs.
@@ -1983,13 +1990,7 @@ class LSTM(RNN):
           although it tends to be more memory-intensive.
           Unrolling is only suitable for short sequences.
 
-  References:
-      - [Long short-term memory](http://www.bioinf.jku.at/publications/older/2604.pdf)
-      - [Learning to forget: Continual prediction with LSTM](http://www.mitpressjournals.org/doi/pdf/10.1162/089976600300015015)
-      - [Supervised sequence labeling with recurrent neural networks](http://www.cs.toronto.edu/~graves/preprint.pdf)
-      - [A Theoretically Grounded Application of Dropout in Recurrent Neural Networks](http://arxiv.org/abs/1512.05287)
   """
-  # pylint: enable=line-too-long
 
   def __init__(self,
                units,
@@ -2049,8 +2050,8 @@ class LSTM(RNN):
     self.activity_regularizer = regularizers.get(activity_regularizer)
 
   def call(self, inputs, mask=None, training=None, initial_state=None):
-    self.cell._generate_dropout_mask(inputs, training=training)
-    self.cell._generate_recurrent_dropout_mask(inputs, training=training)
+    self.cell._dropout_mask = None
+    self.cell._recurrent_dropout_mask = None
     return super(LSTM, self).call(
         inputs, mask=mask, training=training, initial_state=initial_state)
 
@@ -2124,29 +2125,42 @@ class LSTM(RNN):
 
   def get_config(self):
     config = {
-        'units': self.units,
-        'activation': activations.serialize(self.activation),
+        'units':
+            self.units,
+        'activation':
+            activations.serialize(self.activation),
         'recurrent_activation':
             activations.serialize(self.recurrent_activation),
-        'use_bias': self.use_bias,
-        'kernel_initializer': initializers.serialize(self.kernel_initializer),
+        'use_bias':
+            self.use_bias,
+        'kernel_initializer':
+            initializers.serialize(self.kernel_initializer),
         'recurrent_initializer':
             initializers.serialize(self.recurrent_initializer),
-        'bias_initializer': initializers.serialize(self.bias_initializer),
-        'unit_forget_bias': self.unit_forget_bias,
-        'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
+        'bias_initializer':
+            initializers.serialize(self.bias_initializer),
+        'unit_forget_bias':
+            self.unit_forget_bias,
+        'kernel_regularizer':
+            regularizers.serialize(self.kernel_regularizer),
         'recurrent_regularizer':
             regularizers.serialize(self.recurrent_regularizer),
-        'bias_regularizer': regularizers.serialize(self.bias_regularizer),
+        'bias_regularizer':
+            regularizers.serialize(self.bias_regularizer),
         'activity_regularizer':
             regularizers.serialize(self.activity_regularizer),
-        'kernel_constraint': constraints.serialize(self.kernel_constraint),
+        'kernel_constraint':
+            constraints.serialize(self.kernel_constraint),
         'recurrent_constraint':
             constraints.serialize(self.recurrent_constraint),
-        'bias_constraint': constraints.serialize(self.bias_constraint),
-        'dropout': self.dropout,
-        'recurrent_dropout': self.recurrent_dropout,
-        'implementation': self.implementation
+        'bias_constraint':
+            constraints.serialize(self.bias_constraint),
+        'dropout':
+            self.dropout,
+        'recurrent_dropout':
+            self.recurrent_dropout,
+        'implementation':
+            self.implementation
     }
     base_config = super(LSTM, self).get_config()
     del base_config['cell']
@@ -2159,6 +2173,23 @@ class LSTM(RNN):
     return cls(**config)
 
 
+def _generate_dropout_ones(inputs, dims):
+  return K.ones((K.shape(inputs)[0], dims))
+
+
+def _generate_dropout_mask(ones, rate, training=None, count=1):
+
+  def dropped_inputs():
+    return K.dropout(ones, rate)
+
+  if count > 1:
+    return [
+        K.in_train_phase(dropped_inputs, ones, training=training)
+        for _ in range(count)
+    ]
+  return K.in_train_phase(dropped_inputs, ones, training=training)
+
+
 class Recurrent(Layer):
   """Deprecated abstract base class for recurrent layers.
 
@@ -2285,6 +2316,7 @@ class Recurrent(Layer):
     self.dropout = 0
     self.recurrent_dropout = 0
 
+  @shape_type_conversion
   def compute_output_shape(self, input_shape):
     if isinstance(input_shape, list):
       input_shape = input_shape[0]
diff --git a/tensorflow/python/keras/_impl/keras/layers/recurrent_test.py b/tensorflow/python/keras/_impl/keras/layers/recurrent_test.py
index 7dc4c1db9b4b71775bd3c52a863752b34d9dc3ea..a1407a24ea895976cbf95f0ea5c7ba98335af329 100644
--- a/tensorflow/python/keras/_impl/keras/layers/recurrent_test.py
+++ b/tensorflow/python/keras/_impl/keras/layers/recurrent_test.py
@@ -392,6 +392,105 @@ class RNNTest(test.TestCase):
     self.assertEqual(len(layer.trainable_weights), 3)
     self.assertEqual(len(layer.non_trainable_weights), 0)
 
+  def test_state_reuse_with_dropout(self):
+    layer_class = keras.layers.SimpleRNN
+    embedding_dim = 4
+    units = 3
+    timesteps = 2
+    num_samples = 2
+
+    with self.test_session():
+      input1 = keras.Input(batch_shape=(num_samples, timesteps, embedding_dim))
+      layer = layer_class(units,
+                          return_state=True,
+                          return_sequences=True,
+                          dropout=0.2)
+      state = layer(input1)[1:]
+
+      input2 = keras.Input(batch_shape=(num_samples, timesteps, embedding_dim))
+      output = layer_class(units)(input2, initial_state=state)
+      model = keras.Model([input1, input2], output)
+
+      inputs = [np.random.random((num_samples, timesteps, embedding_dim)),
+                np.random.random((num_samples, timesteps, embedding_dim))]
+      model.predict(inputs)
+
+  def test_builtin_rnn_cell_serialization(self):
+    for cell_class in [keras.layers.SimpleRNNCell,
+                       keras.layers.GRUCell,
+                       keras.layers.LSTMCell]:
+      with self.test_session():
+        # Test basic case.
+        x = keras.Input((None, 5))
+        cell = cell_class(32)
+        layer = keras.layers.RNN(cell)
+        y = layer(x)
+        model = keras.models.Model(x, y)
+        model.compile(optimizer='rmsprop', loss='mse')
+
+        # Test basic case serialization.
+        x_np = np.random.random((6, 5, 5))
+        y_np = model.predict(x_np)
+        weights = model.get_weights()
+        config = layer.get_config()
+        layer = keras.layers.RNN.from_config(config)
+        y = layer(x)
+        model = keras.models.Model(x, y)
+        model.set_weights(weights)
+        y_np_2 = model.predict(x_np)
+        self.assertAllClose(y_np, y_np_2, atol=1e-4)
+
+        # Test stacking.
+        cells = [cell_class(8),
+                 cell_class(12),
+                 cell_class(32)]
+        layer = keras.layers.RNN(cells)
+        y = layer(x)
+        model = keras.models.Model(x, y)
+        model.compile(optimizer='rmsprop', loss='mse')
+
+        # Test stacked RNN serialization.
+        x_np = np.random.random((6, 5, 5))
+        y_np = model.predict(x_np)
+        weights = model.get_weights()
+        config = layer.get_config()
+        layer = keras.layers.RNN.from_config(config)
+        y = layer(x)
+        model = keras.models.Model(x, y)
+        model.set_weights(weights)
+        y_np_2 = model.predict(x_np)
+        self.assertAllClose(y_np, y_np_2, atol=1e-4)
+
+  def test_stacked_rnn_dropout(self):
+    cells = [keras.layers.LSTMCell(3, dropout=0.1, recurrent_dropout=0.1),
+             keras.layers.LSTMCell(3, dropout=0.1, recurrent_dropout=0.1)]
+    layer = keras.layers.RNN(cells)
+
+    with self.test_session():
+      x = keras.Input((None, 5))
+      y = layer(x)
+      model = keras.models.Model(x, y)
+      model.compile('sgd', 'mse')
+      x_np = np.random.random((6, 5, 5))
+      y_np = np.random.random((6, 3))
+      model.train_on_batch(x_np, y_np)
+
+  def test_stacked_rnn_compute_output_shape(self):
+    cells = [keras.layers.LSTMCell(3),
+             keras.layers.LSTMCell(6)]
+    embedding_dim = 4
+    timesteps = 2
+    layer = keras.layers.RNN(cells, return_state=True, return_sequences=True)
+    output_shape = layer.compute_output_shape((None, timesteps, embedding_dim))
+    expected_output_shape = [(None, timesteps, 6),
+                             (None, 6),
+                             (None, 6),
+                             (None, 3),
+                             (None, 3)]
+    self.assertEqual(
+        [tuple(o.as_list()) for o in output_shape],
+        expected_output_shape)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/_impl/keras/layers/wrappers.py b/tensorflow/python/keras/_impl/keras/layers/wrappers.py
index 452801b65639be217ac26d3caa69f070c776634e..3667956f8049b8f5e9cc9ab7a9e0037c260a8ce1 100644
--- a/tensorflow/python/keras/_impl/keras/layers/wrappers.py
+++ b/tensorflow/python/keras/_impl/keras/layers/wrappers.py
@@ -25,6 +25,7 @@ from tensorflow.python.framework import tensor_shape
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.engine import InputSpec
 from tensorflow.python.keras._impl.keras.engine import Layer
+from tensorflow.python.keras._impl.keras.engine.topology import shape_type_conversion
 from tensorflow.python.keras._impl.keras.utils.generic_utils import has_arg
 from tensorflow.python.layers import utils as tf_layers_util
 
@@ -291,6 +292,7 @@ class Bidirectional(Wrapper):
       self.backward_layer.initial_weights = weights[nw // 2:]
     self.stateful = layer.stateful
     self.return_sequences = layer.return_sequences
+    self.return_state = layer.return_state
     self.supports_masking = True
 
   def get_weights(self):
@@ -301,27 +303,54 @@ class Bidirectional(Wrapper):
     self.forward_layer.set_weights(weights[:nw // 2])
     self.backward_layer.set_weights(weights[nw // 2:])
 
+  @shape_type_conversion
   def compute_output_shape(self, input_shape):
-    input_shape = tuple(tensor_shape.TensorShape(input_shape).as_list())
-    if self.merge_mode in ['sum', 'ave', 'mul']:
-      return self.forward_layer.compute_output_shape(input_shape)
-    elif self.merge_mode == 'concat':
-      shape = self.forward_layer.compute_output_shape(input_shape).as_list()
-      shape[-1] *= 2
-      return tensor_shape.TensorShape(shape)
+    output_shape = tuple(self.forward_layer.compute_output_shape(
+        input_shape).as_list())
+    if self.return_state:
+      state_shape = output_shape[1:]
+      output_shape = output_shape[0]
+
+    if self.merge_mode == 'concat':
+      output_shape = list(output_shape)
+      output_shape[-1] *= 2
+      output_shape = tuple(output_shape)
     elif self.merge_mode is None:
-      shape = self.forward_layer.compute_output_shape(input_shape)
-      return [shape, copy.copy(shape)]
+      output_shape = [output_shape, copy.copy(output_shape)]
 
-  def call(self, inputs, training=None, mask=None):
+    if self.return_state:
+      if self.merge_mode is None:
+        return output_shape + state_shape + copy.copy(state_shape)
+      return [output_shape] + state_shape + copy.copy(state_shape)
+    return output_shape
+
+  def call(self, inputs, training=None, mask=None, initial_state=None):
     kwargs = {}
     if has_arg(self.layer.call, 'training'):
       kwargs['training'] = training
     if has_arg(self.layer.call, 'mask'):
       kwargs['mask'] = mask
 
-    y = self.forward_layer.call(inputs, **kwargs)
-    y_rev = self.backward_layer.call(inputs, **kwargs)
+    if initial_state is not None and has_arg(self.layer.call, 'initial_state'):
+      if not isinstance(initial_state, list):
+        raise ValueError(
+            'When passing `initial_state` to a Bidirectional RNN, the state '
+            'should be a list containing the states of the underlying RNNs. '
+            'Found: ' + str(initial_state))
+      forward_state = initial_state[:len(initial_state) // 2]
+      backward_state = initial_state[len(initial_state) // 2:]
+      y = self.forward_layer.call(inputs, initial_state=forward_state, **kwargs)
+      y_rev = self.backward_layer.call(
+          inputs, initial_state=backward_state, **kwargs)
+    else:
+      y = self.forward_layer.call(inputs, **kwargs)
+      y_rev = self.backward_layer.call(inputs, **kwargs)
+
+    if self.return_state:
+      states = y[1:] + y_rev[1:]
+      y = y[0]
+      y_rev = y_rev[0]
+
     if self.return_sequences:
       y_rev = K.reverse(y_rev, 1)
     if self.merge_mode == 'concat':
@@ -343,6 +372,11 @@ class Bidirectional(Wrapper):
           out._uses_learning_phase = True
       else:
         output._uses_learning_phase = True
+
+    if self.return_state:
+      if self.merge_mode is None:
+        return output + states
+      return [output] + states
     return output
 
   def reset_states(self):
diff --git a/tensorflow/python/keras/_impl/keras/layers/wrappers_test.py b/tensorflow/python/keras/_impl/keras/layers/wrappers_test.py
index 0866c4b0aeddc91ba6eeca6395875b4f2574dbc0..f48c8919a148403874758b618aaa9a662e511240 100644
--- a/tensorflow/python/keras/_impl/keras/layers/wrappers_test.py
+++ b/tensorflow/python/keras/_impl/keras/layers/wrappers_test.py
@@ -238,6 +238,131 @@ class BidirectionalTest(test.TestCase):
       model.compile(loss='mse', optimizer='sgd')
       model.fit(x, y, epochs=1, batch_size=1)
 
+  def test_Bidirectional_merged_value(self):
+    rnn = keras.layers.LSTM
+    samples = 2
+    dim = 5
+    timesteps = 3
+    units = 3
+    x = [np.random.rand(samples, timesteps, dim)]
+
+    with self.test_session():
+      for merge_mode in ['sum', 'mul', 'ave', 'concat', None]:
+        if merge_mode == 'sum':
+          merge_func = lambda y, y_rev: y + y_rev
+        elif merge_mode == 'mul':
+          merge_func = lambda y, y_rev: y * y_rev
+        elif merge_mode == 'ave':
+          merge_func = lambda y, y_rev: (y + y_rev) / 2
+        elif merge_mode == 'concat':
+          merge_func = lambda y, y_rev: np.concatenate((y, y_rev), axis=-1)
+        else:
+          merge_func = lambda y, y_rev: [y, y_rev]
+
+        # basic case
+        inputs = keras.Input((timesteps, dim))
+        layer = keras.layers.Bidirectional(
+            rnn(units, return_sequences=True), merge_mode=merge_mode)
+        f_merged = keras.backend.function([inputs], _to_list(layer(inputs)))
+        f_forward = keras.backend.function([inputs],
+                                           [layer.forward_layer.call(inputs)])
+        f_backward = keras.backend.function(
+            [inputs],
+            [keras.backend.reverse(layer.backward_layer.call(inputs), 1)])
+
+        y_merged = f_merged(x)
+        y_expected = _to_list(merge_func(f_forward(x)[0], f_backward(x)[0]))
+        assert len(y_merged) == len(y_expected)
+        for x1, x2 in zip(y_merged, y_expected):
+          self.assertAllClose(x1, x2, atol=1e-5)
+
+        # test return_state
+        inputs = keras.Input((timesteps, dim))
+        layer = keras.layers.Bidirectional(
+            rnn(units, return_state=True), merge_mode=merge_mode)
+        f_merged = keras.backend.function([inputs], layer(inputs))
+        f_forward = keras.backend.function([inputs],
+                                           layer.forward_layer.call(inputs))
+        f_backward = keras.backend.function([inputs],
+                                            layer.backward_layer.call(inputs))
+        n_states = len(layer.layer.states)
+
+        y_merged = f_merged(x)
+        y_forward = f_forward(x)
+        y_backward = f_backward(x)
+        y_expected = _to_list(merge_func(y_forward[0], y_backward[0]))
+        assert len(y_merged) == len(y_expected) + n_states * 2
+        for x1, x2 in zip(y_merged, y_expected):
+          self.assertAllClose(x1, x2, atol=1e-5)
+
+        y_merged = y_merged[-n_states * 2:]
+        y_forward = y_forward[-n_states:]
+        y_backward = y_backward[-n_states:]
+        for state_birnn, state_inner in zip(y_merged, y_forward + y_backward):
+          self.assertAllClose(state_birnn, state_inner, atol=1e-5)
+
+  def test_Bidirectional_dropout(self):
+    rnn = keras.layers.LSTM
+    samples = 2
+    dim = 5
+    timesteps = 3
+    units = 3
+    merge_mode = 'sum'
+    x = [np.random.rand(samples, timesteps, dim)]
+
+    with self.test_session():
+      inputs = keras.Input((timesteps, dim))
+      wrapped = keras.layers.Bidirectional(
+          rnn(units, dropout=0.2, recurrent_dropout=0.2), merge_mode=merge_mode)
+      outputs = _to_list(wrapped(inputs, training=True))
+      assert all(not getattr(x, '_uses_learning_phase') for x in outputs)
+
+      inputs = keras.Input((timesteps, dim))
+      wrapped = keras.layers.Bidirectional(
+          rnn(units, dropout=0.2, return_state=True), merge_mode=merge_mode)
+      outputs = _to_list(wrapped(inputs))
+      assert all(x._uses_learning_phase for x in outputs)
+
+      model = keras.Model(inputs, outputs)
+      assert model.uses_learning_phase
+      y1 = _to_list(model.predict(x))
+      y2 = _to_list(model.predict(x))
+      for x1, x2 in zip(y1, y2):
+        self.assertAllClose(x1, x2, atol=1e-5)
+
+  def test_Bidirectional_state_reuse(self):
+    rnn = keras.layers.LSTM
+    samples = 2
+    dim = 5
+    timesteps = 3
+    units = 3
+
+    with self.test_session():
+      inputs = keras.Input((timesteps, dim))
+      layer = keras.layers.Bidirectional(
+          rnn(units, return_state=True, return_sequences=True))
+      outputs = layer(inputs)
+      output, state = outputs[0], outputs[1:]
+
+      # test passing invalid initial_state: passing a tensor
+      with self.assertRaises(ValueError):
+        output = keras.layers.Bidirectional(
+            rnn(units))(output, initial_state=state[0])
+
+      # test valid usage: passing a list
+      output = keras.layers.Bidirectional(
+          rnn(units))(output, initial_state=state)
+      model = keras.Model(inputs, output)
+      inputs = np.random.rand(samples, timesteps, dim)
+      outputs = model.predict(inputs)
+
+
+def _to_list(ls):
+  if isinstance(ls, list):
+    return ls
+  else:
+    return [ls]
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/_impl/keras/losses.py b/tensorflow/python/keras/_impl/keras/losses.py
index 1d6319abb13619932fe76966a69004dcfcd0e022..fe0ef54360608db5a7ae95b1ec89e32640db0931 100644
--- a/tensorflow/python/keras/_impl/keras/losses.py
+++ b/tensorflow/python/keras/_impl/keras/losses.py
@@ -12,7 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Built-in Keras loss functions.
+# pylint: disable=unused-import
+"""Built-in loss functions.
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -34,7 +35,6 @@ def mean_absolute_error(y_true, y_pred):
 
 
 def mean_absolute_percentage_error(y_true, y_pred):
-  # Equivalent to MAE, but sometimes easier to interpret.
   diff = K.abs((y_true - y_pred) / K.clip(K.abs(y_true), K.epsilon(), None))
   return 100. * K.mean(diff, axis=-1)
 
@@ -56,10 +56,24 @@ def hinge(y_true, y_pred):
 def categorical_hinge(y_true, y_pred):
   pos = K.sum(y_true * y_pred, axis=-1)
   neg = K.max((1. - y_true) * y_pred, axis=-1)
-  return K.maximum(neg - pos + 1., 0.)
+  return K.maximum(0., neg - pos + 1.)
 
 
 def logcosh(y_true, y_pred):
+  """Logarithm of the hyperbolic cosine of the prediction error.
+
+  `log(cosh(x))` is approximately equal to `(x ** 2) / 2` for small `x` and
+  to `abs(x) - log(2)` for large `x`. This means that 'logcosh' works mostly
+  like the mean squared error, but will not be so strongly affected by the
+  occasional wildly incorrect prediction.
+
+  Arguments:
+      y_true: tensor of true targets.
+      y_pred: tensor of predicted targets.
+
+  Returns:
+      Tensor with one scalar loss entry per sample.
+  """
 
   def _logcosh(x):
     return x + K.softplus(-2. * x) - K.log(2.)
diff --git a/tensorflow/python/keras/_impl/keras/metrics.py b/tensorflow/python/keras/_impl/keras/metrics.py
index 202048f26d2ad201b4762d3b2b32638f9d041e88..3c18e6826076b553136f5457f59e8bfe081bdb40 100644
--- a/tensorflow/python/keras/_impl/keras/metrics.py
+++ b/tensorflow/python/keras/_impl/keras/metrics.py
@@ -12,7 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Built-in Keras metrics functions.
+# pylint: disable=unused-import
+"""Built-in metrics.
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -21,7 +22,6 @@ from __future__ import print_function
 import six
 
 from tensorflow.python.keras._impl.keras import backend as K
-# pylint: disable=unused-import
 from tensorflow.python.keras._impl.keras.losses import binary_crossentropy
 from tensorflow.python.keras._impl.keras.losses import categorical_crossentropy
 from tensorflow.python.keras._impl.keras.losses import cosine_proximity
@@ -35,7 +35,6 @@ from tensorflow.python.keras._impl.keras.losses import mean_squared_logarithmic_
 from tensorflow.python.keras._impl.keras.losses import poisson
 from tensorflow.python.keras._impl.keras.losses import sparse_categorical_crossentropy
 from tensorflow.python.keras._impl.keras.losses import squared_hinge
-# pylint: disable=unused-import
 from tensorflow.python.keras._impl.keras.utils.generic_utils import deserialize_keras_object
 
 
@@ -60,8 +59,8 @@ def top_k_categorical_accuracy(y_true, y_pred, k=5):
 
 
 def sparse_top_k_categorical_accuracy(y_true, y_pred, k=5):
-  return K.mean(K.in_top_k(y_pred,
-                           K.cast(K.max(y_true, axis=-1), 'int32'), k), axis=-1)
+  return K.mean(
+      K.in_top_k(y_pred, K.cast(K.max(y_true, axis=-1), 'int32'), k), axis=-1)
 
 
 # Aliases
diff --git a/tensorflow/python/keras/_impl/keras/models.py b/tensorflow/python/keras/_impl/keras/models.py
index e262cc8c8e9d728c1e7f504ffaf543faa1f3db50..9cd547200d89184bb7c2aad28b59d2793f093205 100644
--- a/tensorflow/python/keras/_impl/keras/models.py
+++ b/tensorflow/python/keras/_impl/keras/models.py
@@ -492,13 +492,13 @@ class Sequential(Model):
         # to the input layer we just created.
         layer(x)
 
-      if len(layer.inbound_nodes[-1].output_tensors) != 1:
+      if len(layer._inbound_nodes[-1].output_tensors) != 1:
         raise ValueError('All layers in a Sequential model '
                          'should have a single output tensor. '
                          'For multi-output layers, '
                          'use the functional API.')
 
-      self.outputs = [layer.inbound_nodes[-1].output_tensors[0]]
+      self.outputs = [layer._inbound_nodes[-1].output_tensors[0]]
       self.inputs = topology.get_source_inputs(self.outputs[0])
 
       # We create an input node, which we will keep updated
diff --git a/tensorflow/python/keras/_impl/keras/models_test.py b/tensorflow/python/keras/_impl/keras/models_test.py
index edfc0ce0ebc0321589a452e7357c517feeb626cf..04017e4b28b27e52f88a7746fc44510c29edffce 100644
--- a/tensorflow/python/keras/_impl/keras/models_test.py
+++ b/tensorflow/python/keras/_impl/keras/models_test.py
@@ -340,6 +340,35 @@ class TestSequential(test.TestCase):
     inner_model.trainable = True
     self.assertEqual(len(model.trainable_weights), 4)
 
+  def test_sequential_update_disabling(self):
+    val_a = np.random.random((10, 4))
+    val_out = np.random.random((10, 4))
+
+    with self.test_session():
+      model = keras.models.Sequential()
+      model.add(keras.layers.BatchNormalization(input_shape=(4,)))
+
+      model.trainable = False
+      assert not model.updates
+
+      model.compile('sgd', 'mse')
+      assert not model.updates
+      assert not model.model.updates
+
+      x1 = model.predict(val_a)
+      model.train_on_batch(val_a, val_out)
+      x2 = model.predict(val_a)
+      self.assertAllClose(x1, x2, atol=1e-7)
+
+      model.trainable = True
+      model.compile('sgd', 'mse')
+      assert model.updates
+      assert model.model.updates
+
+      model.train_on_batch(val_a, val_out)
+      x2 = model.predict(val_a)
+      assert np.abs(np.sum(x1 - x2)) > 1e-5
+
 
 class TestModelCloning(test.TestCase):
 
diff --git a/tensorflow/python/keras/_impl/keras/optimizers.py b/tensorflow/python/keras/_impl/keras/optimizers.py
index a08073fa86442e0564aa63052bb87b92dc64cdf6..a55a5e39a69c4286f0a002474b5ad543c04bf256 100644
--- a/tensorflow/python/keras/_impl/keras/optimizers.py
+++ b/tensorflow/python/keras/_impl/keras/optimizers.py
@@ -12,7 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Keras optimizer classes (will eventually be replaced with core optimizers).
+# pylint: disable=invalid-name
+"""Built-in optimizer classes.
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -23,6 +24,7 @@ import copy
 import six
 from six.moves import zip  # pylint: disable=redefined-builtin
 
+from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes as dtypes_module
 from tensorflow.python.framework import ops
 from tensorflow.python.keras._impl.keras import backend as K
@@ -121,9 +123,9 @@ class Optimizer(object):
     param_values = K.batch_get_value(params)
     for pv, p, w in zip(param_values, params, weights):
       if pv.shape != w.shape:
-        raise ValueError('Optimizer weight shape ' + str(pv.shape) +
-                         ' not compatible with '
-                         'provided weight shape ' + str(w.shape))
+        raise ValueError(
+            'Optimizer weight shape ' + str(pv.shape) + ' not compatible with '
+            'provided weight shape ' + str(w.shape))
       weight_value_tuples.append((p, w))
     K.batch_set_value(weight_value_tuples)
 
@@ -156,7 +158,8 @@ class SGD(Optimizer):
 
   Arguments:
       lr: float >= 0. Learning rate.
-      momentum: float >= 0. Parameter updates momentum.
+      momentum: float >= 0. Parameter that accelerates SGD
+          in the relevant direction and dampens oscillations.
       decay: float >= 0. Learning rate decay over each update.
       nesterov: boolean. Whether to apply Nesterov momentum.
   """
@@ -177,9 +180,8 @@ class SGD(Optimizer):
 
     lr = self.lr
     if self.initial_decay > 0:
-      lr *= (1. / (1. + self.decay * K.cast(self.iterations,
-                                            K.dtype(self.decay))))
-
+      lr *= (1. /
+             (1. + self.decay * K.cast(self.iterations, K.dtype(self.decay))))
     # momentum
     shapes = [K.int_shape(p) for p in params]
     moments = [K.zeros(shape) for shape in shapes]
@@ -224,32 +226,33 @@ class RMSprop(Optimizer):
   Arguments:
       lr: float >= 0. Learning rate.
       rho: float >= 0.
-      epsilon: float >= 0. Fuzz factor.
+      epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`.
       decay: float >= 0. Learning rate decay over each update.
+
   """
 
-  def __init__(self, lr=0.001, rho=0.9, epsilon=1e-8, decay=0., **kwargs):
+  def __init__(self, lr=0.001, rho=0.9, epsilon=None, decay=0., **kwargs):
     super(RMSprop, self).__init__(**kwargs)
     with K.name_scope(self.__class__.__name__):
       self.lr = K.variable(lr, name='lr')
       self.rho = K.variable(rho, name='rho')
       self.decay = K.variable(decay, name='decay')
       self.iterations = K.variable(0, dtype='int64', name='iterations')
+    if epsilon is None:
+      epsilon = K.epsilon()
     self.epsilon = epsilon
     self.initial_decay = decay
 
   def get_updates(self, loss, params):
     grads = self.get_gradients(loss, params)
-    accumulators = [
-        K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params
-    ]
+    accumulators = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
     self.weights = accumulators
     self.updates = [K.update_add(self.iterations, 1)]
 
     lr = self.lr
     if self.initial_decay > 0:
-      lr *= (1. / (1. + self.decay * K.cast(self.iterations,
-                                            K.dtype(self.decay))))
+      lr *= (1. /
+             (1. + self.decay * K.cast(self.iterations, K.dtype(self.decay))))
 
     for p, g, a in zip(params, grads, accumulators):
       # update accumulator
@@ -283,20 +286,19 @@ class Adagrad(Optimizer):
 
   Arguments:
       lr: float >= 0. Learning rate.
-      epsilon: float >= 0.
+      epsilon: float >= 0. If `None`, defaults to `K.epsilon()`.
       decay: float >= 0. Learning rate decay over each update.
 
-  References:
-      - [Adaptive Subgradient Methods for Online Learning and Stochastic
-        Optimization](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)
   """
 
-  def __init__(self, lr=0.01, epsilon=1e-8, decay=0., **kwargs):
+  def __init__(self, lr=0.01, epsilon=None, decay=0., **kwargs):
     super(Adagrad, self).__init__(**kwargs)
     with K.name_scope(self.__class__.__name__):
       self.lr = K.variable(lr, name='lr')
       self.decay = K.variable(decay, name='decay')
       self.iterations = K.variable(0, dtype='int64', name='iterations')
+    if epsilon is None:
+      epsilon = K.epsilon()
     self.epsilon = epsilon
     self.initial_decay = decay
 
@@ -309,8 +311,8 @@ class Adagrad(Optimizer):
 
     lr = self.lr
     if self.initial_decay > 0:
-      lr *= (1. / (1. + self.decay * K.cast(self.iterations,
-                                            K.dtype(self.decay))))
+      lr *= (1. /
+             (1. + self.decay * K.cast(self.iterations, K.dtype(self.decay))))
 
     for p, g, a in zip(params, grads, accumulators):
       new_a = a + K.square(g)  # update accumulator
@@ -344,20 +346,19 @@ class Adadelta(Optimizer):
       lr: float >= 0. Learning rate.
           It is recommended to leave it at the default value.
       rho: float >= 0.
-      epsilon: float >= 0. Fuzz factor.
+      epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`.
       decay: float >= 0. Learning rate decay over each update.
 
-  References:
-      - [Adadelta - an adaptive learning rate
-        method](http://arxiv.org/abs/1212.5701)
   """
 
-  def __init__(self, lr=1.0, rho=0.95, epsilon=1e-8, decay=0., **kwargs):
+  def __init__(self, lr=1.0, rho=0.95, epsilon=None, decay=0., **kwargs):
     super(Adadelta, self).__init__(**kwargs)
     with K.name_scope(self.__class__.__name__):
       self.lr = K.variable(lr, name='lr')
       self.decay = K.variable(decay, name='decay')
       self.iterations = K.variable(0, dtype='int64', name='iterations')
+    if epsilon is None:
+      epsilon = K.epsilon()
     self.rho = rho
     self.epsilon = epsilon
     self.initial_decay = decay
@@ -372,8 +373,8 @@ class Adadelta(Optimizer):
 
     lr = self.lr
     if self.initial_decay > 0:
-      lr *= (1. / (1. + self.decay * K.cast(self.iterations,
-                                            K.dtype(self.decay))))
+      lr *= (1. /
+             (1. + self.decay * K.cast(self.iterations, K.dtype(self.decay))))
 
     for p, g, a, d_a in zip(params, grads, accumulators, delta_accumulators):
       # update accumulator
@@ -415,20 +416,21 @@ class Adam(Optimizer):
       lr: float >= 0. Learning rate.
       beta_1: float, 0 < beta < 1. Generally close to 1.
       beta_2: float, 0 < beta < 1. Generally close to 1.
-      epsilon: float >= 0. Fuzz factor.
+      epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`.
       decay: float >= 0. Learning rate decay over each update.
+      amsgrad: boolean. Whether to apply the AMSGrad variant of this
+          algorithm from the paper "On the Convergence of Adam and
+          Beyond".
 
-  References:
-      - [Adam - A Method for Stochastic
-        Optimization](http://arxiv.org/abs/1412.6980v8)
   """
 
   def __init__(self,
                lr=0.001,
                beta_1=0.9,
                beta_2=0.999,
-               epsilon=1e-8,
+               epsilon=None,
                decay=0.,
+               amsgrad=False,
                **kwargs):
     super(Adam, self).__init__(**kwargs)
     with K.name_scope(self.__class__.__name__):
@@ -437,8 +439,11 @@ class Adam(Optimizer):
       self.beta_1 = K.variable(beta_1, name='beta_1')
       self.beta_2 = K.variable(beta_2, name='beta_2')
       self.decay = K.variable(decay, name='decay')
+    if epsilon is None:
+      epsilon = K.epsilon()
     self.epsilon = epsilon
     self.initial_decay = decay
+    self.amsgrad = amsgrad
 
   def get_updates(self, loss, params):
     grads = self.get_gradients(loss, params)
@@ -446,21 +451,30 @@ class Adam(Optimizer):
 
     lr = self.lr
     if self.initial_decay > 0:
-      lr *= (1. / (1. + self.decay * K.cast(self.iterations,
-                                            K.dtype(self.decay))))
+      lr *= (1. /
+             (1. + self.decay * K.cast(self.iterations, K.dtype(self.decay))))
 
     t = K.cast(self.iterations, K.floatx()) + 1
-    lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
-                 (1. - K.pow(self.beta_1, t)))
+    lr_t = lr * (
+        K.sqrt(1. - K.pow(self.beta_2, t)) / (1. - K.pow(self.beta_1, t)))
 
     ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
     vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
-    self.weights = [self.iterations] + ms + vs
+    if self.amsgrad:
+      vhats = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
+    else:
+      vhats = [K.zeros(1) for _ in params]
+    self.weights = [self.iterations] + ms + vs + vhats
 
-    for p, g, m, v in zip(params, grads, ms, vs):
+    for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
       m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
       v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
-      p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)
+      if self.amsgrad:
+        vhat_t = K.maximum(vhat, v_t)
+        p_t = p - lr_t * m_t / (K.sqrt(vhat_t) + self.epsilon)
+        self.updates.append(K.update(vhat, vhat_t))
+      else:
+        p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)
 
       self.updates.append(K.update(m, m_t))
       self.updates.append(K.update(v, v_t))
@@ -479,7 +493,8 @@ class Adam(Optimizer):
         'beta_1': float(K.get_value(self.beta_1)),
         'beta_2': float(K.get_value(self.beta_2)),
         'decay': float(K.get_value(self.decay)),
-        'epsilon': self.epsilon
+        'epsilon': self.epsilon,
+        'amsgrad': self.amsgrad
     }
     base_config = super(Adam, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
@@ -494,19 +509,16 @@ class Adamax(Optimizer):
   Arguments:
       lr: float >= 0. Learning rate.
       beta_1/beta_2: floats, 0 < beta < 1. Generally close to 1.
-      epsilon: float >= 0. Fuzz factor.
+      epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`.
       decay: float >= 0. Learning rate decay over each update.
 
-  References:
-      - [Adam - A Method for Stochastic
-        Optimization](http://arxiv.org/abs/1412.6980v8)
   """
 
   def __init__(self,
                lr=0.002,
                beta_1=0.9,
                beta_2=0.999,
-               epsilon=1e-8,
+               epsilon=None,
                decay=0.,
                **kwargs):
     super(Adamax, self).__init__(**kwargs)
@@ -516,6 +528,8 @@ class Adamax(Optimizer):
       self.beta_1 = K.variable(beta_1, name='beta_1')
       self.beta_2 = K.variable(beta_2, name='beta_2')
       self.decay = K.variable(decay, name='decay')
+    if epsilon is None:
+      epsilon = K.epsilon()
     self.epsilon = epsilon
     self.initial_decay = decay
 
@@ -525,8 +539,8 @@ class Adamax(Optimizer):
 
     lr = self.lr
     if self.initial_decay > 0:
-      lr *= (1. / (1. + self.decay * K.cast(self.iterations,
-                                            K.dtype(self.decay))))
+      lr *= (1. /
+             (1. + self.decay * K.cast(self.iterations, K.dtype(self.decay))))
 
     t = K.cast(self.iterations, K.floatx()) + 1
     lr_t = lr / (1. - K.pow(self.beta_1, t))
@@ -580,19 +594,15 @@ class Nadam(Optimizer):
   Arguments:
       lr: float >= 0. Learning rate.
       beta_1/beta_2: floats, 0 < beta < 1. Generally close to 1.
-      epsilon: float >= 0. Fuzz factor.
+      epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`.
 
-  References:
-      - [Nadam report](http://cs229.stanford.edu/proj2015/054_report.pdf)
-      - [On the importance of initialization and momentum in deep
-        learning](http://www.cs.toronto.edu/~fritz/absps/momentum.pdf)
   """
 
   def __init__(self,
                lr=0.002,
                beta_1=0.9,
                beta_2=0.999,
-               epsilon=1e-8,
+               epsilon=None,
                schedule_decay=0.004,
                **kwargs):
     super(Nadam, self).__init__(**kwargs)
@@ -602,12 +612,15 @@ class Nadam(Optimizer):
       self.lr = K.variable(lr, name='lr')
       self.beta_1 = K.variable(beta_1, name='beta_1')
       self.beta_2 = K.variable(beta_2, name='beta_2')
+    if epsilon is None:
+      epsilon = K.epsilon()
     self.epsilon = epsilon
     self.schedule_decay = schedule_decay
 
   def get_updates(self, loss, params):
     grads = self.get_gradients(loss, params)
     self.updates = [K.update_add(self.iterations, 1)]
+
     t = K.cast(self.iterations, K.floatx()) + 1
 
     # Due to the recommendations in [2], i.e. warming momentum schedule
@@ -668,7 +681,14 @@ class TFOptimizer(Optimizer):
   def __init__(self, optimizer):  # pylint: disable=super-init-not-called
     self.optimizer = optimizer
     with K.name_scope(self.__class__.__name__):
-      self.iterations = K.variable(0, dtype='int64', name='iterations')
+      if context.in_graph_mode():
+        self.iterations = K.variable(0, dtype='int64', name='iterations')
+
+  def apply_gradients(self, grads):
+    self.optimizer.apply_gradients(grads)
+
+  def get_grads(self, loss, params):
+    return self.optimizer.compute_gradients(loss, params)
 
   def get_updates(self, loss, params):
     grads = self.optimizer.compute_gradients(loss, params)
@@ -691,7 +711,6 @@ class TFOptimizer(Optimizer):
 
 # Aliases.
 
-# pylint: disable=invalid-name
 sgd = SGD
 rmsprop = RMSprop
 adagrad = Adagrad
@@ -700,8 +719,6 @@ adam = Adam
 adamax = Adamax
 nadam = Nadam
 
-# pylint: enable=invalid-name
-
 
 def serialize(optimizer):
   return serialize_keras_object(optimizer)
diff --git a/tensorflow/python/keras/_impl/keras/optimizers_test.py b/tensorflow/python/keras/_impl/keras/optimizers_test.py
index 6e9e4e6c99a6ffb0684d20ca001bba98b0d799bc..57636afbf089f27c00cc56c46fdb3ea50f89cc6b 100644
--- a/tensorflow/python/keras/_impl/keras/optimizers_test.py
+++ b/tensorflow/python/keras/_impl/keras/optimizers_test.py
@@ -102,6 +102,7 @@ class KerasOptimizersTest(test.TestCase):
     with self.test_session():
       _test_optimizer(keras.optimizers.Adam())
       _test_optimizer(keras.optimizers.Adam(decay=1e-3))
+      _test_optimizer(keras.optimizers.Adam(amsgrad=True))
 
   def test_adamax(self):
     with self.test_session():
diff --git a/tensorflow/python/keras/_impl/keras/preprocessing/image.py b/tensorflow/python/keras/_impl/keras/preprocessing/image.py
index 82441de5925cac0d66af95202c613b3e5e9aeb79..db1fdd4e6b0fd39536f4f69ab396c7e5552710ea 100644
--- a/tensorflow/python/keras/_impl/keras/preprocessing/image.py
+++ b/tensorflow/python/keras/_impl/keras/preprocessing/image.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+# pylint: disable=g-import-not-at-top
 """Fairly basic set of tools for real-time data augmentation on image data.
 
 Can easily be extended to include new transformations,
@@ -28,25 +29,22 @@ import re
 import threading
 
 import numpy as np
-from six.moves import range  # pylint: disable=redefined-builtin
-
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.utils.data_utils import Sequence
 from tensorflow.python.platform import tf_logging as logging
 
-
-# pylint: disable=g-import-not-at-top
-try:
-  from PIL import Image as pil_image
-except ImportError:
-  pil_image = None
 try:
   from scipy import linalg
   import scipy.ndimage as ndi
 except ImportError:
   linalg = None
   ndi = None
-# pylint: enable=g-import-not-at-top
+
+
+try:
+  from PIL import Image as pil_image
+except ImportError:
+  pil_image = None
 
 if pil_image is not None:
   _PIL_INTERPOLATION_METHODS = {
@@ -88,7 +86,7 @@ def random_rotation(x,
   Returns:
       Rotated Numpy image tensor.
   """
-  theta = np.pi / 180 * np.random.uniform(-rg, rg)
+  theta = np.deg2rad(np.random.uniform(-rg, rg))
   rotation_matrix = np.array([[np.cos(theta), -np.sin(theta), 0],
                               [np.sin(theta), np.cos(theta), 0], [0, 0, 1]])
 
@@ -145,7 +143,7 @@ def random_shear(x,
 
   Arguments:
       x: Input tensor. Must be 3D.
-      intensity: Transformation intensity.
+      intensity: Transformation intensity in degrees.
       row_axis: Index of axis for rows in the input tensor.
       col_axis: Index of axis for columns in the input tensor.
       channel_axis: Index of axis for channels in the input tensor.
@@ -158,7 +156,7 @@ def random_shear(x,
   Returns:
       Sheared Numpy image tensor.
   """
-  shear = np.random.uniform(-intensity, intensity)
+  shear = np.deg2rad(np.random.uniform(-intensity, intensity))
   shear_matrix = np.array([[1, -np.sin(shear), 0], [0, np.cos(shear), 0],
                            [0, 0, 1]])
 
@@ -188,8 +186,10 @@ def random_zoom(x,
           (one of `{'constant', 'nearest', 'reflect', 'wrap'}`).
       cval: Value used for points outside the boundaries
           of the input if `mode='constant'`.
+
   Returns:
       Zoomed Numpy image tensor.
+
   Raises:
       ValueError: if `zoom_range` isn't a tuple.
   """
@@ -366,7 +366,7 @@ def load_img(path, grayscale=False, target_size=None, interpolation='nearest'):
       grayscale: Boolean, whether to load the image as grayscale.
       target_size: Either `None` (default to original size)
           or tuple of ints `(img_height, img_width)`.
-     interpolation: Interpolation method used to resample the image if the
+      interpolation: Interpolation method used to resample the image if the
           target size is different from that of the loaded image.
           Supported methods are "nearest", "bilinear", and "bicubic".
           If PIL version 1.1.3 or newer is installed, "lanczos" is also
@@ -394,11 +394,9 @@ def load_img(path, grayscale=False, target_size=None, interpolation='nearest'):
     width_height_tuple = (target_size[1], target_size[0])
     if img.size != width_height_tuple:
       if interpolation not in _PIL_INTERPOLATION_METHODS:
-        raise ValueError(
-            'Invalid interpolation method {} specified. Supported '
-            'methods are {}'.format(
-                interpolation,
-                ', '.join(_PIL_INTERPOLATION_METHODS.keys())))
+        raise ValueError('Invalid interpolation method {} specified. Supported '
+                         'methods are {}'.format(interpolation, ', '.join(
+                             _PIL_INTERPOLATION_METHODS.keys())))
       resample = _PIL_INTERPOLATION_METHODS[interpolation]
       img = img.resize(width_height_tuple, resample)
   return img
@@ -407,7 +405,8 @@ def load_img(path, grayscale=False, target_size=None, interpolation='nearest'):
 def list_pictures(directory, ext='jpg|jpeg|bmp|png|ppm'):
   return [
       os.path.join(root, f)
-      for root, _, files in os.walk(directory) for f in files
+      for root, _, files in os.walk(directory)
+      for f in files
       if re.match(r'([\w]+\.(?:' + ext + '))', f)
   ]
 
@@ -423,9 +422,9 @@ class ImageDataGenerator(object):
       zca_whitening: apply ZCA whitening.
       zca_epsilon: epsilon for ZCA whitening. Default is 1e-6.
       rotation_range: degrees (0 to 180).
-      width_shift_range: fraction of total width.
-      height_shift_range: fraction of total height.
-      shear_range: shear intensity (shear angle in radians).
+      width_shift_range: fraction of total width, if < 1, or pixels if >= 1.
+      height_shift_range: fraction of total height, if < 1, or pixels if >= 1.
+      shear_range: shear intensity (shear angle in degrees).
       zoom_range: amount of zoom. if scalar z, zoom will be randomly picked
           in the range [1-z, 1+z]. A sequence of two can be passed instead
           to select this range.
@@ -433,6 +432,12 @@ class ImageDataGenerator(object):
       fill_mode: points outside the boundaries are filled according to the
           given mode ('constant', 'nearest', 'reflect' or 'wrap'). Default
           is 'nearest'.
+          Points outside the boundaries of the input are filled according to the
+            given mode:
+              'constant': kkkkkkkk|abcd|kkkkkkkk (cval=k)
+              'nearest':  aaaaaaaa|abcd|dddddddd
+              'reflect':  abcddcba|abcd|dcbaabcd
+              'wrap':  abcdabcd|abcd|abcdabcd
       cval: value used for points outside the boundaries when fill_mode is
           'constant'. Default is 0.
       horizontal_flip: whether to randomly flip images horizontally.
@@ -522,6 +527,32 @@ class ImageDataGenerator(object):
       raise ValueError('`zoom_range` should be a float or '
                        'a tuple or list of two floats. '
                        'Received arg: ', zoom_range)
+    if zca_whitening:
+      if not featurewise_center:
+        self.featurewise_center = True
+        logging.warning('This ImageDataGenerator specifies '
+                        '`zca_whitening`, which overrides '
+                        'setting of `featurewise_center`.')
+      if featurewise_std_normalization:
+        self.featurewise_std_normalization = False
+        logging.warning('This ImageDataGenerator specifies '
+                        '`zca_whitening` '
+                        'which overrides setting of'
+                        '`featurewise_std_normalization`.')
+    if featurewise_std_normalization:
+      if not featurewise_center:
+        self.featurewise_center = True
+        logging.warning('This ImageDataGenerator specifies '
+                        '`featurewise_std_normalization`, '
+                        'which overrides setting of '
+                        '`featurewise_center`.')
+    if samplewise_std_normalization:
+      if not samplewise_center:
+        self.samplewise_center = True
+        logging.warning('This ImageDataGenerator specifies '
+                        '`samplewise_std_normalization`, '
+                        'which overrides setting of '
+                        '`samplewise_center`.')
 
   def flow(self,
            x,
@@ -591,7 +622,7 @@ class ImageDataGenerator(object):
     if self.samplewise_center:
       x -= np.mean(x, keepdims=True)
     if self.samplewise_std_normalization:
-      x /= np.std(x, keepdims=True) + 1e-7
+      x /= (np.std(x, keepdims=True) + K.epsilon())
 
     if self.featurewise_center:
       if self.mean is not None:
@@ -603,7 +634,7 @@ class ImageDataGenerator(object):
                         'first by calling `.fit(numpy_data)`.')
     if self.featurewise_std_normalization:
       if self.std is not None:
-        x /= (self.std + 1e-7)
+        x /= (self.std + K.epsilon())
       else:
         logging.warning('This ImageDataGenerator specifies '
                         '`featurewise_std_normalization`, but it hasn\'t '
@@ -636,7 +667,6 @@ class ImageDataGenerator(object):
     """
     if ndi is None:
       raise ImportError('Scipy is required for image transformations.')
-
     # x is a single image, so it doesn't have image number at index 0
     img_row_axis = self.row_axis - 1
     img_col_axis = self.col_axis - 1
@@ -648,25 +678,27 @@ class ImageDataGenerator(object):
     # use composition of homographies
     # to generate final transform that needs to be applied
     if self.rotation_range:
-      theta = np.pi / 180 * np.random.uniform(-self.rotation_range,
-                                              self.rotation_range)
+      theta = np.deg2rad(
+          np.random.uniform(-self.rotation_range, self.rotation_range))
     else:
       theta = 0
 
     if self.height_shift_range:
-      tx = np.random.uniform(-self.height_shift_range,
-                             self.height_shift_range) * x.shape[img_row_axis]
+      tx = np.random.uniform(-self.height_shift_range, self.height_shift_range)
+      if self.height_shift_range < 1:
+        tx *= x.shape[img_row_axis]
     else:
       tx = 0
 
     if self.width_shift_range:
-      ty = np.random.uniform(-self.width_shift_range,
-                             self.width_shift_range) * x.shape[img_col_axis]
+      ty = np.random.uniform(-self.width_shift_range, self.width_shift_range)
+      if self.width_shift_range < 1:
+        ty *= x.shape[img_col_axis]
     else:
       ty = 0
 
     if self.shear_range:
-      shear = np.random.uniform(-self.shear_range, self.shear_range)
+      shear = np.deg2rad(np.random.uniform(-self.shear_range, self.shear_range))
     else:
       shear = 0
 
@@ -744,7 +776,7 @@ class ImageDataGenerator(object):
     if x.ndim != 4:
       raise ValueError('Input to `.fit()` should have rank 4. '
                        'Got array with shape: ' + str(x.shape))
-    if x.shape[self.channel_axis] not in {3, 4}:
+    if x.shape[self.channel_axis] not in {1, 3, 4}:
       logging.warning(
           'Expected input to be images (as Numpy array) '
           'following the data format convention "' + self.data_format + '" '
@@ -784,10 +816,12 @@ class ImageDataGenerator(object):
         raise ImportError('Scipy is required for zca_whitening.')
 
       flat_x = np.reshape(x, (x.shape[0], x.shape[1] * x.shape[2] * x.shape[3]))
-      sigma = np.dot(flat_x.T, flat_x) / flat_x.shape[0]
-      u, s, _ = linalg.svd(sigma)
-      self.principal_components = np.dot(
-          np.dot(u, np.diag(1. / np.sqrt(s + self.zca_epsilon))), u.T)
+      num_examples = flat_x.shape[0]
+      _, s, vt = linalg.svd(flat_x / np.sqrt(num_examples))
+      s_expand = np.hstack(
+          (s, np.zeros(vt.shape[0] - num_examples, dtype=flat_x.dtype)))
+      self.principal_components = (
+          vt.T / np.sqrt(s_expand**2 + self.zca_epsilon)).dot(vt)
 
 
 class Iterator(Sequence):
@@ -797,10 +831,10 @@ class Iterator(Sequence):
   method.
 
   Arguments:
-    n: Integer, total number of samples in the dataset to loop over.
-    batch_size: Integer, size of a batch.
-    shuffle: Boolean, whether to shuffle the data between epochs.
-    seed: Random seeding for data shuffling.
+      n: Integer, total number of samples in the dataset to loop over.
+      batch_size: Integer, size of a batch.
+      shuffle: Boolean, whether to shuffle the data between epochs.
+      seed: Random seeding for data shuffling.
   """
 
   def __init__(self, n, batch_size, shuffle, seed):
@@ -823,15 +857,14 @@ class Iterator(Sequence):
     if idx >= len(self):
       raise ValueError('Asked to retrieve element {idx}, '
                        'but the Sequence '
-                       'has length {length}'.format(idx=idx,
-                                                    length=len(self)))
+                       'has length {length}'.format(idx=idx, length=len(self)))
     if self.seed is not None:
       np.random.seed(self.seed + self.total_batches_seen)
     self.total_batches_seen += 1
     if self.index_array is None:
       self._set_index_array()
-    index_array = self.index_array[self.batch_size * idx:self.batch_size *
-                                   (idx + 1)]
+    index_array = self.index_array[self.batch_size * idx:self.batch_size * (
+        idx + 1)]
     return self._get_batches_of_transformed_samples(index_array)
 
   def __len__(self):
@@ -873,6 +906,7 @@ class Iterator(Sequence):
 
     Arguments:
         index_array: array of sample indices to include in batch.
+
     Returns:
         A batch of transformed samples.
     """
@@ -948,8 +982,8 @@ class NumpyArrayIterator(Iterator):
                                              seed)
 
   def _get_batches_of_transformed_samples(self, index_array):
-    batch_x = np.zeros(tuple([len(index_array)] + list(self.x.shape)[1:]),
-                       dtype=K.floatx())
+    batch_x = np.zeros(
+        tuple([len(index_array)] + list(self.x.shape)[1:]), dtype=K.floatx())
     for i, j in enumerate(index_array):
       x = self.x[j]
       x = self.image_data_generator.random_transform(x.astype(K.floatx()))
@@ -959,7 +993,9 @@ class NumpyArrayIterator(Iterator):
       for i, j in enumerate(index_array):
         img = array_to_img(batch_x[i], self.data_format, scale=True)
         fname = '{prefix}_{index}_{hash}.{format}'.format(
-            prefix=self.save_prefix, index=j, hash=np.random.randint(1e4),
+            prefix=self.save_prefix,
+            index=j,
+            hash=np.random.randint(1e4),
             format=self.save_format)
         img.save(os.path.join(self.save_to_dir, fname))
     if self.y is None:
@@ -984,10 +1020,11 @@ class NumpyArrayIterator(Iterator):
 
 def _count_valid_files_in_directory(directory, white_list_formats,
                                     follow_links):
-  """Count files with extension in `white_list_formats` in a directory.
+  """Count files with extension in `white_list_formats` contained in directory.
 
   Arguments:
-      directory: absolute path to the directory containing files to be counted
+      directory: absolute path to the directory
+          containing files to be counted
       white_list_formats: set of strings containing allowed extensions for
           the files to be counted.
       follow_links: boolean.
@@ -1003,7 +1040,7 @@ def _count_valid_files_in_directory(directory, white_list_formats,
 
   samples = 0
   for _, _, files in _recursive_list(directory):
-    for fname in sorted(files):
+    for fname in files:
       is_valid = False
       for extension in white_list_formats:
         if fname.lower().endswith('.' + extension):
@@ -1043,7 +1080,7 @@ def _list_valid_filenames_in_directory(directory, white_list_formats,
   subdir = os.path.basename(directory)
   basedir = os.path.dirname(directory)
   for root, _, files in _recursive_list(directory):
-    for fname in files:
+    for fname in sorted(files):
       is_valid = False
       for extension in white_list_formats:
         if fname.lower().endswith('.' + extension):
@@ -1167,8 +1204,8 @@ class DirectoryIterator(Iterator):
         white_list_formats=white_list_formats,
         follow_links=follow_links)
     self.samples = sum(
-        pool.map(function_partial, (os.path.join(directory, subdir)
-                                    for subdir in classes)))
+        pool.map(function_partial,
+                 (os.path.join(directory, subdir) for subdir in classes)))
 
     print('Found %d images belonging to %d classes.' % (self.samples,
                                                         self.num_classes))
@@ -1181,8 +1218,9 @@ class DirectoryIterator(Iterator):
     i = 0
     for dirpath in (os.path.join(directory, subdir) for subdir in classes):
       results.append(
-          pool.apply_async(_list_valid_filenames_in_directory, (
-              dirpath, white_list_formats, self.class_indices, follow_links)))
+          pool.apply_async(
+              _list_valid_filenames_in_directory,
+              (dirpath, white_list_formats, self.class_indices, follow_links)))
     for res in results:
       classes, filenames = res.get()
       self.classes[i:i + len(classes)] = classes
@@ -1199,10 +1237,11 @@ class DirectoryIterator(Iterator):
     # build batch of image data
     for i, j in enumerate(index_array):
       fname = self.filenames[j]
-      img = load_img(os.path.join(self.directory, fname),
-                     grayscale=grayscale,
-                     target_size=self.target_size,
-                     interpolation=self.interpolation)
+      img = load_img(
+          os.path.join(self.directory, fname),
+          grayscale=grayscale,
+          target_size=self.target_size,
+          interpolation=self.interpolation)
       x = img_to_array(img, data_format=self.data_format)
       x = self.image_data_generator.random_transform(x)
       x = self.image_data_generator.standardize(x)
@@ -1212,7 +1251,9 @@ class DirectoryIterator(Iterator):
       for i, j in enumerate(index_array):
         img = array_to_img(batch_x[i], self.data_format, scale=True)
         fname = '{prefix}_{index}_{hash}.{format}'.format(
-            prefix=self.save_prefix, index=j, hash=np.random.randint(1e7),
+            prefix=self.save_prefix,
+            index=j,
+            hash=np.random.randint(1e7),
             format=self.save_format)
         img.save(os.path.join(self.save_to_dir, fname))
     # build batch of labels
@@ -1241,4 +1282,3 @@ class DirectoryIterator(Iterator):
     # The transformation of images is not under thread lock
     # so it can be done in parallel
     return self._get_batches_of_transformed_samples(index_array)
-
diff --git a/tensorflow/python/keras/_impl/keras/preprocessing/sequence.py b/tensorflow/python/keras/_impl/keras/preprocessing/sequence.py
index 642f4f2face5bd56cdc1ed7b4f6d6621c6d1b210..4d59250af03cd5e264fb8a36ba70311840bd68b0 100644
--- a/tensorflow/python/keras/_impl/keras/preprocessing/sequence.py
+++ b/tensorflow/python/keras/_impl/keras/preprocessing/sequence.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Preprocessing utilities for sequence data.
+"""Utilities for preprocessing sequence data.
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -129,7 +129,7 @@ def make_sampling_table(size, sampling_factor=1e-5):
       is the probability that a word of rank i should be sampled.
   """
   gamma = 0.577
-  rank = np.array(list(range(size)))
+  rank = np.arange(size)
   rank[0] = 1
   inv_fq = rank * (np.log(rank) + gamma) + 0.5 - 1. / (12. * rank)
   f = sampling_factor * inv_fq
@@ -170,7 +170,7 @@ def skipgrams(sequence,
           if True labels will be categorical eg. [[1,0],[0,1],[0,1] .. ]
       sampling_table: 1D array of size `vocabulary_size` where the entry i
           encodes the probability to sample a word of rank i.
-      seed: Random seed.
+      seed: random seed.
 
   Returns:
       couples, labels: where `couples` are int pairs and
@@ -224,3 +224,22 @@ def skipgrams(sequence,
     random.shuffle(labels)
 
   return couples, labels
+
+
+def _remove_long_seq(maxlen, seq, label):
+  """Removes sequences that exceed the maximum length.
+
+  Arguments:
+      maxlen: int, maximum length
+      seq: list of lists where each sublist is a sequence
+      label: list where each element is an integer
+
+  Returns:
+      new_seq, new_label: shortened lists for `seq` and `label`.
+  """
+  new_seq, new_label = [], []
+  for x, y in zip(seq, label):
+    if len(x) < maxlen:
+      new_seq.append(x)
+      new_label.append(y)
+  return new_seq, new_label
diff --git a/tensorflow/python/keras/_impl/keras/preprocessing/text.py b/tensorflow/python/keras/_impl/keras/preprocessing/text.py
index 47e5aa064fd806196fc9457fc90bc1a26e55ebf3..8f7f25dc0a3e6fd234abb5fc97b3441ddcf16a4e 100644
--- a/tensorflow/python/keras/_impl/keras/preprocessing/text.py
+++ b/tensorflow/python/keras/_impl/keras/preprocessing/text.py
@@ -13,8 +13,6 @@
 # limitations under the License.
 # ==============================================================================
 """Utilities for text input preprocessing.
-
-May benefit from a fast Cython rewrite.
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -29,6 +27,9 @@ import numpy as np
 from six.moves import range  # pylint: disable=redefined-builtin
 from six.moves import zip  # pylint: disable=redefined-builtin
 
+from tensorflow.python.platform import tf_logging as logging
+
+
 if sys.version_info < (3,):
   maketrans = string.maketrans
 else:
@@ -68,6 +69,21 @@ def one_hot(text,
             filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
             lower=True,
             split=' '):
+  """One-hot encodes a text into a list of word indexes of size n.
+
+  This is a wrapper to the `hashing_trick` function using `hash` as the
+  hashing function; unicity of word to index mapping non-guaranteed.
+
+  Arguments:
+      text: Input text (string).
+      n: Dimension of the hashing space.
+      filters: Sequence of characters to filter out.
+      lower: Whether to convert the input to lowercase.
+      split: Sentence split marker (string).
+
+  Returns:
+      A list of integer word indices (unicity non-guaranteed).
+  """
   return hashing_trick(
       text, n, hash_function=hash, filters=filters, lower=lower, split=split)
 
@@ -99,6 +115,10 @@ def hashing_trick(text,
 
   Two or more words may be assigned to the same index, due to possible
   collisions by the hashing function.
+  The
+  probability
+  of a collision is in relation to the dimension of the hashing space and
+  the number of distinct objects.
   """
   if hash_function is None:
     hash_function = hash
@@ -127,6 +147,8 @@ class Tokenizer(object):
       lower: boolean. Whether to convert the texts to lowercase.
       split: character or string to use for token splitting.
       char_level: if True, every character will be treated as a token.
+      oov_token: if given, it will be added to word_index and used to
+          replace out-of-vocabulary words during text_to_sequence calls
 
   By default, all punctuation is removed, turning the texts into
   space-separated sequences of words
@@ -141,7 +163,17 @@ class Tokenizer(object):
                filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                lower=True,
                split=' ',
-               char_level=False):
+               char_level=False,
+               oov_token=None,
+               **kwargs):
+    # Legacy support
+    if 'nb_words' in kwargs:
+      logging.warning('The `nb_words` argument in `Tokenizer` '
+                      'has been renamed `num_words`.')
+      num_words = kwargs.pop('nb_words')
+    if kwargs:
+      raise TypeError('Unrecognized keyword arguments: ' + str(kwargs))
+
     self.word_counts = OrderedDict()
     self.word_docs = {}
     self.filters = filters
@@ -150,6 +182,7 @@ class Tokenizer(object):
     self.num_words = num_words
     self.document_count = 0
     self.char_level = char_level
+    self.oov_token = oov_token
 
   def fit_on_texts(self, texts):
     """Updates internal vocabulary based on a list of texts.
@@ -181,7 +214,13 @@ class Tokenizer(object):
     sorted_voc = [wc[0] for wc in wcounts]
     # note that index 0 is reserved, never assigned to an existing word
     self.word_index = dict(
-        list(zip(sorted_voc, list(range(1, len(sorted_voc) + 1)))))
+        list(zip(sorted_voc, list(range(1,
+                                        len(sorted_voc) + 1)))))
+
+    if self.oov_token is not None:
+      i = self.word_index.get(self.oov_token)
+      if i is None:
+        self.word_index[self.oov_token] = len(self.word_index) + 1
 
     self.index_docs = {}
     for w, c in list(self.word_docs.items()):
@@ -248,6 +287,10 @@ class Tokenizer(object):
             continue
           else:
             vect.append(i)
+        elif self.oov_token is not None:
+          i = self.word_index.get(self.oov_token)
+          if i is not None:
+            vect.append(i)
       yield vect
 
   def texts_to_matrix(self, texts, mode='binary'):
diff --git a/tensorflow/python/keras/_impl/keras/preprocessing/text_test.py b/tensorflow/python/keras/_impl/keras/preprocessing/text_test.py
index 17ab48ba3fc9dfd553f8f425579c0a37ff42eb84..a934e331c4a14d9bd170258b6b6183e6a15bb561 100644
--- a/tensorflow/python/keras/_impl/keras/preprocessing/text_test.py
+++ b/tensorflow/python/keras/_impl/keras/preprocessing/text_test.py
@@ -76,6 +76,22 @@ class TestText(test.TestCase):
     self.assertLessEqual(np.max(encoded), 4)
     self.assertGreaterEqual(np.min(encoded), 1)
 
+  def test_tokenizer_oov_flag(self):
+    x_train = ['This text has only known words']
+    x_test = ['This text has some unknown words']  # 2 OOVs: some, unknown
+
+    # Defalut, without OOV flag
+    tokenizer = keras.preprocessing.text.Tokenizer()
+    tokenizer.fit_on_texts(x_train)
+    x_test_seq = tokenizer.texts_to_sequences(x_test)
+    assert len(x_test_seq[0]) == 4  # discards 2 OOVs
+
+    # With OOV feature
+    tokenizer = keras.preprocessing.text.Tokenizer(oov_token='<unk>')
+    tokenizer.fit_on_texts(x_train)
+    x_test_seq = tokenizer.texts_to_sequences(x_test)
+    assert len(x_test_seq[0]) == 6  # OOVs marked in place
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/_impl/keras/regularizers.py b/tensorflow/python/keras/_impl/keras/regularizers.py
index 161ff9bf5bf12b3521fe444f1d68bd62b6e8c71d..c53ee8a1aeccbf862324d1e91235ce1a00adb457 100644
--- a/tensorflow/python/keras/_impl/keras/regularizers.py
+++ b/tensorflow/python/keras/_impl/keras/regularizers.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Keras built-in regularizers.
+"""Built-in regularizers.
 """
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/python/keras/_impl/keras/utils/data_utils.py b/tensorflow/python/keras/_impl/keras/utils/data_utils.py
index d9e8f37e36cff0723c02820e16cc502bb0aea294..fcee9fbcc32d9356f5776b6f53cae6c125313d62 100644
--- a/tensorflow/python/keras/_impl/keras/utils/data_utils.py
+++ b/tensorflow/python/keras/_impl/keras/utils/data_utils.py
@@ -1,4 +1,4 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,12 +12,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+# pylint: disable=g-import-not-at-top
 """Utilities for file download and caching."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
 from abc import abstractmethod
+from contextlib import closing
 import hashlib
 import multiprocessing
 from multiprocessing.pool import ThreadPool
@@ -38,12 +40,12 @@ from six.moves.urllib.error import URLError
 from six.moves.urllib.request import urlopen
 
 from tensorflow.python.keras._impl.keras.utils.generic_utils import Progbar
-from tensorflow.python.util.tf_export import tf_export
+
 
 try:
-  import queue  # pylint:disable=g-import-not-at-top
+  import queue
 except ImportError:
-  import Queue as queue  # pylint:disable=g-import-not-at-top
+  import Queue as queue
 
 
 if sys.version_info[0] == 2:
@@ -87,7 +89,7 @@ if sys.version_info[0] == 2:
       for chunk in chunk_read(response, reporthook=reporthook):
         fd.write(chunk)
 else:
-  from six.moves.urllib.request import urlretrieve  # pylint: disable=g-import-not-at-top
+  from six.moves.urllib.request import urlretrieve
 
 
 def _extract_archive(file_path, path='.', archive_format='auto'):
@@ -136,7 +138,6 @@ def _extract_archive(file_path, path='.', archive_format='auto'):
   return False
 
 
-@tf_export('keras.utils.get_file')
 def get_file(fname,
              origin,
              untar=False,
@@ -188,7 +189,7 @@ def get_file(fname,
       Path to the downloaded file
   """
   if cache_dir is None:
-    cache_dir = os.path.expanduser(os.path.join('~', '.keras'))
+    cache_dir = os.path.join(os.path.expanduser('~'), '.keras')
   if md5_hash is not None and file_hash is None:
     file_hash = md5_hash
     hash_algorithm = 'md5'
@@ -317,37 +318,46 @@ def validate_file(fpath, file_hash, algorithm='auto', chunk_size=65535):
     return False
 
 
-@tf_export('keras.utils.Sequence')
 class Sequence(object):
   """Base object for fitting to a sequence of data, such as a dataset.
 
   Every `Sequence` must implements the `__getitem__` and the `__len__` methods.
   If you want to modify your dataset between epochs you may implement
-  `on_epoch_end`. The method `__getitem__` should return a complete batch.
+  `on_epoch_end`.
+  The method `__getitem__` should return a complete batch.
+
+  # Notes
 
-  Notes:
   `Sequence` are a safer way to do multiprocessing. This structure guarantees
-   that the network will only train once on each sample per epoch which is not
-   the case with generators.
+  that the network will only train once
+   on each sample per epoch which is not the case with generators.
+
   Examples:
+
   ```python
       from skimage.io import imread
       from skimage.transform import resize
       import numpy as np
       import math
+
       # Here, `x_set` is list of path to the images
       # and `y_set` are the associated classes.
+
       class CIFAR10Sequence(Sequence):
+
           def __init__(self, x_set, y_set, batch_size):
               self.x, self.y = x_set, y_set
               self.batch_size = batch_size
+
           def __len__(self):
               return math.ceil(len(self.x) / self.batch_size)
+
           def __getitem__(self, idx):
               batch_x = self.x[idx * self.batch_size:(idx + 1) *
-                        self.batch_size]
+              self.batch_size]
               batch_y = self.y[idx * self.batch_size:(idx + 1) *
-                        self.batch_size]
+              self.batch_size]
+
               return np.array([
                   resize(imread(file_name), (200, 200))
                      for file_name in batch_x]), np.array(batch_y)
@@ -375,7 +385,6 @@ class Sequence(object):
     """
     raise NotImplementedError
 
-  @abstractmethod
   def on_epoch_end(self):
     """Method called at the end of every epoch.
     """
@@ -405,7 +414,6 @@ def get_index(uid, i):
   return _SHARED_SEQUENCES[uid][i]
 
 
-@tf_export('keras.utils.SequenceEnqueuer')
 class SequenceEnqueuer(object):
   """Base class to enqueue inputs.
 
@@ -474,35 +482,36 @@ class OrderedEnqueuer(SequenceEnqueuer):
 
   Arguments:
       sequence: A `keras.utils.data_utils.Sequence` object.
-      use_multiprocessing: Use multiprocessing if True, otherwise threading
-      shuffle: Whether to shuffle the data at the beginning of each epoch
+      use_multiprocessing: use multiprocessing if True, otherwise threading
+      shuffle: whether to shuffle the data at the beginning of each epoch
   """
 
   def __init__(self, sequence, use_multiprocessing=False, shuffle=False):
     self.sequence = sequence
     self.use_multiprocessing = use_multiprocessing
 
-    # Doing Multiprocessing.Value += x is not process-safe.
     global _SEQUENCE_COUNTER
     if _SEQUENCE_COUNTER is None:
-      if self.use_multiprocessing:
+      try:
         _SEQUENCE_COUNTER = multiprocessing.Value('i', 0)
-      else:
+      except OSError:
+        # In this case the OS does not allow us to use
+        # multiprocessing. We resort to an int
+        # for enqueuer indexing.
         _SEQUENCE_COUNTER = 0
 
-    if self.use_multiprocessing:
+    if isinstance(_SEQUENCE_COUNTER, int):
+      self.uid = _SEQUENCE_COUNTER
+      _SEQUENCE_COUNTER += 1
+    else:
+      # Doing Multiprocessing.Value += x is not process-safe.
       with _SEQUENCE_COUNTER.get_lock():
         self.uid = _SEQUENCE_COUNTER.value
         _SEQUENCE_COUNTER.value += 1
-    else:
-      self.uid = _SEQUENCE_COUNTER
-      if isinstance(_SEQUENCE_COUNTER, int):
-        _SEQUENCE_COUNTER += 1
-      else:
-        _SEQUENCE_COUNTER.value += 1
+
     self.shuffle = shuffle
     self.workers = 0
-    self.executor = None
+    self.executor_fn = None
     self.queue = None
     self.run_thread = None
     self.stop_signal = None
@@ -519,9 +528,9 @@ class OrderedEnqueuer(SequenceEnqueuer):
             (when full, workers could block on `put()`)
     """
     if self.use_multiprocessing:
-      self.executor = multiprocessing.Pool(workers)
+      self.executor_fn = lambda: multiprocessing.Pool(workers)
     else:
-      self.executor = ThreadPool(workers)
+      self.executor_fn = lambda: ThreadPool(workers)
     self.workers = workers
     self.queue = queue.Queue(max_queue_size)
     self.stop_signal = threading.Event()
@@ -537,24 +546,26 @@ class OrderedEnqueuer(SequenceEnqueuer):
         return
 
   def _run(self):
-    """Function to submit request to the executor & queue `Future` objects."""
+    """Submits request to the executor and queue the `Future` objects."""
     sequence = list(range(len(self.sequence)))
     self._send_sequence()  # Share the initial sequence
     while True:
       if self.shuffle:
         random.shuffle(sequence)
-      for i in sequence:
-        if self.stop_signal.is_set():
-          return
-        self.queue.put(
-            self.executor.apply_async(get_index, (self.uid, i)), block=True)
 
-      # Done with the current epoch, waiting for the final batches
-      self._wait_queue()
+      with closing(self.executor_fn()) as executor:
+        for i in sequence:
+          if self.stop_signal.is_set():
+            return
+          self.queue.put(
+              executor.apply_async(get_index, (self.uid, i)), block=True)
 
-      if self.stop_signal.is_set():
-        # We're done
-        return
+        # Done with the current epoch, waiting for the final batches
+        self._wait_queue()
+
+        if self.stop_signal.is_set():
+          # We're done
+          return
 
       # Call the internal on epoch end.
       self.sequence.on_epoch_end()
@@ -566,8 +577,9 @@ class OrderedEnqueuer(SequenceEnqueuer):
     Skip the data if it is `None`.
 
     Yields:
-        Tuples (inputs, targets)
-        or (inputs, targets, sample_weights)
+        The next element in the queue, i.e. a tuple
+        `(inputs, targets)` or
+        `(inputs, targets, sample_weights)`.
     """
     try:
       while self.is_running():
@@ -581,14 +593,8 @@ class OrderedEnqueuer(SequenceEnqueuer):
 
   def _send_sequence(self):
     """Send current Sequence to all workers."""
-    _SHARED_SEQUENCES[
-        self.uid] = self.sequence  # For new processes that may spawn
-
-    self._close_pool()
-    if self.use_multiprocessing:
-      self.executor = multiprocessing.Pool(self.workers)
-    else:
-      self.executor = ThreadPool(self.workers)
+    # For new processes that may spawn
+    _SHARED_SEQUENCES[self.uid] = self.sequence
 
   def stop(self, timeout=None):
     """Stops running threads and wait for them to exit, if necessary.
@@ -603,16 +609,10 @@ class OrderedEnqueuer(SequenceEnqueuer):
       self.queue.queue.clear()
       self.queue.unfinished_tasks = 0
       self.queue.not_full.notify()
-    self._close_pool()
     self.run_thread.join(timeout)
     _SHARED_SEQUENCES[self.uid] = None
 
-  def _close_pool(self):
-    self.executor.close()
-    self.executor.join()
 
-
-@tf_export('keras.utils.GeneratorEnqueuer')
 class GeneratorEnqueuer(SequenceEnqueuer):
   """Builds a queue out of a data generator.
 
@@ -636,26 +636,53 @@ class GeneratorEnqueuer(SequenceEnqueuer):
                seed=None):
     self.wait_time = wait_time
     self._generator = generator
-    self._use_multiprocessing = use_multiprocessing
+    if os.name is 'nt' and use_multiprocessing is True:
+      # On Windows, avoid **SYSTEMATIC** error in `multiprocessing`:
+      # `TypeError: can't pickle generator objects`
+      # => Suggest multithreading instead of multiprocessing on Windows
+      raise ValueError('Using a generator with `use_multiprocessing=True`'
+                       ' is not supported on Windows (no marshalling of'
+                       ' generators across process boundaries). Instead,'
+                       ' use single thread/process or multithreading.')
+    else:
+      self._use_multiprocessing = use_multiprocessing
     self._threads = []
     self._stop_event = None
     self._manager = None
     self.queue = None
     self.seed = seed
 
-  def start(self, workers=1, max_queue_size=10):
-    """Kicks off threads which add data from the generator into the queue.
-
-    Arguments:
-        workers: number of worker threads
-        max_queue_size: queue size
-            (when full, threads could block on `put()`)
-    """
-
-    def data_generator_task():
+  def _data_generator_task(self):
+    if self._use_multiprocessing is False:
+      while not self._stop_event.is_set():
+        with self.genlock:
+          try:
+            if (self.queue is not None and
+                self.queue.qsize() < self.max_queue_size):
+              # On all OSes, avoid **SYSTEMATIC** error
+              # in multithreading mode:
+              # `ValueError: generator already executing`
+              # => Serialize calls to
+              # infinite iterator/generator's next() function
+              generator_output = next(self._generator)
+              self.queue.put((True, generator_output))
+            else:
+              time.sleep(self.wait_time)
+          except StopIteration:
+            break
+          except Exception as e:  # pylint: disable=broad-except
+            # Can't pickle tracebacks.
+            # As a compromise, print the traceback and pickle None instead.
+            if not hasattr(e, '__traceback__'):
+              setattr(e, '__traceback__', sys.exc_info()[2])
+            self.queue.put((False, e))
+            self._stop_event.set()
+            break
+    else:
       while not self._stop_event.is_set():
         try:
-          if self._use_multiprocessing or self.queue.qsize() < max_queue_size:
+          if (self.queue is not None and
+              self.queue.qsize() < self.max_queue_size):
             generator_output = next(self._generator)
             self.queue.put((True, generator_output))
           else:
@@ -663,24 +690,34 @@ class GeneratorEnqueuer(SequenceEnqueuer):
         except StopIteration:
           break
         except Exception as e:  # pylint: disable=broad-except
-          # Can't pick tracebacks.
+          # Can't pickle tracebacks.
           # As a compromise, print the traceback and pickle None instead.
-          if self._use_multiprocessing:
-            traceback.print_exc()
-            setattr(e, '__traceback__', None)
-          elif not hasattr(e, '__traceback__'):
-            setattr(e, '__traceback__', sys.exc_info()[2])
+          traceback.print_exc()
+          setattr(e, '__traceback__', None)
           self.queue.put((False, e))
           self._stop_event.set()
           break
 
+  def start(self, workers=1, max_queue_size=10):
+    """Kicks off threads which add data from the generator into the queue.
+
+    Arguments:
+        workers: number of worker threads
+        max_queue_size: queue size
+            (when full, threads could block on `put()`)
+    """
     try:
+      self.max_queue_size = max_queue_size
       if self._use_multiprocessing:
         self._manager = multiprocessing.Manager()
         self.queue = self._manager.Queue(maxsize=max_queue_size)
         self._stop_event = multiprocessing.Event()
       else:
-        self.queue = queue.Queue()
+        # On all OSes, avoid **SYSTEMATIC** error in multithreading mode:
+        # `ValueError: generator already executing`
+        # => Serialize calls to infinite iterator/generator's next() function
+        self.genlock = threading.Lock()
+        self.queue = queue.Queue(maxsize=max_queue_size)
         self._stop_event = threading.Event()
 
       for _ in range(workers):
@@ -688,12 +725,12 @@ class GeneratorEnqueuer(SequenceEnqueuer):
           # Reset random seed else all children processes
           # share the same seed
           np.random.seed(self.seed)
-          thread = multiprocessing.Process(target=data_generator_task)
+          thread = multiprocessing.Process(target=self._data_generator_task)
           thread.daemon = True
           if self.seed is not None:
             self.seed += 1
         else:
-          thread = threading.Thread(target=data_generator_task)
+          thread = threading.Thread(target=self._data_generator_task)
         self._threads.append(thread)
         thread.start()
     except:
@@ -715,11 +752,15 @@ class GeneratorEnqueuer(SequenceEnqueuer):
       self._stop_event.set()
 
     for thread in self._threads:
-      if thread.is_alive():
-        if self._use_multiprocessing:
+      if self._use_multiprocessing:
+        if thread.is_alive():
           thread.terminate()
-        else:
-          thread.join(timeout)
+      else:
+        # The thread.is_alive() test is subject to a race condition:
+        # the thread could terminate right after the test and before the
+        # join, rendering this test meaningless -> Call thread.join()
+        # always, which is ok no matter what the status of the thread.
+        thread.join(timeout)
 
     if self._manager:
       self._manager.shutdown()
@@ -734,7 +775,9 @@ class GeneratorEnqueuer(SequenceEnqueuer):
     Skip the data if it is `None`.
 
     Yields:
-        Data arrays.
+        The next element in the queue, i.e. a tuple
+        `(inputs, targets)` or
+        `(inputs, targets, sample_weights)`.
     """
     while self.is_running():
       if not self.queue.empty():
@@ -752,7 +795,7 @@ class GeneratorEnqueuer(SequenceEnqueuer):
         else:
           time.sleep(self.wait_time)
 
-      # Make sure to rethrow the first exception in the queue, if any
+    # Make sure to rethrow the first exception in the queue, if any
     while not self.queue.empty():
       success, value = self.queue.get()
       if not success:
diff --git a/tensorflow/python/keras/_impl/keras/utils/generic_utils.py b/tensorflow/python/keras/_impl/keras/utils/generic_utils.py
index a805315c94628f263dd4ce7a8b0f751cdf685ca0..adbe6c3288a3eabb858e78267577ddff6d798972 100644
--- a/tensorflow/python/keras/_impl/keras/utils/generic_utils.py
+++ b/tensorflow/python/keras/_impl/keras/utils/generic_utils.py
@@ -17,6 +17,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import binascii
 import codecs
 import marshal
 import os
@@ -255,7 +256,10 @@ def func_load(code, defaults=None, closure=None, globs=None):
 
   if closure is not None:
     closure = tuple(ensure_value_to_cell(_) for _ in closure)
-  raw_code = codecs.decode(code.encode('ascii'), 'base64')
+  try:
+    raw_code = codecs.decode(code.encode('ascii'), 'base64')
+  except (UnicodeEncodeError, binascii.Error):
+    raw_code = code.encode('raw_unicode_escape')
   code = marshal.loads(raw_code)
   if globs is None:
     globs = globals()
diff --git a/tensorflow/python/keras/_impl/keras/utils/io_utils.py b/tensorflow/python/keras/_impl/keras/utils/io_utils.py
index e123339f5a7cc629778e2247d985dbe4591da54a..b36c769843d13a910efa6cf8c0d5309e3333f69b 100644
--- a/tensorflow/python/keras/_impl/keras/utils/io_utils.py
+++ b/tensorflow/python/keras/_impl/keras/utils/io_utils.py
@@ -1,4 +1,4 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+# pylint: disable=g-import-not-at-top
 """Utilities related to disk I/O."""
 from __future__ import absolute_import
 from __future__ import division
@@ -21,16 +22,14 @@ from collections import defaultdict
 import sys
 
 import numpy as np
-from tensorflow.python.util.tf_export import tf_export
 
 
 try:
-  import h5py  # pylint:disable=g-import-not-at-top
+  import h5py
 except ImportError:
   h5py = None
 
 
-@tf_export('keras.utils.HDF5Matrix')
 class HDF5Matrix(object):
   """Representation of HDF5 dataset to be used instead of a Numpy array.
 
@@ -65,11 +64,11 @@ class HDF5Matrix(object):
                         'HDF5 and h5py installed.')
 
     if datapath not in list(self.refs.keys()):
-      self._f = h5py.File(datapath)
-      self.refs[datapath] = self._f
+      f = h5py.File(datapath)
+      self.refs[datapath] = f
     else:
-      self._f = self.refs[datapath]
-    self.data = self._f[dataset]
+      f = self.refs[datapath]
+    self.data = f[dataset]
     self.start = start
     if end is None:
       self.end = self.data.shape[0]
@@ -80,9 +79,6 @@ class HDF5Matrix(object):
   def __len__(self):
     return self.end - self.start
 
-  def __del__(self):
-    self._f.close()
-
   def __getitem__(self, key):
     if isinstance(key, slice):
       start, stop = key.start, key.stop
diff --git a/tensorflow/python/keras/_impl/keras/utils/layer_utils.py b/tensorflow/python/keras/_impl/keras/utils/layer_utils.py
index 30af285cbfb8b8bc38e62d20f0698f9d3c121d10..a2d32424b51ea3160d530a5162ccc269e5815e57 100644
--- a/tensorflow/python/keras/_impl/keras/utils/layer_utils.py
+++ b/tensorflow/python/keras/_impl/keras/utils/layer_utils.py
@@ -1,4 +1,4 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,7 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Utilities related to Keras layers.
+# pylint: disable=protected-access
+"""Utilities related to layer/model functionality.
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -22,17 +23,16 @@ import numpy as np
 
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.utils.conv_utils import convert_kernel
-from tensorflow.python.util.tf_export import tf_export
 
 
 def count_params(weights):
   """Count the total number of scalars composing the weights.
 
   Arguments:
-    weights: An iterable containing the weights on which to compute params
+      weights: An iterable containing the weights on which to compute params
 
   Returns:
-    The total number of scalars composing the weights
+      The total number of scalars composing the weights
   """
   return int(np.sum([K.count_params(p) for p in set(weights)]))
 
@@ -47,10 +47,11 @@ def print_summary(model, line_length=None, positions=None, print_fn=None):
           terminal window sizes).
       positions: Relative or absolute positions of log elements in each line.
           If not provided, defaults to `[.33, .55, .67, 1.]`.
-      print_fn: Print function to use (defaults to `print`).
+      print_fn: Print function to use.
           It will be called on each line of the summary.
           You can set it to a custom function
           in order to capture the string summary.
+          It defaults to `print` (prints to stdout).
   """
   if print_fn is None:
     print_fn = print
@@ -59,12 +60,13 @@ def print_summary(model, line_length=None, positions=None, print_fn=None):
     sequential_like = True
   else:
     sequential_like = True
-    nodes_by_depth = model._nodes_by_depth.values()  # pylint: disable=protected-access
+    nodes_by_depth = model._nodes_by_depth.values()
     nodes = []
     for v in nodes_by_depth:
       if (len(v) > 1) or (len(v) == 1 and len(v[0].inbound_layers) > 1):
-        # If the model has multiple nodes or if the nodes have
-        # multiple inbound_layers, the model is no longer sequential.
+        # if the model has multiple nodes
+        # or if the nodes have multiple inbound_layers
+        # the model is no longer sequential
         sequential_like = False
         break
       nodes += v
@@ -72,7 +74,7 @@ def print_summary(model, line_length=None, positions=None, print_fn=None):
       # search for shared layers
       for layer in model.layers:
         flag = False
-        for node in layer.inbound_nodes:
+        for node in layer._inbound_nodes:
           if node in nodes:
             if flag:
               sequential_like = False
@@ -97,7 +99,7 @@ def print_summary(model, line_length=None, positions=None, print_fn=None):
     # header names for the different log elements
     to_display = ['Layer (type)', 'Output Shape', 'Param #', 'Connected to']
     relevant_nodes = []
-    for v in model._nodes_by_depth.values():  # pylint: disable=protected-access
+    for v in model._nodes_by_depth.values():
       relevant_nodes += v
 
   def print_row(fields, positions):
@@ -135,7 +137,7 @@ def print_summary(model, line_length=None, positions=None, print_fn=None):
     except AttributeError:
       output_shape = 'multiple'
     connections = []
-    for node in layer._inbound_nodes:  # pylint: disable=protected-access
+    for node in layer._inbound_nodes:
       if relevant_nodes and node not in relevant_nodes:
         # node is not part of the current network
         continue
@@ -143,8 +145,8 @@ def print_summary(model, line_length=None, positions=None, print_fn=None):
         inbound_layer = node.inbound_layers[i].name
         inbound_node_index = node.node_indices[i]
         inbound_tensor_index = node.tensor_indices[i]
-        connections.append(inbound_layer + '[' + str(inbound_node_index) + ']['
-                           + str(inbound_tensor_index) + ']')
+        connections.append(inbound_layer + '[' + str(inbound_node_index) +
+                           '][' + str(inbound_tensor_index) + ']')
 
     name = layer.name
     cls_name = layer.__class__.__name__
@@ -173,9 +175,9 @@ def print_summary(model, line_length=None, positions=None, print_fn=None):
     else:
       print_fn('_' * line_length)
 
-  model._check_trainable_weights_consistency()  # pylint: disable=protected-access
+  model._check_trainable_weights_consistency()
   if hasattr(model, '_collected_trainable_weights'):
-    trainable_count = count_params(model._collected_trainable_weights)  # pylint: disable=protected-access
+    trainable_count = count_params(model._collected_trainable_weights)
   else:
     trainable_count = count_params(model.trainable_weights)
 
@@ -188,7 +190,6 @@ def print_summary(model, line_length=None, positions=None, print_fn=None):
   print_fn('_' * line_length)
 
 
-@tf_export('keras.utils.convert_all_kernels_in_model')
 def convert_all_kernels_in_model(model):
   """Converts all convolution kernels in a model from Theano to TensorFlow.
 
diff --git a/tensorflow/python/keras/_impl/keras/utils/np_utils.py b/tensorflow/python/keras/_impl/keras/utils/np_utils.py
index 3dddb99191c8a40adf8f39216679a0975d4e830c..231833e7760f1824df673ce84bc5d77df91721b0 100644
--- a/tensorflow/python/keras/_impl/keras/utils/np_utils.py
+++ b/tensorflow/python/keras/_impl/keras/utils/np_utils.py
@@ -1,4 +1,4 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -18,10 +18,8 @@ from __future__ import division
 from __future__ import print_function
 
 import numpy as np
-from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export('keras.utils.to_categorical')
 def to_categorical(y, num_classes=None):
   """Converts a class vector (integers) to binary class matrix.
 
@@ -50,7 +48,6 @@ def to_categorical(y, num_classes=None):
   return categorical
 
 
-@tf_export('keras.utils.normalize')
 def normalize(x, axis=-1, order=2):
   """Normalizes a Numpy array.
 
diff --git a/tensorflow/python/keras/_impl/keras/utils/vis_utils.py b/tensorflow/python/keras/_impl/keras/utils/vis_utils.py
index 1ec8e3a2bf6d539655b4417cbd413a926978cee2..0c5f2c19c79d5007882dcbc5d14a4cc8dd57ce3d 100644
--- a/tensorflow/python/keras/_impl/keras/utils/vis_utils.py
+++ b/tensorflow/python/keras/_impl/keras/utils/vis_utils.py
@@ -1,4 +1,4 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,31 +12,29 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+# pylint: disable=protected-access
+# pylint: disable=g-import-not-at-top
 """Utilities related to model visualization."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
 import os
-import sys
-from tensorflow.python.util.tf_export import tf_export
+
 
 try:
   # pydot-ng is a fork of pydot that is better maintained.
-  import pydot_ng as pydot  # pylint: disable=g-import-not-at-top
+  import pydot_ng as pydot
 except ImportError:
-  # Fall back on pydot if necessary.
-  # Silence a `print` statement that occurs in case of import error,
-  # by temporarily replacing sys.stdout.
-  _stdout = sys.stdout
-  sys.stdout = sys.stderr
+  # pydotplus is an improved version of pydot
   try:
-    import pydot  # pylint: disable=g-import-not-at-top
+    import pydotplus as pydot
   except ImportError:
-    pydot = None
-  finally:
-    # Restore sys.stdout.
-    sys.stdout = _stdout
+    # Fall back on pydot if necessary.
+    try:
+      import pydot
+    except ImportError:
+      pydot = None
 
 
 def _check_pydot():
@@ -66,8 +64,8 @@ def model_to_dot(model, show_shapes=False, show_layer_names=True, rankdir='TB'):
   Returns:
       A `pydot.Dot` instance representing the Keras model.
   """
-  from tensorflow.python.keras._impl.keras.layers.wrappers import Wrapper  # pylint: disable=g-import-not-at-top
-  from tensorflow.python.keras._impl.keras.models import Sequential  # pylint: disable=g-import-not-at-top
+  from tensorflow.python.keras._impl.keras.layers.wrappers import Wrapper
+  from tensorflow.python.keras._impl.keras.models import Sequential
 
   _check_pydot()
   dot = pydot.Dot()
@@ -119,9 +117,9 @@ def model_to_dot(model, show_shapes=False, show_layer_names=True, rankdir='TB'):
   # Connect nodes with edges.
   for layer in layers:
     layer_id = str(id(layer))
-    for i, node in enumerate(layer._inbound_nodes):  # pylint: disable=protected-access
+    for i, node in enumerate(layer._inbound_nodes):
       node_key = layer.name + '_ib-' + str(i)
-      if node_key in model._network_nodes:  # pylint: disable=protected-access
+      if node_key in model._container_nodes:
         for inbound_layer in node.inbound_layers:
           inbound_layer_id = str(id(inbound_layer))
           layer_id = str(id(layer))
@@ -129,7 +127,6 @@ def model_to_dot(model, show_shapes=False, show_layer_names=True, rankdir='TB'):
   return dot
 
 
-@tf_export('keras.utils.plot_model')
 def plot_model(model,
                to_file='model.png',
                show_shapes=False,
diff --git a/tensorflow/python/keras/_impl/keras/wrappers/scikit_learn.py b/tensorflow/python/keras/_impl/keras/wrappers/scikit_learn.py
index bc788d874f663caefd46d56fbf715a802fe08ec1..223ceac3deae643fa07594d10e551bea581eb641 100644
--- a/tensorflow/python/keras/_impl/keras/wrappers/scikit_learn.py
+++ b/tensorflow/python/keras/_impl/keras/wrappers/scikit_learn.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""API wrapper allowing to use certain Keras models with the Scikit-Learn API.
+"""Wrapper for using the Scikit-Learn API with Keras models.
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -24,8 +24,8 @@ import types
 import numpy as np
 
 from tensorflow.python.keras._impl.keras.models import Sequential
+from tensorflow.python.keras._impl.keras.utils.generic_utils import has_arg
 from tensorflow.python.keras._impl.keras.utils.np_utils import to_categorical
-from tensorflow.python.util import tf_inspect
 
 
 class BaseWrapper(object):
@@ -75,7 +75,7 @@ class BaseWrapper(object):
     self.check_params(sk_params)
 
   def check_params(self, params):
-    """Checks for user typos in "params".
+    """Checks for user typos in `params`.
 
     Arguments:
         params: dictionary; the parameters to be checked
@@ -95,13 +95,11 @@ class BaseWrapper(object):
     else:
       legal_params_fns.append(self.build_fn)
 
-    legal_params = []
-    for fn in legal_params_fns:
-      legal_params += tf_inspect.getargspec(fn)[0]
-    legal_params = set(legal_params)
-
     for params_name in params:
-      if params_name not in legal_params:
+      for fn in legal_params_fns:
+        if has_arg(fn, params_name):
+          break
+      else:
         if params_name != 'nb_epoch':
           raise ValueError('{} is not a legal parameter'.format(params_name))
 
@@ -136,10 +134,10 @@ class BaseWrapper(object):
 
     Arguments:
         x : array-like, shape `(n_samples, n_features)`
-            Training samples where n_samples in the number of samples
-            and n_features is the number of features.
+            Training samples where `n_samples` is the number of samples
+            and `n_features` is the number of features.
         y : array-like, shape `(n_samples,)` or `(n_samples, n_outputs)`
-            True labels for X.
+            True labels for `x`.
         **kwargs: dictionary arguments
             Legal arguments are the arguments of `Sequential.fit`
 
@@ -170,21 +168,20 @@ class BaseWrapper(object):
     return history
 
   def filter_sk_params(self, fn, override=None):
-    """Filters `sk_params` and return those in `fn`'s arguments.
+    """Filters `sk_params` and returns those in `fn`'s arguments.
 
     Arguments:
         fn : arbitrary function
-        override: dictionary, values to override sk_params
+        override: dictionary, values to override `sk_params`
 
     Returns:
-        res : dictionary dictionary containing variables
-            in both sk_params and fn's arguments.
+        res : dictionary containing variables
+            in both `sk_params` and `fn`'s arguments.
     """
     override = override or {}
     res = {}
-    fn_args = tf_inspect.getargspec(fn)[0]
     for name, value in self.sk_params.items():
-      if name in fn_args:
+      if has_arg(fn, name):
         res.update({name: value})
     res.update(override)
     return res
@@ -199,10 +196,10 @@ class KerasClassifier(BaseWrapper):
 
     Arguments:
         x : array-like, shape `(n_samples, n_features)`
-            Training samples where n_samples in the number of samples
-            and n_features is the number of features.
+            Training samples where `n_samples` is the number of samples
+            and `n_features` is the number of features.
         y : array-like, shape `(n_samples,)` or `(n_samples, n_outputs)`
-            True labels for X.
+            True labels for `x`.
         **kwargs: dictionary arguments
             Legal arguments are the arguments of `Sequential.fit`
 
@@ -229,8 +226,8 @@ class KerasClassifier(BaseWrapper):
 
     Arguments:
         x: array-like, shape `(n_samples, n_features)`
-            Test samples where n_samples in the number of samples
-            and n_features is the number of features.
+            Test samples where `n_samples` is the number of samples
+            and `n_features` is the number of features.
         **kwargs: dictionary arguments
             Legal arguments are the arguments
             of `Sequential.predict_classes`.
@@ -248,8 +245,8 @@ class KerasClassifier(BaseWrapper):
 
     Arguments:
         x: array-like, shape `(n_samples, n_features)`
-            Test samples where n_samples in the number of samples
-            and n_features is the number of features.
+            Test samples where `n_samples` is the number of samples
+            and `n_features` is the number of features.
         **kwargs: dictionary arguments
             Legal arguments are the arguments
             of `Sequential.predict_classes`.
@@ -258,8 +255,8 @@ class KerasClassifier(BaseWrapper):
         proba: array-like, shape `(n_samples, n_outputs)`
             Class probability estimates.
             In the case of binary classification,
-            tp match the scikit-learn API,
-            will return an array of shape '(n_samples, 2)'
+            to match the scikit-learn API,
+            will return an array of shape `(n_samples, 2)`
             (instead of `(n_sample, 1)` as in Keras).
     """
     kwargs = self.filter_sk_params(Sequential.predict_proba, kwargs)
@@ -276,16 +273,16 @@ class KerasClassifier(BaseWrapper):
 
     Arguments:
         x: array-like, shape `(n_samples, n_features)`
-            Test samples where n_samples in the number of samples
-            and n_features is the number of features.
+            Test samples where `n_samples` is the number of samples
+            and `n_features` is the number of features.
         y: array-like, shape `(n_samples,)` or `(n_samples, n_outputs)`
-            True labels for x.
+            True labels for `x`.
         **kwargs: dictionary arguments
             Legal arguments are the arguments of `Sequential.evaluate`.
 
     Returns:
         score: float
-            Mean accuracy of predictions on X wrt. y.
+            Mean accuracy of predictions on `x` wrt. `y`.
 
     Raises:
         ValueError: If the underlying model isn't configured to
@@ -321,8 +318,8 @@ class KerasRegressor(BaseWrapper):
 
     Arguments:
         x: array-like, shape `(n_samples, n_features)`
-            Test samples where n_samples in the number of samples
-            and n_features is the number of features.
+            Test samples where `n_samples` is the number of samples
+            and `n_features` is the number of features.
         **kwargs: dictionary arguments
             Legal arguments are the arguments of `Sequential.predict`.
 
@@ -338,16 +335,16 @@ class KerasRegressor(BaseWrapper):
 
     Arguments:
         x: array-like, shape `(n_samples, n_features)`
-            Test samples where n_samples in the number of samples
-            and n_features is the number of features.
+            Test samples where `n_samples` is the number of samples
+            and `n_features` is the number of features.
         y: array-like, shape `(n_samples,)`
-            True labels for X.
+            True labels for `x`.
         **kwargs: dictionary arguments
             Legal arguments are the arguments of `Sequential.evaluate`.
 
     Returns:
         score: float
-            Mean accuracy of predictions on X wrt. y.
+            Mean accuracy of predictions on `x` wrt. `y`.
     """
     kwargs = self.filter_sk_params(Sequential.evaluate, kwargs)
     loss = self.model.evaluate(x, y, **kwargs)
diff --git a/tensorflow/python/keras/applications/__init__.py b/tensorflow/python/keras/applications/__init__.py
index 34f1435ffb6b65ef0e1399fb6893c3b791616f79..fccedf919a7b261bb30f332172b1388db9da1939 100644
--- a/tensorflow/python/keras/applications/__init__.py
+++ b/tensorflow/python/keras/applications/__init__.py
@@ -18,16 +18,23 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.keras.applications import densenet
 from tensorflow.python.keras.applications import inception_resnet_v2
 from tensorflow.python.keras.applications import inception_v3
 from tensorflow.python.keras.applications import mobilenet
+from tensorflow.python.keras.applications import nasnet
 from tensorflow.python.keras.applications import resnet50
 from tensorflow.python.keras.applications import vgg16
 from tensorflow.python.keras.applications import vgg19
 from tensorflow.python.keras.applications import xception
+from tensorflow.python.keras.applications.densenet import DenseNet121
+from tensorflow.python.keras.applications.densenet import DenseNet169
+from tensorflow.python.keras.applications.densenet import DenseNet201
 from tensorflow.python.keras.applications.inception_resnet_v2 import InceptionResNetV2
 from tensorflow.python.keras.applications.inception_v3 import InceptionV3
 from tensorflow.python.keras.applications.mobilenet import MobileNet
+from tensorflow.python.keras.applications.nasnet import NASNetLarge
+from tensorflow.python.keras.applications.nasnet import NASNetMobile
 from tensorflow.python.keras.applications.resnet50 import ResNet50
 from tensorflow.python.keras.applications.vgg16 import VGG16
 from tensorflow.python.keras.applications.vgg19 import VGG19
diff --git a/tensorflow/python/keras/applications/densenet/__init__.py b/tensorflow/python/keras/applications/densenet/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..6b8ea83920733a3a442171616ab460ffaf831521
--- /dev/null
+++ b/tensorflow/python/keras/applications/densenet/__init__.py
@@ -0,0 +1,29 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""DenseNet Keras applications."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.keras._impl.keras.applications.densenet import decode_predictions
+from tensorflow.python.keras._impl.keras.applications.densenet import DenseNet121
+from tensorflow.python.keras._impl.keras.applications.densenet import DenseNet169
+from tensorflow.python.keras._impl.keras.applications.densenet import DenseNet201
+from tensorflow.python.keras._impl.keras.applications.densenet import preprocess_input
+
+del absolute_import
+del division
+del print_function
diff --git a/tensorflow/python/keras/applications/nasnet/__init__.py b/tensorflow/python/keras/applications/nasnet/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..94eb145b85b85b2e52ca37e7aebc681c1f054e16
--- /dev/null
+++ b/tensorflow/python/keras/applications/nasnet/__init__.py
@@ -0,0 +1,28 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""NASNet Keras applications."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.keras._impl.keras.applications.nasnet import decode_predictions
+from tensorflow.python.keras._impl.keras.applications.nasnet import NASNetLarge
+from tensorflow.python.keras._impl.keras.applications.nasnet import NASNetMobile
+from tensorflow.python.keras._impl.keras.applications.nasnet import preprocess_input
+
+del absolute_import
+del division
+del print_function
diff --git a/tensorflow/python/keras/layers/__init__.py b/tensorflow/python/keras/layers/__init__.py
index b94bf8f0f67a7a8ddbb351d13cb17ccdbf283260..84ee5040dcd7b118a5c63b6532135913fe238797 100644
--- a/tensorflow/python/keras/layers/__init__.py
+++ b/tensorflow/python/keras/layers/__init__.py
@@ -30,6 +30,7 @@ from tensorflow.python.keras._impl.keras.layers.advanced_activations import Leak
 from tensorflow.python.keras._impl.keras.layers.advanced_activations import PReLU
 from tensorflow.python.keras._impl.keras.layers.advanced_activations import ELU
 from tensorflow.python.keras._impl.keras.layers.advanced_activations import ThresholdedReLU
+from tensorflow.python.keras._impl.keras.layers.advanced_activations import Softmax
 
 # Convolution layers.
 from tensorflow.python.keras._impl.keras.layers.convolutional import Conv1D
@@ -37,6 +38,7 @@ from tensorflow.python.keras._impl.keras.layers.convolutional import Conv2D
 from tensorflow.python.keras._impl.keras.layers.convolutional import Conv3D
 from tensorflow.python.keras._impl.keras.layers.convolutional import Conv2DTranspose
 from tensorflow.python.keras._impl.keras.layers.convolutional import Conv3DTranspose
+from tensorflow.python.keras._impl.keras.layers.convolutional import SeparableConv1D
 from tensorflow.python.keras._impl.keras.layers.convolutional import SeparableConv2D
 
 # Convolution layer aliases.
@@ -45,6 +47,7 @@ from tensorflow.python.keras._impl.keras.layers.convolutional import Convolution
 from tensorflow.python.keras._impl.keras.layers.convolutional import Convolution3D
 from tensorflow.python.keras._impl.keras.layers.convolutional import Convolution2DTranspose
 from tensorflow.python.keras._impl.keras.layers.convolutional import Convolution3DTranspose
+from tensorflow.python.keras._impl.keras.layers.convolutional import SeparableConvolution1D
 from tensorflow.python.keras._impl.keras.layers.convolutional import SeparableConvolution2D
 
 # Image processing layers.
diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD
index 8c1d16c2a8fc2ed1130d81c46aa233bf8416caf8..3a6058054be4fad05daf20703384d57624d95deb 100644
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@@ -1043,6 +1043,7 @@ tf_py_test(
         "//tensorflow/python:training",
         "//tensorflow/python:util",
         "//tensorflow/python:variables",
+        "//tensorflow/python/eager:function",
     ],
 )
 
@@ -1600,6 +1601,19 @@ cuda_py_test(
     ],
 )
 
+cuda_py_test(
+    name = "manip_ops_test",
+    size = "small",
+    srcs = ["manip_ops_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:manip_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+    ],
+    tags = ["no_windows_gpu"],
+)
+
 cuda_py_test(
     name = "matmul_op_test",
     size = "small",
@@ -2821,7 +2835,7 @@ tf_py_test(
         "//tensorflow/python:random_ops",
         "//tensorflow/python:variables",
     ],
-    shard_count = 3,
+    shard_count = 10,
     tags = ["no_windows_gpu"],
 )
 
diff --git a/tensorflow/python/kernel_tests/array_ops_test.py b/tensorflow/python/kernel_tests/array_ops_test.py
index ec6184aacdb1ee6376944114ace3f1c1c1407aa9..7cff3e227c74c246eb4ee158abf0026173b6701a 100644
--- a/tensorflow/python/kernel_tests/array_ops_test.py
+++ b/tensorflow/python/kernel_tests/array_ops_test.py
@@ -82,7 +82,9 @@ class BatchMatrixTransposeTest(test_util.TensorFlowTestCase):
       matrix_ph = array_ops.placeholder(dtypes.int32)
       transposed = array_ops.matrix_transpose(matrix_ph)
       self.assertAllEqual(
-          expected_transposed, transposed.eval(feed_dict={matrix_ph: matrix}))
+          expected_transposed, transposed.eval(feed_dict={
+              matrix_ph: matrix
+          }))
 
   def testBatchMatrixDynamicallyDefined(self):
     matrix_0 = [[1, 2, 3], [4, 5, 6]]
@@ -96,7 +98,9 @@ class BatchMatrixTransposeTest(test_util.TensorFlowTestCase):
       transposed = array_ops.matrix_transpose(batch_matrix_ph)
       self.assertAllEqual(
           expected_transposed,
-          transposed.eval(feed_dict={batch_matrix_ph: batch_matrix}))
+          transposed.eval(feed_dict={
+              batch_matrix_ph: batch_matrix
+          }))
 
   def testTensorWithStaticRankLessThanTwoRaisesBecauseNotAMatrix(self):
     vector = [1, 2, 3]
@@ -203,8 +207,10 @@ class BooleanMaskTest(test_util.TensorFlowTestCase):
 
       masked_tensor = sess.run(
           array_ops.boolean_mask(ph_tensor, ph_mask),
-          feed_dict={ph_tensor: arr,
-                     ph_mask: mask})
+          feed_dict={
+              ph_tensor: arr,
+              ph_mask: mask
+          })
       np.testing.assert_allclose(masked_tensor, arr[mask])
 
   def testMaskDimensionsSetToNoneRaises(self):
@@ -280,7 +286,8 @@ class ReverseV2Test(test_util.TensorFlowTestCase):
       for axis_dtype in [dtypes.int32, dtypes.int64]:
         with self.test_session(use_gpu=use_gpu):
           x_tf = array_ops.reverse_v2(x_np,
-              constant_op.constant([0], dtype=axis_dtype)).eval()
+                                      constant_op.constant(
+                                          [0], dtype=axis_dtype)).eval()
           self.assertAllEqual(x_tf, np.asarray(x_np)[::-1])
 
   def _reverse2DimAuto(self, np_dtype):
@@ -290,16 +297,17 @@ class ReverseV2Test(test_util.TensorFlowTestCase):
       for use_gpu in [False, True]:
         for axis_dtype in [dtypes.int32, dtypes.int64]:
           with self.test_session(use_gpu=use_gpu):
-            x_tf_1 = reverse_f(x_np,
-                constant_op.constant([0], dtype=axis_dtype)).eval()
-            x_tf_2 = reverse_f(x_np,
-                constant_op.constant([-2], dtype=axis_dtype)).eval()
-            x_tf_3 = reverse_f(x_np,
-                constant_op.constant([1], dtype=axis_dtype)).eval()
-            x_tf_4 = reverse_f(x_np,
-                constant_op.constant([-1], dtype=axis_dtype)).eval()
+            x_tf_1 = reverse_f(x_np, constant_op.constant(
+                [0], dtype=axis_dtype)).eval()
+            x_tf_2 = reverse_f(x_np, constant_op.constant(
+                [-2], dtype=axis_dtype)).eval()
+            x_tf_3 = reverse_f(x_np, constant_op.constant(
+                [1], dtype=axis_dtype)).eval()
+            x_tf_4 = reverse_f(x_np, constant_op.constant(
+                [-1], dtype=axis_dtype)).eval()
             x_tf_5 = reverse_f(x_np,
-                constant_op.constant([1, 0], dtype=axis_dtype)).eval()
+                               constant_op.constant([1, 0],
+                                                    dtype=axis_dtype)).eval()
             self.assertAllEqual(x_tf_1, np.asarray(x_np)[::-1, :])
             self.assertAllEqual(x_tf_2, np.asarray(x_np)[::-1, :])
             self.assertAllEqual(x_tf_3, np.asarray(x_np)[:, ::-1])
@@ -324,18 +332,16 @@ class ReverseV2Test(test_util.TensorFlowTestCase):
 
   def testReverse1DimAuto(self):
     for dtype in [
-        np.uint8, np.int8, np.uint16, np.int16, np.int32, np.int64,
-        np.bool, np.float16, np.float32,
-        np.float64, np.complex64, np.complex128,
+        np.uint8, np.int8, np.uint16, np.int16, np.int32, np.int64, np.bool,
+        np.float16, np.float32, np.float64, np.complex64, np.complex128,
         np.array(b"").dtype.type
     ]:
       self._reverse1DimAuto(dtype)
 
   def testReverse2DimAuto(self):
     for dtype in [
-        np.uint8, np.int8, np.uint16, np.int16, np.int32, np.int64,
-        np.bool, np.float16, np.float32,
-        np.float64, np.complex64, np.complex128,
+        np.uint8, np.int8, np.uint16, np.int16, np.int32, np.int64, np.bool,
+        np.float16, np.float32, np.float64, np.complex64, np.complex128,
         np.array(b"").dtype.type
     ]:
       self._reverse2DimAuto(dtype)
@@ -711,8 +717,8 @@ class GradSliceChecker(object):
     slice_val_grad2, = gradients_impl.gradients(
         slice_val_grad, dy, grad_ys=self.var)
     self.sess.run(assign)
-    slice_val_grad_evaled, slice_val_grad2_evaled = (self.sess.run(
-        [slice_val_grad, slice_val_grad2]))
+    slice_val_grad_evaled, slice_val_grad2_evaled = (
+        self.sess.run([slice_val_grad, slice_val_grad2]))
     analytic_grad2_evaled = analytic_grad2.eval()
     self.test.assertAllEqual(slice_val_grad2_evaled, analytic_grad2_evaled)
 
@@ -946,6 +952,32 @@ class SliceAssignTest(test_util.TensorFlowTestCase):
         v = variables.Variable([1, 2])
         sess.run(v[:].assign([1, 2]))
 
+  def testTypeError(self):
+    init_val = constant_op.constant([1, 2], dtype=dtypes.int32)
+    too_small_val = constant_op.constant([3, 4], dtype=dtypes.int8)
+    too_large_val = constant_op.constant([3, 4], dtype=dtypes.int64)
+    v = variables.Variable(init_val)
+    with self.assertRaises(TypeError):
+      v[:].assign(too_small_val)
+    with self.assertRaises(TypeError):
+      v[:].assign(too_large_val)
+
+  def testTypeErrorResource(self):
+    init_val = constant_op.constant([1, 2], dtype=dtypes.int32)
+    too_small_val = constant_op.constant([3, 4], dtype=dtypes.int8)
+    too_large_val = constant_op.constant([3, 4], dtype=dtypes.int64)
+    v = resource_variable_ops.ResourceVariable(init_val)
+    with self.test_session() as sess:
+      sess.run(v.initializer)
+      with self.assertRaisesRegexp(
+          errors.InvalidArgumentError,
+          "l-value dtype int32 does not match r-value dtype int64"):
+        sess.run(v[:].assign(too_large_val))
+      with self.assertRaisesRegexp(
+          errors.InvalidArgumentError,
+          "l-value dtype int32 does not match r-value dtype int8"):
+        sess.run(v[:].assign(too_small_val))
+
 
 class ShapeSizeRankTest(test_util.TensorFlowTestCase):
 
@@ -987,9 +1019,10 @@ class SequenceMaskTest(test_util.TensorFlowTestCase):
     with self.test_session():
       res = array_ops.sequence_mask(constant_op.constant([1, 3, 2]), 5)
       self.assertAllEqual(res.get_shape(), [3, 5])
-      self.assertAllEqual(res.eval(), [[True, False, False, False, False],
-                                       [True, True, True, False, False],
-                                       [True, True, False, False, False]])
+      self.assertAllEqual(
+          res.eval(),
+          [[True, False, False, False, False], [True, True, True, False, False],
+           [True, True, False, False, False]])
 
       # test dtype and default maxlen:
       res = array_ops.sequence_mask(
@@ -998,17 +1031,17 @@ class SequenceMaskTest(test_util.TensorFlowTestCase):
         self.assertAllEqual(res.get_shape().as_list(), [3, 4])
       else:
         self.assertAllEqual(res.get_shape().as_list(), [3, None])
-      self.assertAllEqual(res.eval(), [[0.0, 0.0, 0.0,
-                                        0.0], [1.0, 0.0, 0.0, 0.0],
-                                       [1.0, 1.0, 1.0, 1.0]])
+      self.assertAllEqual(
+          res.eval(),
+          [[0.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0], [1.0, 1.0, 1.0, 1.0]])
 
   def testTwoDimensional(self):
     with self.test_session():
       res = array_ops.sequence_mask(constant_op.constant([[1, 3, 2]]), 5)
       self.assertAllEqual(res.get_shape(), [1, 3, 5])
-      self.assertAllEqual(res.eval(), [[[True, False, False, False, False],
-                                        [True, True, True, False, False],
-                                        [True, True, False, False, False]]])
+      self.assertAllEqual(res.eval(), [[[True, False, False, False, False], [
+          True, True, True, False, False
+      ], [True, True, False, False, False]]])
 
       # test dtype and default maxlen:
       res = array_ops.sequence_mask(
@@ -1017,12 +1050,10 @@ class SequenceMaskTest(test_util.TensorFlowTestCase):
         self.assertAllEqual(res.get_shape().as_list(), [2, 3, 4])
       else:
         self.assertAllEqual(res.get_shape().as_list(), [2, 3, None])
-      self.assertAllEqual(res.eval(), [[[0.0, 0.0, 0.0, 0.0],
-                                        [1.0, 0.0, 0.0, 0.0],
-                                        [1.0, 1.0, 1.0, 1.0]],
-                                       [[1.0, 0.0, 0.0, 0.0],
-                                        [1.0, 1.0, 0.0, 0.0],
-                                        [1.0, 1.0, 1.0, 0.0]]])
+      self.assertAllEqual(
+          res.eval(),
+          [[[0.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0], [1.0, 1.0, 1.0, 1.0]],
+           [[1.0, 0.0, 0.0, 0.0], [1.0, 1.0, 0.0, 0.0], [1.0, 1.0, 1.0, 0.0]]])
 
   def testDtypes(self):
 
@@ -1031,9 +1062,10 @@ class SequenceMaskTest(test_util.TensorFlowTestCase):
           constant_op.constant([1, 3, 2], dtype=lengths_dtype),
           constant_op.constant(5, dtype=maxlen_dtype))
       self.assertAllEqual(res.get_shape(), [3, 5])
-      self.assertAllEqual(res.eval(), [[True, False, False, False, False],
-                                       [True, True, True, False, False],
-                                       [True, True, False, False, False]])
+      self.assertAllEqual(
+          res.eval(),
+          [[True, False, False, False, False], [True, True, True, False, False],
+           [True, True, False, False, False]])
 
     with self.test_session():
       check_dtypes(dtypes.int32, dtypes.int32)
@@ -1088,13 +1120,14 @@ class PadTest(test_util.TensorFlowTestCase):
   def testEager(self):
     with context.eager_mode():
       t = constant_op.constant([[1, 2, 3], [4, 5, 6]])
-      paddings = constant_op.constant([[1, 1,], [2, 2]])
+      paddings = constant_op.constant([[
+          1,
+          1,
+      ], [2, 2]])
       padded = array_ops.pad(t, paddings, "CONSTANT")
       self.assertAllEqual(padded.numpy(),
-                          [[0, 0, 0, 0, 0, 0, 0],
-                           [0, 0, 1, 2, 3, 0, 0],
-                           [0, 0, 4, 5, 6, 0, 0],
-                           [0, 0, 0, 0, 0, 0, 0]])
+                          [[0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 2, 3, 0, 0],
+                           [0, 0, 4, 5, 6, 0, 0], [0, 0, 0, 0, 0, 0, 0]])
 
 
 class InvertPermutationTest(test_util.TensorFlowTestCase):
@@ -1107,6 +1140,26 @@ class InvertPermutationTest(test_util.TensorFlowTestCase):
         self.assertAllEqual(y.get_shape(), [5])
         self.assertAllEqual(y.eval(), [2, 4, 3, 0, 1])
 
+class UnravelIndexTest(test_util.TensorFlowTestCase):
+
+  def testUnravelIndex(self):
+    with self.test_session():
+      for dtype in [dtypes.int32, dtypes.int64]:
+        indices_1 = constant_op.constant(1621, dtype=dtype)
+        dims_1 = constant_op.constant([6, 7, 8, 9], dtype=dtype)
+        out_1 = array_ops.unravel_index(indices_1, dims_1)
+        self.assertAllEqual(out_1.eval(), [3, 1, 4, 1])
+
+        indices_2 = constant_op.constant([1621], dtype=dtype)
+        dims_2 = constant_op.constant([6, 7, 8, 9], dtype=dtype)
+        out_2 = array_ops.unravel_index(indices_2, dims_2)
+        self.assertAllEqual(out_2.eval(), [[3], [1], [4], [1]])
+
+        indices_3 = constant_op.constant([22, 41, 37], dtype=dtype)
+        dims_3 = constant_op.constant([7, 6], dtype=dtype)
+        out_3 = array_ops.unravel_index(indices_3, dims_3)
+        self.assertAllEqual(out_3.eval(), [[3, 6, 6], [4, 5, 1]])
+
 
 class GuaranteeConstOpTest(test_util.TensorFlowTestCase):
 
diff --git a/tensorflow/python/kernel_tests/atrous_convolution_test.py b/tensorflow/python/kernel_tests/atrous_convolution_test.py
index 04248fb2bab4333ed164f7871d2e9d5002dc52ad..2d1b3d9b7e836591646a2d0e59742bf6139446d1 100644
--- a/tensorflow/python/kernel_tests/atrous_convolution_test.py
+++ b/tensorflow/python/kernel_tests/atrous_convolution_test.py
@@ -81,6 +81,7 @@ class AtrousConvolutionTest(test.TestCase):
       otherwise, it's delayed after the context.
     """
     checks = []
+
     def add_check(check, *args, **kwargs):
       if context.in_eager_mode():
         args_val, kwargs_val = self.evaluate([args, kwargs])
@@ -96,12 +97,12 @@ class AtrousConvolutionTest(test.TestCase):
 
   def _test_atrous_convolution(self, add_check, input_shape, filter_shape,
                                dilation_rate, **kwargs):
-    filters = np.arange(np.prod(filter_shape),
-                        dtype=np.float32).reshape(filter_shape)
+    filters = np.arange(
+        np.prod(filter_shape), dtype=np.float32).reshape(filter_shape)
     filters_upsampled = upsample_filters(filters, dilation_rate)
     x = np.arange(np.prod(input_shape), dtype=np.float32).reshape(input_shape)
-    y1 = nn_ops.convolution(input=x, filter=filters,
-                            dilation_rate=dilation_rate, **kwargs)
+    y1 = nn_ops.convolution(
+        input=x, filter=filters, dilation_rate=dilation_rate, **kwargs)
     y2 = nn_ops.convolution(input=x, filter=filters_upsampled, **kwargs)
 
     def check(y1_eval, y2_eval):
@@ -112,13 +113,15 @@ class AtrousConvolutionTest(test.TestCase):
   def test_unknown_spatial_dims_for_channel_last_format(self):
     x = array_ops.placeholder(dtypes.float32, [1, None, None, 10])
     w = array_ops.zeros([3, 3, 10, 20])
-    y = nn_ops.convolution(x, w, "VALID", dilation_rate=[2, 2], data_format="NHWC")
+    y = nn_ops.convolution(
+        x, w, "VALID", dilation_rate=[2, 2], data_format="NHWC")
     self.assertEqual(y.shape.as_list(), [1, None, None, 20])
 
   def test_unknown_spatial_dims_for_channel_first_format(self):
     x = array_ops.placeholder(dtypes.float32, [1, 10, None, None])
     w = array_ops.zeros([3, 3, 10, 20])
-    y = nn_ops.convolution(x, w, "VALID", dilation_rate=[2, 2], data_format="NCHW")
+    y = nn_ops.convolution(
+        x, w, "VALID", dilation_rate=[2, 2], data_format="NCHW")
     self.assertEqual(y.shape.as_list(), [1, 20, None, None])
 
   @test_util.run_in_graph_and_eager_modes()
@@ -215,28 +218,35 @@ class AtrousConvolutionTest(test.TestCase):
 
               def combined_op(converted_input, num_spatial_dims, padding_arg):  # pylint: disable=unused-argument
                 # pylint: disable=cell-var-from-loop
-                result = nn_ops.convolution(input=converted_input, filter=f1,
-                                            padding=padding)
-                result = nn_ops.convolution(input=result, filter=f2,
-                                            padding=padding)
+                result = nn_ops.convolution(
+                    input=converted_input, filter=f1, padding=padding)
+                result = nn_ops.convolution(
+                    input=result, filter=f2, padding=padding)
                 # pylint: enable=cell-var-from-loop
                 return result
 
               for rate_height in range(2, 4):
                 for rate_width in range(2, 4):
                   dilation_rate = [rate_height, rate_width]
-                  y1 = nn_ops.convolution(input=x, filter=f1, padding=padding,
-                                          dilation_rate=dilation_rate)
-                  y1 = nn_ops.convolution(input=y1, filter=f2,
-                                          padding=padding,
-                                          dilation_rate=dilation_rate)
+                  y1 = nn_ops.convolution(
+                      input=x,
+                      filter=f1,
+                      padding=padding,
+                      dilation_rate=dilation_rate)
+                  y1 = nn_ops.convolution(
+                      input=y1,
+                      filter=f2,
+                      padding=padding,
+                      dilation_rate=dilation_rate)
                   y2 = nn_ops.with_space_to_batch(
-                      input=x, dilation_rate=dilation_rate, op=combined_op,
+                      input=x,
+                      dilation_rate=dilation_rate,
+                      op=combined_op,
                       padding="VALID")
 
                   def check(y1_eval, y2_eval):
-                    self.assertAllClose(y1_eval, y2_eval, rtol=1e-2,
-                                        atol=1e-2)
+                    self.assertAllClose(y1_eval, y2_eval, rtol=1e-2, atol=1e-2)
+
                   add_check(check, y1, y2)
 
   def _test_gradient(self, x_shape, f_shape, dilation_rate, padding):
diff --git a/tensorflow/python/kernel_tests/candidate_sampler_ops_test.py b/tensorflow/python/kernel_tests/candidate_sampler_ops_test.py
index 88b3f20469a6a8d8e8181e8d5a3876ae22fb9c06..28b3dc45e9c5fd9aee0b4b7f71a5dc1b93c057ed 100644
--- a/tensorflow/python/kernel_tests/candidate_sampler_ops_test.py
+++ b/tensorflow/python/kernel_tests/candidate_sampler_ops_test.py
@@ -80,7 +80,7 @@ class RangeSamplerOpsTest(test.TestCase):
     with self.test_session():
       true_classes = constant_op.constant(
           [[1, 2], [0, 4], [3, 3]], dtype=dtypes.int64)
-      _, _, sampled_expected_count = candidate_sampling_ops.all_candidate_sampler(
+      _, _, sampled_expected_count = candidate_sampling_ops.all_candidate_sampler(  # pylint: disable=line-too-long
           true_classes, self.NUM_TRUE, self.NUM_SAMPLED, True)
       sampled_log_expected_count = math_ops.log(sampled_expected_count)
       result = sampled_log_expected_count.eval()
diff --git a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
index 6e18ed132cd6337378fdb8ec774f7946da8d61ed..5d648bb235c8a8a0ae435e0c249bcb64ba787b08 100644
--- a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
+++ b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
@@ -181,8 +181,8 @@ class ControlFlowTest(test.TestCase):
       self.assertEqual(enter_v_constant.shape, [2])
 
       # Otherwise, the shape should be unknown.
-      enter_v_non_constant = control_flow_ops.enter(v, "frame2",
-                                                    is_constant=False)
+      enter_v_non_constant = control_flow_ops.enter(
+          v, "frame2", is_constant=False)
       self.assertEqual(enter_v_non_constant.shape, None)
 
   def testSwitchMergeIndexedSlices(self):
@@ -736,24 +736,21 @@ class ControlFlowTest(test.TestCase):
     with self.test_session():
       s = constant_op.constant([1, 2, 3, 4, 5])
       r = isum(s, maximum_iterations=3)
-      self.assertAllEqual([1+3, 2+3, 3+3, 4+3, 5+3], r.eval())
+      self.assertAllEqual([1 + 3, 2 + 3, 3 + 3, 4 + 3, 5 + 3], r.eval())
 
   def testWhileWithMaximumIterationsAndSingleArgument(self):
     with self.test_session():
       r = control_flow_ops.while_loop(
-          lambda i: i < 3,
-          lambda i: i + 1,
-          [0],
-          maximum_iterations=1)
+          lambda i: i < 3, lambda i: i + 1, [0], maximum_iterations=1)
       self.assertEqual(1, r.eval())
 
   def testSingleNestedMaximumIterationsWhileLoopGradientInXLAContext(self):
     v = constant_op.constant(1.0)
+
     def training_loop_with_gradient(i):
       out = control_flow_ops.while_loop(
           lambda i_, _: i_ < 3,
-          lambda i_, j: [i_ + 1, j * v],
-          [0, 1.0],
+          lambda i_, j: [i_ + 1, j * v], [0, 1.0],
           maximum_iterations=i)
       g = gradients_impl.gradients(out, v)
       with ops.control_dependencies(g):
@@ -763,8 +760,8 @@ class ControlFlowTest(test.TestCase):
     xla_context.Enter()
     # Create training loop, ensure we can call gradient() of
     # while_loop inside the training loop.
-    loop = control_flow_ops.while_loop(
-        lambda i: i < 3, training_loop_with_gradient, [0])
+    loop = control_flow_ops.while_loop(lambda i: i < 3,
+                                       training_loop_with_gradient, [0])
     xla_context.Exit()
 
     loop_execute = array_ops.identity(loop)  # Because loop is not fetchable.
@@ -774,17 +771,18 @@ class ControlFlowTest(test.TestCase):
 
   def testInvalidMaximumIterationsWhileLoopGradientInXLAContext(self):
     v = constant_op.constant(1.0)
+
     def inner_body(i, x):
       out = control_flow_ops.while_loop(
           lambda i, _: i < 3,
-          lambda i, j: [i + 1, j * v],
-          [0, x],
+          lambda i, j: [i + 1, j * v], [0, x],
           maximum_iterations=i)
       return out
 
     def create_while_loop(maximum_iterations=None):
       return control_flow_ops.while_loop(
-          lambda i, _: i < 3, inner_body, [0, 1.0],
+          lambda i, _: i < 3,
+          inner_body, [0, 1.0],
           maximum_iterations=maximum_iterations)
 
     loop_no_xla = create_while_loop(maximum_iterations=5)
@@ -819,14 +817,17 @@ class ControlFlowTest(test.TestCase):
 
     def create_while_loop():
       max_iter_holder = []
+
       def create_mi():
         max_iter_holder.append(array_ops.placeholder(dtypes.int32, shape=()))
         return 1.0
-      _ = control_flow_ops.cond(constant_op.constant(True),
-                                create_mi, create_mi)
+
+      _ = control_flow_ops.cond(
+          constant_op.constant(True), create_mi, create_mi)
 
       return control_flow_ops.while_loop(
-          lambda i, _: i < 3, lambda i, x: (i + 1, v * x), (0, 1.0),
+          lambda i, _: i < 3,
+          lambda i, x: (i + 1, v * x), (0, 1.0),
           maximum_iterations=max_iter_holder[0])
 
     xla_context = control_flow_ops.XLAControlFlowContext()
@@ -849,28 +850,32 @@ class ControlFlowTest(test.TestCase):
     p = array_ops.placeholder(dtype=dtypes.int32)
 
     def mid_body_builder(iterations):
+
       def mid_body(i, x):
         r = control_flow_ops.while_loop(
             lambda *_: True,
-            lambda i, x: (i + 1, v * x),
-            (0, x),
-            maximum_iterations=iterations, name="inner")
+            lambda i, x: (i + 1, v * x), (0, x),
+            maximum_iterations=iterations,
+            name="inner")
         return (i + 1, gradients_impl.gradients(x + r[1], v)[0])
+
       return mid_body
 
     def outer_body(i, x):
       iterations = array_ops.size(p, name="iterations")
-      return (
-          i + 1,
-          x + control_flow_ops.while_loop(
-              lambda *_: True, mid_body_builder(iterations), (0, x),
-              maximum_iterations=iterations, name="mid")[1])
+      return (i + 1, x + control_flow_ops.while_loop(
+          lambda *_: True,
+          mid_body_builder(iterations), (0, x),
+          maximum_iterations=iterations,
+          name="mid")[1])
 
     def create_while_loop():
       with ops.device("/cpu:0"):
         r = control_flow_ops.while_loop(
-            lambda *_: True, outer_body, (0, 1.0),
-            maximum_iterations=5, name="outer")
+            lambda *_: True,
+            outer_body, (0, 1.0),
+            maximum_iterations=5,
+            name="outer")
         return array_ops.identity(r[1])
 
     xla_context = control_flow_ops.XLAControlFlowContext()
@@ -881,18 +886,19 @@ class ControlFlowTest(test.TestCase):
     final_without_xla_context = create_while_loop()
 
     with self.test_session(use_gpu=False) as sess:
-      opts = config_pb2.RunOptions(
-          trace_level=config_pb2.RunOptions.FULL_TRACE)
+      opts = config_pb2.RunOptions(trace_level=config_pb2.RunOptions.FULL_TRACE)
       run_metadata = config_pb2.RunMetadata()
 
       final_value_without_xla_context = sess.run(
-          final_without_xla_context,
-          feed_dict={p: [0, 0, 0]})
+          final_without_xla_context, feed_dict={
+              p: [0, 0, 0]
+          })
 
       final_value_with_xla_context = sess.run(
           final_with_xla_context,
           feed_dict={p: [0, 0, 0]},
-          options=opts, run_metadata=run_metadata)
+          options=opts,
+          run_metadata=run_metadata)
 
       node_stats = run_metadata.step_stats.dev_stats[0].node_stats
       stack_push_count = len(
@@ -901,8 +907,8 @@ class ControlFlowTest(test.TestCase):
       # the last two "3"s comes from size(p), when p == [0, 0, 0].
       self.assertEqual(stack_push_count, 5 * 3 * 3)
 
-      self.assertAllClose(
-          final_value_with_xla_context, final_value_without_xla_context)
+      self.assertAllClose(final_value_with_xla_context,
+                          final_value_without_xla_context)
 
   # Have more than 10 parallel iterations and hence exercise k-bound
   # most of the time.
@@ -951,8 +957,7 @@ class ControlFlowTest(test.TestCase):
     with self.test_session():
 
       def compute(i, c, o):
-        c = array_ops.strided_slice(x,
-                                    array_ops.expand_dims(i, 0),
+        c = array_ops.strided_slice(x, array_ops.expand_dims(i, 0),
                                     [1] + array_ops.expand_dims(i, 0))
         o = array_ops.concat([o, c], 0)
         i = math_ops.add(i, 1)
@@ -963,11 +968,12 @@ class ControlFlowTest(test.TestCase):
       o = ops.convert_to_tensor([0])
       x = ops.convert_to_tensor([1, 2, 3, 4, 5, 6])
       s = array_ops.size(x)
-      r = control_flow_ops.while_loop(
-          lambda i, c, o: math_ops.less(i, s), compute, [i, c, o], [
-              i.get_shape(), tensor_shape.unknown_shape(),
-              tensor_shape.unknown_shape()
-          ])
+      r = control_flow_ops.while_loop(lambda i, c, o: math_ops.less(i, s),
+                                      compute, [i, c, o], [
+                                          i.get_shape(),
+                                          tensor_shape.unknown_shape(),
+                                          tensor_shape.unknown_shape()
+                                      ])
       result = r[2].eval()
     self.assertAllEqual(np.array([0, 1, 2, 3, 4, 5, 6]), result)
 
@@ -1033,7 +1039,8 @@ class ControlFlowTest(test.TestCase):
         return [new_i, new_j]
 
       r = control_flow_ops.while_loop(
-          c, _b, [i, m], [i.get_shape(), tensor_shape.unknown_shape()])
+          c, _b, [i, m],
+          [i.get_shape(), tensor_shape.unknown_shape()])
       r = r[1] * array_ops.ones([8, 8])
       self.assertAllEqual(np.ones((8, 8)), r.eval())
 
@@ -1065,7 +1072,8 @@ class ControlFlowTest(test.TestCase):
         return [new_i, new_j]
 
       r = control_flow_ops.while_loop(
-          c, b, [i, m], [i.get_shape(), tensor_shape.TensorShape([None, 2])])
+          c, b, [i, m],
+          [i.get_shape(), tensor_shape.TensorShape([None, 2])])
       self.assertTrue(r[1].get_shape()[0].value is None)
       self.assertEqual(r[1].get_shape()[1], tensor_shape.Dimension(2))
 
@@ -1092,20 +1100,22 @@ class ControlFlowTest(test.TestCase):
 
       def b(i, x):
         return [
-            i + 1, sparse_tensor.SparseTensor(x.indices, x.values * 2.0,
-                                              x.dense_shape)
+            i + 1,
+            sparse_tensor.SparseTensor(x.indices, x.values * 2.0, x.dense_shape)
         ]
 
       _, r = control_flow_ops.while_loop(c, b, [i, x])
       self.assertEqual(r.dense_shape.get_shape()[0].value, 1)
 
       _, r = control_flow_ops.while_loop(
-          c, b, [i, x], [i.get_shape(), tensor_shape.TensorShape([None])])
+          c, b, [i, x],
+          [i.get_shape(), tensor_shape.TensorShape([None])])
       self.assertTrue(r.dense_shape.get_shape()[0].value is None)
 
       with self.assertRaisesRegexp(ValueError, "is not compatible with"):
         _, r = control_flow_ops.while_loop(
-            c, b, [i, x], [i.get_shape(), tensor_shape.TensorShape([5])])
+            c, b, [i, x],
+            [i.get_shape(), tensor_shape.TensorShape([5])])
 
   def testWhileShapeInferenceIndexedSlices(self):
     with self.test_session():
@@ -1120,7 +1130,8 @@ class ControlFlowTest(test.TestCase):
 
       def b(i, x):
         return [
-            i + 1, ops.IndexedSlices(x.values * 2.0, x.indices, x.dense_shape)
+            i + 1,
+            ops.IndexedSlices(x.values * 2.0, x.indices, x.dense_shape)
         ]
 
       _, r = control_flow_ops.while_loop(c, b, [i, x])
@@ -1128,14 +1139,16 @@ class ControlFlowTest(test.TestCase):
       self.assertEqual(r.values.get_shape(), tensor_shape.TensorShape([2, 2]))
 
       _, r = control_flow_ops.while_loop(
-          c, b, [i, x], [i.get_shape(), tensor_shape.TensorShape([None, 2])])
+          c, b, [i, x],
+          [i.get_shape(), tensor_shape.TensorShape([None, 2])])
       self.assertEqual(r.dense_shape.get_shape()[0].value, 2)
       self.assertTrue(r.values.get_shape()[0].value is None)
       self.assertEqual(r.values.get_shape()[1].value, 2)
 
       with self.assertRaisesRegexp(ValueError, "is not compatible with"):
         _, r = control_flow_ops.while_loop(
-            c, b, [i, x], [i.get_shape(), tensor_shape.TensorShape([None, 5])])
+            c, b, [i, x],
+            [i.get_shape(), tensor_shape.TensorShape([None, 5])])
 
   def _testNestedWhile_1(self, use_gpu):
     with self.test_session(use_gpu=use_gpu):
@@ -1276,16 +1289,17 @@ class ControlFlowTest(test.TestCase):
           "v", [], initializer=init_ops.constant_initializer(2))
       i0 = constant_op.constant(0)
       with ops.control_dependencies([i0]):
+
         def loop_condition(i):
           return i < 4
 
         def loop_body(i):
           some_cond = control_flow_ops.cond(
               constant_op.constant(True),
-              lambda: state_ops.assign(v, math_ops.square(v)),
-              lambda: v)
+              lambda: state_ops.assign(v, math_ops.square(v)), lambda: v)
           with ops.control_dependencies([some_cond]):
             return i + 1
+
       r = control_flow_ops.while_loop(loop_condition, loop_body, (i0,))
       variables.global_variables_initializer().run()
       self.assertEqual(4, r.eval())
@@ -1600,7 +1614,8 @@ class ControlFlowTest(test.TestCase):
 
       _, rx = control_flow_ops.while_loop(
           c1,
-          b1, [r, x], [r.get_shape(), tensor_shape.unknown_shape()],
+          b1, [r, x],
+          [r.get_shape(), tensor_shape.unknown_shape()],
           parallel_iterations=1)
       self.assertEqual(45, rx.eval())
 
@@ -1663,7 +1678,8 @@ class ControlFlowTest(test.TestCase):
       b = lambda i, v: [i + 1, math_ops.multiply(x, v)]
       r = control_flow_ops.while_loop(
           c,
-          b, [n, v], [n.get_shape(), tensor_shape.unknown_shape()],
+          b, [n, v],
+          [n.get_shape(), tensor_shape.unknown_shape()],
           parallel_iterations=1)
 
       r = gradients_impl.gradients(r[1], x)[0]
@@ -1797,8 +1813,8 @@ class ControlFlowTest(test.TestCase):
       named = collections.namedtuple("named", ("a", "b"))
       loop_vars = [
           named(a=constant_op.constant(0.0), b=constant_op.constant(1.0)),
-          (constant_op.constant(2.0),
-           constant_op.constant(3.0)), constant_op.constant(4.0)
+          (constant_op.constant(2.0), constant_op.constant(3.0)),
+          constant_op.constant(4.0)
       ]
       c = lambda lv0, _1, _2: lv0.a < 100.0
 
@@ -1824,8 +1840,8 @@ class ControlFlowTest(test.TestCase):
       named = collections.namedtuple("named", ("a", "b"))
       loop_vars = [
           named(a=constant_op.constant(0.0), b=constant_op.constant(1.0)),
-          (constant_op.constant(2.0),
-           constant_op.constant(3.0)), constant_op.constant(4.0)
+          (constant_op.constant(2.0), constant_op.constant(3.0)),
+          constant_op.constant(4.0)
       ]
       c = lambda lv0, _1, _2: lv0.a < 100.0
 
@@ -2176,7 +2192,8 @@ class ControlFlowTest(test.TestCase):
 
       def b(i, x):
         return [
-            i + 1, ops.IndexedSlices(x.values * 2.0, x.indices, x.dense_shape)
+            i + 1,
+            ops.IndexedSlices(x.values * 2.0, x.indices, x.dense_shape)
         ]
 
       _, r = control_flow_ops.while_loop(c, b, [i, x])
@@ -2197,8 +2214,8 @@ class ControlFlowTest(test.TestCase):
 
       def b(i, x):
         return [
-            i + 1, sparse_tensor.SparseTensor(x.indices, x.values * 2.0,
-                                              x.dense_shape)
+            i + 1,
+            sparse_tensor.SparseTensor(x.indices, x.values * 2.0, x.dense_shape)
         ]
 
       _, r = control_flow_ops.while_loop(c, b, [i, x])
@@ -2220,8 +2237,8 @@ class ControlFlowTest(test.TestCase):
         x1 = x + gradients_impl.gradients(data, params)[0]
         return i + 1, x1
 
-      output_grad = control_flow_ops.while_loop(c, b,
-                                                [i0, constant_op.constant(0.0)])
+      output_grad = control_flow_ops.while_loop(
+          c, b, [i0, constant_op.constant(0.0)])
       self.assertAllClose(600.0, sess.run(output_grad)[1])
 
   def testWhileAndTensorArray(self):
@@ -2359,9 +2376,12 @@ class ControlFlowTest(test.TestCase):
 
   def testStopGradMultiFlows(self):
     with self.test_session():
+
       def body(i, y, r):
         x = variable_scope.get_variable(
-            "x", shape=(), dtype=dtypes.float32,
+            "x",
+            shape=(),
+            dtype=dtypes.float32,
             initializer=init_ops.ones_initializer())
         y *= x
         return [i + 1, y, r + math_ops.reduce_sum(y)]
@@ -2773,7 +2793,8 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.while_loop(
           lambda i, v: i < 2, lambda i, v: [i + 1, func(v)],
           [constant_op.constant(0), x],
-          [tensor_shape.unknown_shape(), tensor_shape.unknown_shape()])
+          [tensor_shape.unknown_shape(),
+           tensor_shape.unknown_shape()])
       self.assertEqual(r[1].eval(), 65536.0)
 
       r = gradients_impl.gradients(r, x)[0]
@@ -2800,12 +2821,14 @@ class ControlFlowContextCheckTest(test.TestCase):
 
   def _getCondTensor(self):
     cond_tensor = []
+
     def true_fn():
       if not cond_tensor:
         cond_tensor.append(constant_op.constant(1))
       return cond_tensor[0]
-    control_flow_ops.cond(math_ops.less(1, 2), true_fn,
-                          lambda: constant_op.constant(0))
+
+    control_flow_ops.cond(
+        math_ops.less(1, 2), true_fn, lambda: constant_op.constant(0))
     return cond_tensor[0]
 
   def testInvalidContext(self):
@@ -2821,14 +2844,13 @@ class ControlFlowContextCheckTest(test.TestCase):
     # Accessing a while loop tensor in cond is illegal.
     while_tensor = self._getWhileTensor()
     with self.assertRaisesRegexp(
-        ValueError,
-        "Cannot use 'while/Const_1' as input to 'cond/Add' because "
+        ValueError, "Cannot use 'while/Const_1' as input to 'cond/Add' because "
         "'while/Const_1' is in a while loop. See info log for more details."):
       # TODO(skyewm): this passes if we return while_tensor directly instead
       # of using it as input to another op.
-      control_flow_ops.cond(math_ops.less(1, 2),
-                            lambda: math_ops.add(1, while_tensor),
-                            lambda: constant_op.constant(0))
+      control_flow_ops.cond(
+          math_ops.less(1, 2), lambda: math_ops.add(1, while_tensor),
+          lambda: constant_op.constant(0))
 
   def testInvalidContextInWhile(self):
     # Accessing a while loop tensor in a different while loop is illegal.
@@ -2856,6 +2878,7 @@ class ControlFlowContextCheckTest(test.TestCase):
     # Accessing a tensor from a cond context from the other branch's cond
     # context is OK (although dangerous).
     cond_tensor = []
+
     def branch_fn():
       if not cond_tensor:
         cond_tensor.append(constant_op.constant(1))
@@ -2892,12 +2915,13 @@ class ControlFlowContextCheckTest(test.TestCase):
       while_tensor = self._getWhileTensor()
       return control_flow_ops.while_loop(lambda i: i < 3,
                                          lambda i: i + while_tensor, [0])
+
     with self.assertRaisesRegexp(
         ValueError,
         "Cannot use 'cond/while_1/add' as input to 'cond/while/Const_1' because"
         " they are in different while loops. See info log for more details."):
-      control_flow_ops.cond(math_ops.less(1, 2), true_fn,
-                            lambda: constant_op.constant(0))
+      control_flow_ops.cond(
+          math_ops.less(1, 2), true_fn, lambda: constant_op.constant(0))
 
 
 @test_util.with_c_api
@@ -3005,11 +3029,13 @@ class AssertTest(test.TestCase):
       sess.run(unguarded_assert, options=opts, run_metadata=unguarded_metadata)
       guarded_nodestat_names = [
           n.node_name
-          for d in guarded_metadata.step_stats.dev_stats for n in d.node_stats
+          for d in guarded_metadata.step_stats.dev_stats
+          for n in d.node_stats
       ]
       unguarded_nodestat_names = [
           n.node_name
-          for d in unguarded_metadata.step_stats.dev_stats for n in d.node_stats
+          for d in unguarded_metadata.step_stats.dev_stats
+          for n in d.node_stats
       ]
       guarded_memcpy_nodestat_names = [
           n for n in guarded_nodestat_names if "MEMCPYDtoH" in n
@@ -3066,6 +3092,7 @@ class WhileOpBenchmark(test.Benchmark):
     Returns:
       The duration of the run in seconds.
     """
+
     def loop_body(i, x):
       with ops.device("/gpu:0"):
         # Always put loop body on GPU.
@@ -3107,7 +3134,7 @@ class WhileOpBenchmark(test.Benchmark):
       start_time = time.time()
       for _ in xrange(num_iters):
         sess.run(r)
-      return (time.time() - start_time)/num_iters
+      return (time.time() - start_time) / num_iters
 
   def benchmarkWhileOpCrossDevicePlacement(self):
     iters = 10
@@ -3154,23 +3181,20 @@ class EagerTest(test.TestCase):
   def testWhileLoop(self):
     with context.eager_mode():
       tensor = constant_op.constant([1, 2, 3, 4, 5])
-      self.assertAllEqual(isum(tensor).numpy(),
-                          [46, 47, 48, 49, 50])
+      self.assertAllEqual(isum(tensor).numpy(), [46, 47, 48, 49, 50])
 
   def testWhileLoopWithMaxIterations(self):
     with context.eager_mode():
       tensor = constant_op.constant([1, 2, 3, 4, 5])
-      self.assertAllEqual(isum(tensor, maximum_iterations=3).numpy(),
-                          [1+3, 2+3, 3+3, 4+3, 5+3])
+      self.assertAllEqual(
+          isum(tensor, maximum_iterations=3).numpy(),
+          [1 + 3, 2 + 3, 3 + 3, 4 + 3, 5 + 3])
 
   def testWhileWithMaximumIterationsAndSingleArgument(self):
     with context.eager_mode():
       tensor = constant_op.constant(0)
       r = control_flow_ops.while_loop(
-          lambda i: i < 3,
-          lambda i: i + 1,
-          [tensor],
-          maximum_iterations=1)
+          lambda i: i < 3, lambda i: i + 1, [tensor], maximum_iterations=1)
       self.assertEqual(1, r.numpy())
 
   def testWithDependencies(self):
@@ -3197,8 +3221,8 @@ class EagerTest(test.TestCase):
       f2 = lambda: constant_op.constant(23)
       f3 = lambda: constant_op.constant(-1)
 
-      r1 = control_flow_ops.case([(x < y, f1), (x > z, f2)],
-                                 default=f3, exclusive=True)
+      r1 = control_flow_ops.case(
+          [(x < y, f1), (x > z, f2)], default=f3, exclusive=True)
       self.assertAllEqual(r1.numpy(), 17)
 
 
diff --git a/tensorflow/python/kernel_tests/conv_ops_test.py b/tensorflow/python/kernel_tests/conv_ops_test.py
index 3e9bd3dade6d08835780362cd73f5f01368e83ac..c5446326ba1763b962b32866235251d773e069f2 100644
--- a/tensorflow/python/kernel_tests/conv_ops_test.py
+++ b/tensorflow/python/kernel_tests/conv_ops_test.py
@@ -24,6 +24,7 @@ import time
 
 import numpy as np
 
+from six.moves import xrange
 from tensorflow.contrib import layers
 from tensorflow.python.client import session as session_lib
 from tensorflow.python.framework import constant_op
diff --git a/tensorflow/python/kernel_tests/cwise_ops_test.py b/tensorflow/python/kernel_tests/cwise_ops_test.py
index a91917b27faf46710d3f494b76929f4c7b9e9eec..0d9b46c30dbbed20dd940e0427fbf6f6d5415106 100644
--- a/tensorflow/python/kernel_tests/cwise_ops_test.py
+++ b/tensorflow/python/kernel_tests/cwise_ops_test.py
@@ -71,6 +71,7 @@ def _sparsify(x, thresh=0.5, index_dtype=np.int64):
   return sparse_tensor.SparseTensor(
       indices=x_indices, values=x_values, dense_shape=x_shape), x_values
 
+
 def _default_tolerance(dtype):
   """Returns a sensible default tolerance for comparing results of a given
   type"""
@@ -81,7 +82,7 @@ def _default_tolerance(dtype):
   elif dtype in (np.float64, np.complex128):
     return 1e-5
   else:
-    return None # Fail fast for unexpected types
+    return None  # Fail fast for unexpected types
 
 
 class UnaryOpTest(test.TestCase):
@@ -233,10 +234,10 @@ class UnaryOpTest(test.TestCase):
     self._compareBoth(k, np.arccos, math_ops.acos)
     self._compareBoth(x, np.arctan, math_ops.atan)
     self._compareBoth(x, np.tan, math_ops.tan)
-    self._compareBoth(
-        y,
-        np.vectorize(self._replace_domain_error_with_inf(math.lgamma)),
-        math_ops.lgamma)
+    self._compareBoth(y,
+                      np.vectorize(
+                          self._replace_domain_error_with_inf(math.lgamma)),
+                      math_ops.lgamma)
     self._compareBoth(x, np.vectorize(math.erf), math_ops.erf)
     self._compareBoth(x, np.vectorize(math.erfc), math_ops.erfc)
 
@@ -298,8 +299,8 @@ class UnaryOpTest(test.TestCase):
     w = x - x.min() + 1.02  # all greater than 1
     y = (x + .5).astype(np.float64)  # no zero
     z = (x + 15.5).astype(np.float64)  # all positive
-    k = np.arange(-0.90, 0.90, 0.35).reshape(1, 3, 2).astype(
-        np.float64)  # between -1 and 1
+    k = np.arange(-0.90, 0.90,
+                  0.35).reshape(1, 3, 2).astype(np.float64)  # between -1 and 1
     self._compareBoth(x, np.abs, math_ops.abs)
     self._compareBoth(x, np.abs, _ABS)
     self._compareBoth(x, np.negative, math_ops.negative)
@@ -322,10 +323,10 @@ class UnaryOpTest(test.TestCase):
     self._compareBoth(y, np.sign, math_ops.sign)
     self._compareBoth(x, np.sin, math_ops.sin)
     self._compareBoth(x, np.cos, math_ops.cos)
-    self._compareBoth(
-        y,
-        np.vectorize(self._replace_domain_error_with_inf(math.lgamma)),
-        math_ops.lgamma)
+    self._compareBoth(y,
+                      np.vectorize(
+                          self._replace_domain_error_with_inf(math.lgamma)),
+                      math_ops.lgamma)
     self._compareBoth(x, np.vectorize(math.erf), math_ops.erf)
     self._compareBoth(x, np.vectorize(math.erfc), math_ops.erfc)
     self._compareBoth(x, np.arctan, math_ops.atan)
@@ -362,10 +363,10 @@ class UnaryOpTest(test.TestCase):
     self._compareBoth(y, np.sign, math_ops.sign)
     self._compareBoth(x, np.sin, math_ops.sin)
     self._compareBoth(x, np.cos, math_ops.cos)
-    self._compareBoth(
-        y,
-        np.vectorize(self._replace_domain_error_with_inf(math.lgamma)),
-        math_ops.lgamma)
+    self._compareBoth(y,
+                      np.vectorize(
+                          self._replace_domain_error_with_inf(math.lgamma)),
+                      math_ops.lgamma)
     self._compareBoth(x, np.vectorize(math.erf), math_ops.erf)
     self._compareBoth(x, np.vectorize(math.erfc), math_ops.erfc)
 
@@ -406,8 +407,8 @@ class UnaryOpTest(test.TestCase):
     self._compareBothSparse(x, np.sign, math_ops.sign)
 
   def testComplex64Basic(self):
-    x = np.complex(1, 1) * np.arange(-3, 3).reshape(1, 3,
-                                                    2).astype(np.complex64)
+    x = np.complex(1, 1) * np.arange(-3, 3).reshape(1, 3, 2).astype(
+        np.complex64)
     y = x + np.complex(0.5, 0.5)  # no zeros
     self._compareBoth(x, np.abs, math_ops.abs)
     self._compareBoth(x, np.abs, _ABS)
@@ -450,8 +451,8 @@ class UnaryOpTest(test.TestCase):
     self._compareBothSparse(y, complex_sign, math_ops.sign)
 
   def testComplex128Basic(self):
-    x = np.complex(1, 1) * np.arange(-3, 3).reshape(1, 3,
-                                                    2).astype(np.complex128)
+    x = np.complex(1, 1) * np.arange(-3, 3).reshape(1, 3, 2).astype(
+        np.complex128)
     y = x + np.complex(0.5, 0.5)  # no zeros
     self._compareBoth(x, np.abs, math_ops.abs)
     self._compareBoth(x, np.abs, _ABS)
@@ -805,10 +806,10 @@ class BinaryOpTest(test.TestCase):
     self._compareBoth(x, y, np.mod, _MOD)
 
   def testComplex64Basic(self):
-    x = np.complex(1, 1) * np.linspace(-10, 10, 6).reshape(
-        1, 3, 2).astype(np.complex64)
-    y = np.complex(1, 1) * np.linspace(20, -20, 6).reshape(
-        1, 3, 2).astype(np.complex64)
+    x = np.complex(1, 1) * np.linspace(-10, 10, 6).reshape(1, 3, 2).astype(
+        np.complex64)
+    y = np.complex(1, 1) * np.linspace(20, -20, 6).reshape(1, 3, 2).astype(
+        np.complex64)
     self._compareBoth(x, y, np.add, math_ops.add)
     self._compareBoth(x, y, np.subtract, math_ops.subtract)
     self._compareBoth(x, y, np.multiply, math_ops.multiply)
@@ -819,10 +820,10 @@ class BinaryOpTest(test.TestCase):
     self._compareBoth(x, y + 0.1, np.true_divide, _TRUEDIV)
 
   def testComplex128Basic(self):
-    x = np.complex(1, 1) * np.linspace(-10, 10, 6).reshape(
-        1, 3, 2).astype(np.complex128)
-    y = np.complex(1, 1) * np.linspace(20, -20, 6).reshape(
-        1, 3, 2).astype(np.complex128)
+    x = np.complex(1, 1) * np.linspace(-10, 10, 6).reshape(1, 3, 2).astype(
+        np.complex128)
+    y = np.complex(1, 1) * np.linspace(20, -20, 6).reshape(1, 3, 2).astype(
+        np.complex128)
     self._compareBoth(x, y, np.add, math_ops.add)
     self._compareBoth(x, y, np.subtract, math_ops.subtract)
     self._compareBoth(x, y, np.multiply, math_ops.multiply)
@@ -1127,8 +1128,8 @@ class BinaryOpTest(test.TestCase):
 
   def testMismatchedDimensions(self):
     for func in [
-        math_ops.add, math_ops.subtract, math_ops.multiply, math_ops.div,
-        _ADD, _SUB, _MUL, _TRUEDIV, _FLOORDIV
+        math_ops.add, math_ops.subtract, math_ops.multiply, math_ops.div, _ADD,
+        _SUB, _MUL, _TRUEDIV, _FLOORDIV
     ]:
       with self.assertRaisesWithPredicateMatch(
           ValueError, lambda e: "Dimensions must" in str(e)):
@@ -1161,8 +1162,8 @@ class BinaryOpTest(test.TestCase):
                    (1.2345, float("inf")), (1.2345, -float("inf")),
                    (-4.321, float("inf")), (-4.125, -float("inf")),
                    (float("inf"), float("inf")), (float("inf"), -float("inf")),
-                   (-float("inf"), float("inf")), (-float("inf"),
-                                                   -float("inf")))
+                   (-float("inf"), float("inf")),
+                   (-float("inf"), -float("inf")))
     for dtype in np.float32, np.float64:
       x1 = np.array(x1l).astype(dtype)
       x2 = np.array(x2l).astype(dtype)
@@ -1213,22 +1214,22 @@ class ComparisonOpTest(test.TestCase):
       for x in data:
         for y in data:
           self.assertEqual(self._compareScalar(math_ops.less, x, y, t), x < y)
-          self.assertEqual(self._compareScalar(math_ops.less_equal, x, y, t),
-                           x <= y)
-          self.assertEqual(self._compareScalar(math_ops.greater, x, y, t),
-                           x > y)
+          self.assertEqual(
+              self._compareScalar(math_ops.less_equal, x, y, t), x <= y)
+          self.assertEqual(
+              self._compareScalar(math_ops.greater, x, y, t), x > y)
           self.assertEqual(
               self._compareScalar(math_ops.greater_equal, x, y, t), x >= y)
           self.assertEqual(self._compareScalar(math_ops.equal, x, y, t), x == y)
-          self.assertEqual(self._compareScalar(math_ops.not_equal, x, y, t),
-                           x != y)
+          self.assertEqual(
+              self._compareScalar(math_ops.not_equal, x, y, t), x != y)
     data = [-1, 0, 1, -1j, 1j, 1 + 1j, 1 - 1j]
     for t in [np.complex64, np.complex128]:
       for x in data:
         for y in data:
           self.assertEqual(self._compareScalar(math_ops.equal, x, y, t), x == y)
-          self.assertEqual(self._compareScalar(math_ops.not_equal, x, y, t),
-                           x != y)
+          self.assertEqual(
+              self._compareScalar(math_ops.not_equal, x, y, t), x != y)
 
   def _compare(self, x, y, np_func, tf_func):
     np_ans = np_func(x, y)
@@ -1311,8 +1312,8 @@ class ComparisonOpTest(test.TestCase):
     self._testBCastByFunc(np.equal, math_ops.equal, include_complex=True)
 
   def testBCastNotEqual(self):
-    self._testBCastByFunc(np.not_equal, math_ops.not_equal,
-                          include_complex=True)
+    self._testBCastByFunc(
+        np.not_equal, math_ops.not_equal, include_complex=True)
 
   def testShapeMismatch(self):
     dtypes = [np.float16, np.float32, np.float64, np.int32, np.int64]
@@ -1771,9 +1772,8 @@ class MathOpsOverloadTest(test.TestCase):
   def _compareUnary(self, x, dtype, np_func, tf_func):
     np_ans = np_func(x).astype(dtype.as_numpy_dtype)
     with self.test_session(use_gpu=False):
-      self.assertAllClose(
-          np_ans, tf_func(ops.convert_to_tensor(
-              x, dtype=dtype)).eval())
+      self.assertAllClose(np_ans,
+                          tf_func(ops.convert_to_tensor(x, dtype=dtype)).eval())
 
   def testOverload(self):
     dtypes = [
@@ -1795,8 +1795,8 @@ class MathOpsOverloadTest(test.TestCase):
     ]
     for dtype in dtypes:
       for np_func, tf_func in funcs:
-        if dtype in (dtypes_lib.complex64, dtypes_lib.complex128
-                    ) and tf_func == _FLOORDIV:
+        if dtype in (dtypes_lib.complex64,
+                     dtypes_lib.complex128) and tf_func == _FLOORDIV:
           continue  # floordiv makes no sense for complex
         self._compareBinary(10, 5, dtype, np_func, tf_func)
     # Mod only works for int32 and int64.
@@ -2008,7 +2008,8 @@ class ComplexMakeRealImagTest(test.TestCase):
     # self._compareAngle(cplx, use_gpu=True)
 
   def testRealReal(self):
-    for dtype in dtypes_lib.int32, dtypes_lib.int64, dtypes_lib.float32, dtypes_lib.float64:
+    for dtype in (dtypes_lib.int32, dtypes_lib.int64, dtypes_lib.float32,
+                  dtypes_lib.float64):
       x = array_ops.placeholder(dtype)
       y = math_ops.real(x)
       self.assertEqual(x, y)
@@ -2037,15 +2038,16 @@ class ComplexMakeRealImagTest(test.TestCase):
     self._compareConj(cplx, use_gpu=True)
 
   def testConjReal(self):
-    for dtype in dtypes_lib.int32, dtypes_lib.int64, dtypes_lib.float16, dtypes_lib.float32, dtypes_lib.float64:
+    for dtype in (dtypes_lib.int32, dtypes_lib.int64, dtypes_lib.float16,
+                  dtypes_lib.float32, dtypes_lib.float64):
       x = array_ops.placeholder(dtype)
       y = math_ops.conj(x)
       self.assertEqual(x, y)
 
   def testConjString(self):
     x = array_ops.placeholder(dtypes_lib.string)
-    with self.assertRaisesRegexp(
-        TypeError, r"Expected numeric or variant tensor"):
+    with self.assertRaisesRegexp(TypeError,
+                                 r"Expected numeric or variant tensor"):
       math_ops.conj(x)
 
   def _compareGradient(self, x):
@@ -2060,8 +2062,9 @@ class ComplexMakeRealImagTest(test.TestCase):
       real, imag = array_ops.reshape(real, [-1]), array_ops.reshape(imag, [-1])
       cplx = math_ops.complex(real, imag)
       cplx = math_ops.conj(cplx)
-      loss = math_ops.reduce_sum(math_ops.square(math_ops.real(
-          cplx))) + math_ops.reduce_sum(math_ops.square(math_ops.imag(cplx)))
+      loss = math_ops.reduce_sum(math_ops.square(
+          math_ops.real(cplx))) + math_ops.reduce_sum(
+              math_ops.square(math_ops.imag(cplx)))
       epsilon = 1e-3
       jacob_t, jacob_n = gradient_checker.compute_gradient(
           inx, list(x.shape), loss, [1], x_init_value=x, delta=epsilon)
@@ -2125,8 +2128,8 @@ class AccumulateTest(test.TestCase):
           np.random.rand(16, 16, 16, 16).astype(np.float32) for _ in range(20)
       ]
       random_tensors = [
-          ops.convert_to_tensor(
-              x, dtype=dtypes_lib.float32) for x in random_arrays
+          ops.convert_to_tensor(x, dtype=dtypes_lib.float32)
+          for x in random_arrays
       ]
       tf_val = math_ops.accumulate_n(random_tensors)
       np_val = random_arrays[0]
diff --git a/tensorflow/python/kernel_tests/decode_jpeg_op_test.py b/tensorflow/python/kernel_tests/decode_jpeg_op_test.py
index ead55cd03b656a18d622b9d35c1b94f9cf2f5107..89fd26c544b5f2e8c15cec1b2d21a8c95fd503db 100644
--- a/tensorflow/python/kernel_tests/decode_jpeg_op_test.py
+++ b/tensorflow/python/kernel_tests/decode_jpeg_op_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import os
 import time
 
+from six.moves import xrange
 from tensorflow.python.client import session
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
diff --git a/tensorflow/python/kernel_tests/diag_op_test.py b/tensorflow/python/kernel_tests/diag_op_test.py
index 6cfa9b37fe0e40f4f0e5e2ad2686819e5f6d4f12..0825d8fc6bea008532fd7428236dfb569f2a471e 100644
--- a/tensorflow/python/kernel_tests/diag_op_test.py
+++ b/tensorflow/python/kernel_tests/diag_op_test.py
@@ -84,11 +84,8 @@ class MatrixSetDiagTest(test.TestCase):
   def testSquare(self):
     with self.test_session(use_gpu=True):
       v = np.array([1.0, 2.0, 3.0])
-      mat = np.array([[0.0, 1.0, 0.0],
-                      [1.0, 0.0, 1.0],
-                      [1.0, 1.0, 1.0]])
-      mat_set_diag = np.array([[1.0, 1.0, 0.0],
-                               [1.0, 2.0, 1.0],
+      mat = np.array([[0.0, 1.0, 0.0], [1.0, 0.0, 1.0], [1.0, 1.0, 1.0]])
+      mat_set_diag = np.array([[1.0, 1.0, 0.0], [1.0, 2.0, 1.0],
                                [1.0, 1.0, 3.0]])
       output = array_ops.matrix_set_diag(mat, v)
       self.assertEqual((3, 3), output.get_shape())
@@ -135,19 +132,12 @@ class MatrixSetDiagTest(test.TestCase):
 
   def testRectangularBatch(self):
     with self.test_session(use_gpu=True):
-      v_batch = np.array([[-1.0, -2.0],
-                          [-4.0, -5.0]])
-      mat_batch = np.array(
-          [[[1.0, 0.0, 3.0],
-            [0.0, 2.0, 0.0]],
-           [[4.0, 0.0, 4.0],
-            [0.0, 5.0, 0.0]]])
-
-      mat_set_diag_batch = np.array(
-          [[[-1.0, 0.0, 3.0],
-            [0.0, -2.0, 0.0]],
-           [[-4.0, 0.0, 4.0],
-            [0.0, -5.0, 0.0]]])
+      v_batch = np.array([[-1.0, -2.0], [-4.0, -5.0]])
+      mat_batch = np.array([[[1.0, 0.0, 3.0], [0.0, 2.0, 0.0]],
+                            [[4.0, 0.0, 4.0], [0.0, 5.0, 0.0]]])
+
+      mat_set_diag_batch = np.array([[[-1.0, 0.0, 3.0], [0.0, -2.0, 0.0]],
+                                     [[-4.0, 0.0, 4.0], [0.0, -5.0, 0.0]]])
       output = array_ops.matrix_set_diag(mat_batch, v_batch)
       self.assertEqual((2, 2, 3), output.get_shape())
       self.assertAllEqual(mat_set_diag_batch, output.eval())
@@ -178,10 +168,14 @@ class MatrixSetDiagTest(test.TestCase):
             np.random.rand(*diag_shape), dtype=dtypes_lib.float32)
         y = array_ops.matrix_set_diag(x, x_diag)
         error_x = gradient_checker.compute_gradient_error(
-            x, x.get_shape().as_list(), y, y.get_shape().as_list())
+            x,
+            x.get_shape().as_list(), y,
+            y.get_shape().as_list())
         self.assertLess(error_x, 1e-4)
         error_x_diag = gradient_checker.compute_gradient_error(
-            x_diag, x_diag.get_shape().as_list(), y, y.get_shape().as_list())
+            x_diag,
+            x_diag.get_shape().as_list(), y,
+            y.get_shape().as_list())
         self.assertLess(error_x_diag, 1e-4)
 
   def testGradWithNoShapeInformation(self):
@@ -192,12 +186,13 @@ class MatrixSetDiagTest(test.TestCase):
       output = array_ops.matrix_set_diag(mat, v)
       grads = gradients_impl.gradients(output, [mat, v], grad_ys=grad_input)
       grad_input_val = np.random.rand(3, 3).astype(np.float32)
-      grad_vals = sess.run(grads,
-                           feed_dict={
-                               v: 2 * np.ones(3),
-                               mat: np.ones((3, 3)),
-                               grad_input: grad_input_val
-                           })
+      grad_vals = sess.run(
+          grads,
+          feed_dict={
+              v: 2 * np.ones(3),
+              mat: np.ones((3, 3)),
+              grad_input: grad_input_val
+          })
       self.assertAllEqual(np.diag(grad_input_val), grad_vals[1])
       self.assertAllEqual(grad_input_val - np.diag(np.diag(grad_input_val)),
                           grad_vals[0])
@@ -242,13 +237,9 @@ class MatrixDiagPartTest(test.TestCase):
 
   def testRectangularBatch(self):
     with self.test_session(use_gpu=True):
-      v_batch = np.array([[1.0, 2.0],
-                          [4.0, 5.0]])
-      mat_batch = np.array(
-          [[[1.0, 0.0, 0.0],
-            [0.0, 2.0, 0.0]],
-           [[4.0, 0.0, 0.0],
-            [0.0, 5.0, 0.0]]])
+      v_batch = np.array([[1.0, 2.0], [4.0, 5.0]])
+      mat_batch = np.array([[[1.0, 0.0, 0.0], [0.0, 2.0, 0.0]],
+                            [[4.0, 0.0, 0.0], [0.0, 5.0, 0.0]]])
       self.assertEqual(mat_batch.shape, (2, 2, 3))
       mat_batch_diag = array_ops.matrix_diag_part(mat_batch)
       self.assertEqual((2, 2), mat_batch_diag.get_shape())
@@ -301,19 +292,13 @@ class DiagTest(test.TestCase):
 
   def testRankOneIntTensor(self):
     x = np.array([1, 2, 3])
-    expected_ans = np.array(
-        [[1, 0, 0],
-         [0, 2, 0],
-         [0, 0, 3]])
+    expected_ans = np.array([[1, 0, 0], [0, 2, 0], [0, 0, 3]])
     self.diagOp(x, np.int32, expected_ans)
     self.diagOp(x, np.int64, expected_ans)
 
   def testRankOneFloatTensor(self):
     x = np.array([1.1, 2.2, 3.3])
-    expected_ans = np.array(
-        [[1.1, 0, 0],
-         [0, 2.2, 0],
-         [0, 0, 3.3]])
+    expected_ans = np.array([[1.1, 0, 0], [0, 2.2, 0], [0, 0, 3.3]])
     self.diagOp(x, np.float32, expected_ans)
     self.diagOp(x, np.float64, expected_ans)
 
@@ -321,123 +306,105 @@ class DiagTest(test.TestCase):
     for dtype in [np.complex64, np.complex128]:
       x = np.array([1.1 + 1.1j, 2.2 + 2.2j, 3.3 + 3.3j], dtype=dtype)
       expected_ans = np.array(
-          [[1.1 + 1.1j, 0 + 0j, 0 + 0j],
-           [0 + 0j, 2.2 + 2.2j, 0 + 0j],
-           [0 + 0j, 0 + 0j, 3.3 + 3.3j]], dtype=dtype)
+          [[1.1 + 1.1j, 0 + 0j, 0 + 0j], [0 + 0j, 2.2 + 2.2j, 0 + 0j],
+           [0 + 0j, 0 + 0j, 3.3 + 3.3j]],
+          dtype=dtype)
       self.diagOp(x, dtype, expected_ans)
 
   def testRankTwoIntTensor(self):
     x = np.array([[1, 2, 3], [4, 5, 6]])
-    expected_ans = np.array(
-        [[[[1, 0, 0], [0, 0, 0]],
-          [[0, 2, 0], [0, 0, 0]],
-          [[0, 0, 3], [0, 0, 0]]],
-         [[[0, 0, 0], [4, 0, 0]],
-          [[0, 0, 0], [0, 5, 0]],
-          [[0, 0, 0], [0, 0, 6]]]])
+    expected_ans = np.array([[[[1, 0, 0], [0, 0, 0]], [[0, 2, 0], [0, 0, 0]],
+                              [[0, 0, 3], [0, 0, 0]]],
+                             [[[0, 0, 0], [4, 0, 0]], [[0, 0, 0], [0, 5, 0]],
+                              [[0, 0, 0], [0, 0, 6]]]])
     self.diagOp(x, np.int32, expected_ans)
     self.diagOp(x, np.int64, expected_ans)
 
   def testRankTwoFloatTensor(self):
     x = np.array([[1.1, 2.2, 3.3], [4.4, 5.5, 6.6]])
     expected_ans = np.array(
-        [[[[1.1, 0, 0], [0, 0, 0]],
-          [[0, 2.2, 0], [0, 0, 0]],
-          [[0, 0, 3.3], [0, 0, 0]]],
-         [[[0, 0, 0], [4.4, 0, 0]],
-          [[0, 0, 0], [0, 5.5, 0]],
-          [[0, 0, 0], [0, 0, 6.6]]]])
+        [[[[1.1, 0, 0], [0, 0, 0]], [[0, 2.2, 0], [0, 0, 0]],
+          [[0, 0, 3.3], [0, 0, 0]]], [[[0, 0, 0], [4.4, 0, 0]],
+                                      [[0, 0, 0], [0, 5.5, 0]], [[0, 0, 0],
+                                                                 [0, 0, 6.6]]]])
     self.diagOp(x, np.float32, expected_ans)
     self.diagOp(x, np.float64, expected_ans)
 
   def testRankTwoComplexTensor(self):
     for dtype in [np.complex64, np.complex128]:
-      x = np.array([[1.1 + 1.1j, 2.2 + 2.2j, 3.3 + 3.3j],
-                    [4.4 + 4.4j, 5.5 + 5.5j, 6.6 + 6.6j]], dtype=dtype)
+      x = np.array(
+          [[1.1 + 1.1j, 2.2 + 2.2j, 3.3 + 3.3j],
+           [4.4 + 4.4j, 5.5 + 5.5j, 6.6 + 6.6j]],
+          dtype=dtype)
       expected_ans = np.array(
-          [[[[1.1 + 1.1j, 0 + 0j, 0 + 0j], [0 + 0j, 0 + 0j, 0 + 0j]],
-            [[0 + 0j, 2.2 + 2.2j, 0 + 0j], [0 + 0j, 0 + 0j, 0 + 0j]],
-            [[0 + 0j, 0 + 0j, 3.3 + 3.3j], [0 + 0j, 0 + 0j, 0 + 0j]]],
-           [[[0 + 0j, 0 + 0j, 0 + 0j], [4.4 + 4.4j, 0 + 0j, 0 + 0j]],
-            [[0 + 0j, 0 + 0j, 0 + 0j], [0 + 0j, 5.5 + 5.5j, 0 + 0j]],
-            [[0 + 0j, 0 + 0j, 0 + 0j], [0 + 0j, 0 + 0j, 6.6 + 6.6j]]]],
-           dtype=dtype)
+          [[[[1.1 + 1.1j, 0 + 0j, 0 + 0j], [0 + 0j, 0 + 0j, 0 + 0j]], [
+              [0 + 0j, 2.2 + 2.2j, 0 + 0j], [0 + 0j, 0 + 0j, 0 + 0j]
+          ], [[0 + 0j, 0 + 0j, 3.3 + 3.3j], [0 + 0j, 0 + 0j, 0 + 0j]]], [[
+              [0 + 0j, 0 + 0j, 0 + 0j], [4.4 + 4.4j, 0 + 0j, 0 + 0j]
+          ], [[0 + 0j, 0 + 0j, 0 + 0j], [0 + 0j, 5.5 + 5.5j, 0 + 0j]
+             ], [[0 + 0j, 0 + 0j, 0 + 0j], [0 + 0j, 0 + 0j, 6.6 + 6.6j]]]],
+          dtype=dtype)
       self.diagOp(x, dtype, expected_ans)
 
   def testRankThreeFloatTensor(self):
-    x = np.array([[[1.1, 2.2], [3.3, 4.4]],
-                  [[5.5, 6.6], [7.7, 8.8]]])
-    expected_ans = np.array(
-        [[[[[[1.1, 0], [0, 0]], [[0, 0], [0, 0]]],
-           [[[0, 2.2], [0, 0]], [[0, 0], [0, 0]]]],
-          [[[[0, 0], [3.3, 0]], [[0, 0], [0, 0]]],
-           [[[0, 0], [0, 4.4]], [[0, 0], [0, 0]]]]],
-         [[[[[0, 0], [0, 0]], [[5.5, 0], [0, 0]]],
-           [[[0, 0], [0, 0]], [[0, 6.6], [0, 0]]]],
-          [[[[0, 0], [0, 0]], [[0, 0], [7.7, 0]]],
-           [[[0, 0], [0, 0]], [[0, 0], [0, 8.8]]]]]])
+    x = np.array([[[1.1, 2.2], [3.3, 4.4]], [[5.5, 6.6], [7.7, 8.8]]])
+    expected_ans = np.array([[[[[[1.1, 0], [0, 0]], [[0, 0], [0, 0]]],
+                               [[[0, 2.2], [0, 0]], [[0, 0], [0, 0]]]],
+                              [[[[0, 0], [3.3, 0]], [[0, 0], [0, 0]]],
+                               [[[0, 0], [0, 4.4]], [[0, 0], [0, 0]]]]],
+                             [[[[[0, 0], [0, 0]], [[5.5, 0], [0, 0]]],
+                               [[[0, 0], [0, 0]], [[0, 6.6], [0, 0]]]],
+                              [[[[0, 0], [0, 0]], [[0, 0], [7.7, 0]]],
+                               [[[0, 0], [0, 0]], [[0, 0], [0, 8.8]]]]]])
     self.diagOp(x, np.float32, expected_ans)
     self.diagOp(x, np.float64, expected_ans)
 
   def testRankThreeComplexTensor(self):
     for dtype in [np.complex64, np.complex128]:
-      x = np.array([[[1.1 + 1.1j, 2.2 + 2.2j], [3.3 + 3.3j, 4.4 + 4.4j]],
-                    [[5.5 + 5.5j, 6.6 + 6.6j], [7.7 + 7.7j, 8.8 + 8.8j]]],
-                    dtype=dtype)
+      x = np.array(
+          [[[1.1 + 1.1j, 2.2 + 2.2j], [3.3 + 3.3j, 4.4 + 4.4j]],
+           [[5.5 + 5.5j, 6.6 + 6.6j], [7.7 + 7.7j, 8.8 + 8.8j]]],
+          dtype=dtype)
       expected_ans = np.array(
-          [[[[[[1.1 + 1.1j, 0 + 0j], [0 + 0j, 0 + 0j]],
-              [[0 + 0j, 0 + 0j], [0 + 0j, 0 + 0j]]],
-             [[[0 + 0j, 2.2 + 2.2j], [0 + 0j, 0 + 0j]],
-              [[0 + 0j, 0 + 0j], [0 + 0j, 0 + 0j]]]],
-            [[[[0 + 0j, 0 + 0j], [3.3 + 3.3j, 0 + 0j]],
-              [[0 + 0j, 0 + 0j], [0 + 0j, 0 + 0j]]],
-             [[[0 + 0j, 0 + 0j], [0 + 0j, 4.4 + 4.4j]],
-              [[0 + 0j, 0 + 0j], [0 + 0j, 0 + 0j]]]]],
-           [[[[[0 + 0j, 0 + 0j], [0 + 0j, 0 + 0j]],
-              [[5.5 + 5.5j, 0 + 0j], [0 + 0j, 0 + 0j]]],
-             [[[0 + 0j, 0 + 0j], [0 + 0j, 0 + 0j]],
-              [[0 + 0j, 6.6 + 6.6j], [0 + 0j, 0 + 0j]]]],
-            [[[[0 + 0j, 0 + 0j], [0 + 0j, 0 + 0j]],
-              [[0 + 0j, 0 + 0j], [7.7 + 7.7j, 0 + 0j]]],
-             [[[0 + 0j, 0 + 0j], [0 + 0j, 0 + 0j]],
-              [[0 + 0j, 0 + 0j], [0 + 0j, 8.8 + 8.8j]]]]]],
+          [[[[[[1.1 + 1.1j, 0 + 0j], [0 + 0j, 0 + 0j]], [[0 + 0j, 0 + 0j], [
+              0 + 0j, 0 + 0j
+          ]]], [[[0 + 0j, 2.2 + 2.2j], [0 + 0j, 0 + 0j]], [[0 + 0j, 0 + 0j], [
+              0 + 0j, 0 + 0j
+          ]]]], [[[[0 + 0j, 0 + 0j], [3.3 + 3.3j, 0 + 0j]], [[0 + 0j, 0 + 0j], [
+              0 + 0j, 0 + 0j
+          ]]], [[[0 + 0j, 0 + 0j], [0 + 0j, 4.4 + 4.4j]], [[0 + 0j, 0 + 0j], [
+              0 + 0j, 0 + 0j
+          ]]]]], [[[[[0 + 0j, 0 + 0j], [0 + 0j, 0 + 0j]], [
+              [5.5 + 5.5j, 0 + 0j], [0 + 0j, 0 + 0j]
+          ]], [[[0 + 0j, 0 + 0j], [0 + 0j, 0 + 0j]], [[0 + 0j, 6.6 + 6.6j], [
+              0 + 0j, 0 + 0j
+          ]]]], [[[[0 + 0j, 0 + 0j], [0 + 0j, 0 + 0j]], [[0 + 0j, 0 + 0j], [
+              7.7 + 7.7j, 0 + 0j
+          ]]], [[[0 + 0j, 0 + 0j], [0 + 0j, 0 + 0j]],
+                [[0 + 0j, 0 + 0j], [0 + 0j, 8.8 + 8.8j]]]]]],
           dtype=dtype)
       self.diagOp(x, dtype, expected_ans)
 
   def testRankFourNumberTensor(self):
     for dtype in [np.float32, np.float64, np.int64, np.int32]:
       # Input with shape [2, 1, 2, 3]
-      x = np.array([[[[ 1,  2,  3],
-                      [ 4,  5,  6]]],
-                    [[[ 7,  8,  9],
-                      [10, 11, 12]]]], dtype=dtype)
+      x = np.array(
+          [[[[1, 2, 3], [4, 5, 6]]], [[[7, 8, 9], [10, 11, 12]]]], dtype=dtype)
       # Output with shape [2, 1, 2, 3, 2, 1, 2, 3]
       expected_ans = np.array(
-          [[[[[[[[1, 0, 0], [0, 0, 0]]],
-               [[[0, 0, 0], [0, 0, 0]]]],
-              [[[[0, 2, 0], [0, 0, 0]]],
-               [[[0, 0, 0], [0, 0, 0]]]],
-              [[[[0, 0, 3], [0, 0, 0]]],
-               [[[0, 0, 0], [0, 0, 0]]]]],
-             [[[[[0, 0, 0], [4, 0, 0]]],
-               [[[0, 0, 0], [0, 0, 0]]]],
-              [[[[0, 0, 0], [0, 5, 0]]],
-               [[[0, 0, 0], [0, 0, 0]]]],
-              [[[[0, 0, 0], [0, 0, 6]]],
-               [[[0, 0, 0], [0, 0, 0]]]]]]],
-
-           [[[[[[[0, 0, 0], [0, 0, 0]]],
-               [[[7, 0, 0], [0, 0, 0]]]],
-              [[[[0, 0, 0], [0, 0, 0]]],
-               [[[0, 8, 0], [0, 0, 0]]]],
-              [[[[0, 0, 0], [0, 0, 0]]],
-               [[[0, 0, 9], [0, 0, 0]]]]],
-             [[[[[0, 0, 0], [0, 0, 0]]],
-               [[[0, 0, 0], [10, 0, 0]]]],
-              [[[[0, 0, 0], [0, 0, 0]]],
-               [[[0, 0, 0], [0, 11, 0]]]],
-              [[[[0, 0, 0], [0, 0, 0]]],
-               [[[0, 0, 0], [0, 0, 12]]]]]]]], dtype=dtype)
+          [[[[[[[[1, 0, 0], [0, 0, 0]]], [[[0, 0, 0], [0, 0, 0]]]], [
+              [[[0, 2, 0], [0, 0, 0]]], [[[0, 0, 0], [0, 0, 0]]]
+          ], [[[[0, 0, 3], [0, 0, 0]]], [[[0, 0, 0], [0, 0, 0]]]]], [[
+              [[[0, 0, 0], [4, 0, 0]]], [[[0, 0, 0], [0, 0, 0]]]
+          ], [[[[0, 0, 0], [0, 5, 0]]], [[[0, 0, 0], [0, 0, 0]]]], [
+              [[[0, 0, 0], [0, 0, 6]]], [[[0, 0, 0], [0, 0, 0]]]
+          ]]]], [[[[[[[0, 0, 0], [0, 0, 0]]], [[[7, 0, 0], [0, 0, 0]]]], [
+              [[[0, 0, 0], [0, 0, 0]]], [[[0, 8, 0], [0, 0, 0]]]
+          ], [[[[0, 0, 0], [0, 0, 0]]], [[[0, 0, 9], [0, 0, 0]]]]], [[
+              [[[0, 0, 0], [0, 0, 0]]], [[[0, 0, 0], [10, 0, 0]]]
+          ], [[[[0, 0, 0], [0, 0, 0]]], [[[0, 0, 0], [0, 11, 0]]]
+             ], [[[[0, 0, 0], [0, 0, 0]]], [[[0, 0, 0], [0, 0, 12]]]]]]]],
+          dtype=dtype)
       self.diagOp(x, dtype, expected_ans)
 
   def testInvalidRank(self):
@@ -537,7 +504,9 @@ class DiagGradOpTest(test.TestCase):
           x1 = constant_op.constant(np.random.rand(*shape), dtype=dtype)
           y = array_ops.diag(x1)
           error = gradient_checker.compute_gradient_error(
-              x1, x1.get_shape().as_list(), y, y.get_shape().as_list())
+              x1,
+              x1.get_shape().as_list(), y,
+              y.get_shape().as_list())
           tf_logging.info("error = %f", error)
           self.assertLess(error, 1e-4)
 
@@ -555,7 +524,9 @@ class DiagGradPartOpTest(test.TestCase):
           x1 = constant_op.constant(np.random.rand(*shape), dtype=dtype)
           y = array_ops.diag_part(x1)
           error = gradient_checker.compute_gradient_error(
-              x1, x1.get_shape().as_list(), y, y.get_shape().as_list())
+              x1,
+              x1.get_shape().as_list(), y,
+              y.get_shape().as_list())
           tf_logging.info("error = %f", error)
           self.assertLess(error, 1e-4)
 
diff --git a/tensorflow/python/kernel_tests/dynamic_stitch_op_test.py b/tensorflow/python/kernel_tests/dynamic_stitch_op_test.py
index cf723f5eec3c31c93d67fd6a34a21c8377b74c84..a4b30e4319527c6f3354ac83bf0e3a5114eb45e8 100644
--- a/tensorflow/python/kernel_tests/dynamic_stitch_op_test.py
+++ b/tensorflow/python/kernel_tests/dynamic_stitch_op_test.py
@@ -48,8 +48,10 @@ class DynamicStitchTestBase(object):
 
   def testShapeInferenceForScalarWithNonConstantIndices(self):
     with self.test_session(use_gpu=True):
-      indices = [array_ops.placeholder(dtype=dtypes.int32),
-                 constant_op.constant(1)]
+      indices = [
+          array_ops.placeholder(dtype=dtypes.int32),
+          constant_op.constant(1)
+      ]
       data = [constant_op.constant(40), constant_op.constant(60)]
       for step in -1, 1:
         stitched_t = self.stitch_op(indices[::step], data)
@@ -61,7 +63,8 @@ class DynamicStitchTestBase(object):
   def testSimpleOneDimensional(self):
     with self.test_session(use_gpu=True):
       indices = [
-          constant_op.constant([0, 4, 7]), constant_op.constant([1, 6, 2, 3, 5])
+          constant_op.constant([0, 4, 7]),
+          constant_op.constant([1, 6, 2, 3, 5])
       ]
       data = [
           constant_op.constant([0, 40, 70]),
@@ -86,7 +89,8 @@ class DynamicStitchTestBase(object):
   def testSimpleTwoDimensional(self):
     with self.test_session(use_gpu=True):
       indices = [
-          constant_op.constant([0, 4, 7]), constant_op.constant([1, 6]),
+          constant_op.constant([0, 4, 7]),
+          constant_op.constant([1, 6]),
           constant_op.constant([2, 3, 5])
       ]
       data = [
@@ -104,7 +108,8 @@ class DynamicStitchTestBase(object):
   def testHigherRank(self):
     with self.test_session(use_gpu=True) as sess:
       indices = [
-          constant_op.constant(6), constant_op.constant([4, 1]),
+          constant_op.constant(6),
+          constant_op.constant([4, 1]),
           constant_op.constant([[5, 2], [0, 3]])
       ]
       data = [
@@ -127,7 +132,8 @@ class DynamicStitchTestBase(object):
 
   def testErrorIndicesMultiDimensional(self):
     indices = [
-        constant_op.constant([0, 4, 7]), constant_op.constant([[1, 6, 2, 3, 5]])
+        constant_op.constant([0, 4, 7]),
+        constant_op.constant([[1, 6, 2, 3, 5]])
     ]
     data = [
         constant_op.constant([[0, 40, 70]]),
@@ -138,7 +144,8 @@ class DynamicStitchTestBase(object):
 
   def testErrorDataNumDimsMismatch(self):
     indices = [
-        constant_op.constant([0, 4, 7]), constant_op.constant([1, 6, 2, 3, 5])
+        constant_op.constant([0, 4, 7]),
+        constant_op.constant([1, 6, 2, 3, 5])
     ]
     data = [
         constant_op.constant([0, 40, 70]),
@@ -149,7 +156,8 @@ class DynamicStitchTestBase(object):
 
   def testErrorDataDimSizeMismatch(self):
     indices = [
-        constant_op.constant([0, 4, 5]), constant_op.constant([1, 6, 2, 3])
+        constant_op.constant([0, 4, 5]),
+        constant_op.constant([1, 6, 2, 3])
     ]
     data = [
         constant_op.constant([[0], [40], [70]]),
@@ -160,7 +168,8 @@ class DynamicStitchTestBase(object):
 
   def testErrorDataAndIndicesSizeMismatch(self):
     indices = [
-        constant_op.constant([0, 4, 7]), constant_op.constant([1, 6, 2, 3, 5])
+        constant_op.constant([0, 4, 7]),
+        constant_op.constant([1, 6, 2, 3, 5])
     ]
     data = [
         constant_op.constant([0, 40, 70]),
@@ -235,13 +244,15 @@ class ParallelDynamicStitchTest(DynamicStitchTestBase, test.TestCase):
   def testHigherRankGPU(self):
     with self.test_session() as sess:
       indices = [
-          constant_op.constant(6), constant_op.constant([4, 1]),
+          constant_op.constant(6),
+          constant_op.constant([4, 1]),
           constant_op.constant([[5, 2], [0, 3]])
       ]
       data = [
           constant_op.constant([61, 62], dtype=dtypes.float32),
           constant_op.constant([[41, 42], [11, 12]], dtype=dtypes.float32),
-          constant_op.constant([[[51, 52], [21, 22]], [[1, 2], [31, 32]]], dtype=dtypes.float32)
+          constant_op.constant(
+              [[[51, 52], [21, 22]], [[1, 2], [31, 32]]], dtype=dtypes.float32)
       ]
       stitched_t = data_flow_ops.dynamic_stitch(indices, data)
       stitched_val = stitched_t.eval()
diff --git a/tensorflow/python/kernel_tests/extract_image_patches_op_test.py b/tensorflow/python/kernel_tests/extract_image_patches_op_test.py
index 5c7624f1f6be4da91ca74d4ef2ed81a21890b35c..6ea9f1badc3b8fac06fe6328f95714b93de97c0e 100644
--- a/tensorflow/python/kernel_tests/extract_image_patches_op_test.py
+++ b/tensorflow/python/kernel_tests/extract_image_patches_op_test.py
@@ -84,7 +84,7 @@ class ExtractImagePatches(test.TestCase):
           patches=patches)
 
   def testKsize2x2Stride1x1Rate1x1Valid(self):
-    """Test for 1x1 kernel ."""
+    """Test for 2x2 kernel with VALID padding."""
     # [1, 2, 2, 1]
     image = [[[[1], [2]], [[3], [4]]]]
     # [1, 1, 1, 4]
@@ -98,7 +98,7 @@ class ExtractImagePatches(test.TestCase):
         patches=patches)
 
   def testKsize2x2Stride1x1Rate1x1Same(self):
-    """Test for 1x1 kernel ."""
+    """Test for 2x2 kernel with SAME padding."""
     # [1, 2, 2, 1]
     image = [[[[1], [2]], [[3], [4]]]]
     # [1, 2, 2, 4]
@@ -111,6 +111,20 @@ class ExtractImagePatches(test.TestCase):
         padding="SAME",
         patches=patches)
 
+  def testKsize2x2Stride1x1Rate2x2Valid(self):
+    """Test for 2x2 kernel with 2x2 dilation."""
+    # [1, 2, 2, 1]
+    image = np.arange(16).reshape(1, 4, 4, 1).astype(np.float32)
+    # [1, 2, 2, 4]
+    patches = [[[[0, 2, 8, 10], [1, 3, 9, 11]],
+                [[4, 6, 12, 14], [5, 7, 13, 15]]]]
+    self._VerifyValues(
+        image,
+        ksizes=[2, 2],
+        strides=[1, 1],
+        rates=[2, 2],
+        padding="VALID",
+        patches=patches)
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/io_ops_test.py b/tensorflow/python/kernel_tests/io_ops_test.py
index f91875c6f0c1a7bfa388ec1b1a58f06b65889c3e..61944f7e3197844d00cbc001459e48b50c9003b4 100644
--- a/tensorflow/python/kernel_tests/io_ops_test.py
+++ b/tensorflow/python/kernel_tests/io_ops_test.py
@@ -1,4 +1,4 @@
-﻿# -*- coding: utf-8 -*-
+# -*- coding: utf-8 -*-
 # Copyright 2015 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/tensorflow/python/kernel_tests/losses_test.py b/tensorflow/python/kernel_tests/losses_test.py
index 81af3a0887d09a7736a145a5b3c99c9391691724..f1fbe1a745bcc851154121e6e2123b92bba6fec1 100644
--- a/tensorflow/python/kernel_tests/losses_test.py
+++ b/tensorflow/python/kernel_tests/losses_test.py
@@ -953,14 +953,14 @@ class MeanPairwiseSquaredErrorTest(test.TestCase):
     # Compute the expected loss 'manually'.
     total = np.zeros((batch_size,))
     for b in range(batch_size):
-      for i in range(dims):
-        for j in range(dims):
+      for i in range(dims-1):
+        for j in range(i+1, dims):
           x = self._predictions[b, i].item() - self._predictions[b, j].item()
           y = self._labels[b, i].item() - self._labels[b, j].item()
           diff = (x - y)
           total[b] += (diff * diff)
 
-    self._expected_losses = np.divide(total, 9.0)
+    self._expected_losses = np.divide(total, 3.0)
 
   def testValueErrorThrownWhenWeightIsNone(self):
     with self.test_session():
@@ -1060,7 +1060,7 @@ class MeanPairwiseSquaredErrorTest(test.TestCase):
         [[8, 1, 3], [7, 8, 9], [10, 11, 12]],
     ])
     self._test_valid_weights(
-        labels, predictions, expected_loss=122.22222)
+        labels, predictions, expected_loss=137.5)
 
   def test3dWeightedScalar(self):
     labels = np.array([
@@ -1073,7 +1073,7 @@ class MeanPairwiseSquaredErrorTest(test.TestCase):
     ])
     weight = 3.0
     self._test_valid_weights(
-        labels, predictions, expected_loss=weight * 122.22222,
+        labels, predictions, expected_loss=weight * 137.5,
         weights=weight)
 
   def _test_invalid_weights(
@@ -1124,7 +1124,7 @@ class MeanPairwiseSquaredErrorTest(test.TestCase):
     ])
     self._test_valid_weights(
         # TODO(ptucker): This doesn't look right.
-        labels, predictions, expected_loss=9 * 122.22222,
+        labels, predictions, expected_loss=9 * 137.5,
         weights=np.ones((2, 3, 3)))
 
   def testLossWithAllZeroBatchSpecificWeights(self):
@@ -1345,6 +1345,34 @@ class ComputeWeightedLossTest(test.TestCase):
               self.assertAllClose(
                   np.mean(self._raw_losses), unweighted_loss.eval())
 
+  def testUnweightedFromPlaceholder(self):
+    for reduction in losses.Reduction.all():
+      with ops.Graph().as_default() as g:
+        self.assertEqual(0, len(util.get_losses()))
+        raw_losses = array_ops.placeholder(dtype=dtypes.float32)
+        feed_dict = {raw_losses: self._raw_losses}
+        unweighted_losses = (
+            losses.compute_weighted_loss(raw_losses, reduction=reduction),
+            losses.compute_weighted_loss(
+                raw_losses, weights=np.ones((1, 1, 1)), reduction=reduction),
+            losses.compute_weighted_loss(
+                raw_losses, weights=np.ones((1, 1, 4)), reduction=reduction),
+        )
+        self.assertEqual(3, len(util.get_losses()))
+        with self.test_session(g):
+          for unweighted_loss in unweighted_losses:
+            if reduction == losses.Reduction.NONE:
+              self.assertAllClose(
+                  self._raw_losses, unweighted_loss.eval(feed_dict))
+            elif reduction == losses.Reduction.SUM:
+              self.assertAllClose(
+                  np.sum(self._raw_losses), unweighted_loss.eval(feed_dict))
+            else:
+              # reduction one of MEAN, SUM_OVER_NONZERO_WEIGHTS,
+              # SUM_BY_NONZERO_WEIGHTS or SUM_OVER_BATCH_SIZE.
+              self.assertAllClose(
+                  np.mean(self._raw_losses), unweighted_loss.eval(feed_dict))
+
   def testScalarWeight(self):
     with ops.Graph().as_default():
       self.assertEqual(0, len(util.get_losses()))
diff --git a/tensorflow/python/kernel_tests/manip_ops_test.py b/tensorflow/python/kernel_tests/manip_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..3044b21aa4271cabda2a174a85b6db16fe13f93b
--- /dev/null
+++ b/tensorflow/python/kernel_tests/manip_ops_test.py
@@ -0,0 +1,137 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for manip_ops."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import errors_impl
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import manip_ops
+from tensorflow.python.ops import gradient_checker
+from tensorflow.python.platform import test as test_lib
+
+import numpy as np
+
+# pylint: disable=g-import-not-at-top
+try:
+  from distutils.version import StrictVersion as Version
+  # numpy.roll for multiple shifts was introduced in numpy version 1.12.0
+  NP_ROLL_CAN_MULTISHIFT = Version(np.version.version) >= Version('1.12.0')
+except ImportError:
+  NP_ROLL_CAN_MULTISHIFT = False
+# pylint: enable=g-import-not-at-top
+
+class RollTest(test_util.TensorFlowTestCase):
+  def _testRoll(self, np_input, shift, axis):
+    expected_roll = np.roll(np_input, shift, axis)
+    with self.test_session():
+      roll = manip_ops.roll(np_input, shift, axis)
+      self.assertAllEqual(roll.eval(), expected_roll)
+
+  def _testGradient(self, np_input, shift, axis):
+    with self.test_session():
+      inx = constant_op.constant(np_input.tolist())
+      xs = list(np_input.shape)
+      y = manip_ops.roll(inx, shift, axis)
+      # Expected y's shape to be the same
+      ys = xs
+      jacob_t, jacob_n = gradient_checker.compute_gradient(
+          inx, xs, y, ys, x_init_value=np_input)
+      self.assertAllClose(jacob_t, jacob_n, rtol=1e-5, atol=1e-5)
+
+  def _testAll(self, np_input, shift, axis):
+    self._testRoll(np_input, shift, axis)
+    if np_input.dtype == np.float32:
+      self._testGradient(np_input, shift, axis)
+
+  def testIntTypes(self):
+    for t in [np.int32, np.int64]:
+      self._testAll(np.random.randint(-100, 100, (5)).astype(t), 3, 0)
+      if NP_ROLL_CAN_MULTISHIFT:
+        self._testAll(np.random.randint(-100, 100, (4, 4, 3)).astype(t),
+                      [1, -2, 3], [0, 1, 2])
+        self._testAll(np.random.randint(-100, 100, (4, 2, 1, 3)).astype(t),
+                      [0, 1, -2], [1, 2, 3])
+
+  def testFloatTypes(self):
+    for t in [np.float32, np.float64]:
+      self._testAll(np.random.rand(5).astype(t), 2, 0)
+      if NP_ROLL_CAN_MULTISHIFT:
+        self._testAll(np.random.rand(3, 4).astype(t), [1, 2], [1, 0])
+        self._testAll(np.random.rand(1, 3, 4).astype(t), [1, 0, -3], [0, 1, 2])
+
+  def testComplexTypes(self):
+    for t in [np.complex64, np.complex128]:
+      x = np.random.rand(4, 4).astype(t)
+      self._testAll(x + 1j * x, 2, 0)
+      if NP_ROLL_CAN_MULTISHIFT:
+        x = np.random.rand(2, 5).astype(t)
+        self._testAll(x + 1j * x, [1, 2], [1, 0])
+        x = np.random.rand(3, 2, 1, 1).astype(t)
+        self._testAll(x + 1j * x, [2, 1, 1, 0], [0, 3, 1, 2])
+
+
+  def testRollInputMustVectorHigherRaises(self):
+    tensor = 7
+    shift = 1
+    axis = 0
+    with self.test_session():
+      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                   "input must be 1-D or higher"):
+        manip_ops.roll(tensor, shift, axis).eval()
+
+  def testRollAxisMustBeScalarOrVectorRaises(self):
+    tensor = [[1, 2],
+              [3, 4]]
+    shift = 1
+    axis = [[0, 1]]
+    with self.test_session():
+      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                   "axis must be a scalar or a 1-D vector"):
+        manip_ops.roll(tensor, shift, axis).eval()
+
+  def testRollShiftMustBeScalarOrVectorRaises(self):
+    tensor = [[1, 2],
+              [3, 4]]
+    shift = [[0, 1]]
+    axis = 1
+    with self.test_session():
+      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                   "shift must be a scalar or a 1-D vector"):
+        manip_ops.roll(tensor, shift, axis).eval()
+
+  def testRollShiftAndAxisMustBeSameSizeRaises(self):
+    tensor = [[1, 2],
+              [3, 4]]
+    shift = [1]
+    axis = [0, 1]
+    with self.test_session():
+      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                   "shift and axis must have the same size"):
+        manip_ops.roll(tensor, shift, axis).eval()
+
+  def testRollAxisOutOfRangeRaises(self):
+    tensor = [1, 2]
+    shift = 1
+    axis = 1
+    with self.test_session():
+      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                   "is out of range"):
+        manip_ops.roll(tensor, shift, axis).eval()
+
+if __name__ == "__main__":
+  test_lib.main()
diff --git a/tensorflow/python/kernel_tests/map_stage_op_test.py b/tensorflow/python/kernel_tests/map_stage_op_test.py
index 8b669450590f1fce0f14a9e5d64e1055dbe23f4e..acfafde9e0f74d4e3ad6f2ee8ada9da3df94f5b9 100644
--- a/tensorflow/python/kernel_tests/map_stage_op_test.py
+++ b/tensorflow/python/kernel_tests/map_stage_op_test.py
@@ -26,6 +26,7 @@ from tensorflow.python.platform import test
 
 TIMEOUT = 1
 
+
 class MapStageTest(test.TestCase):
 
   def testSimple(self):
@@ -83,7 +84,7 @@ class MapStageTest(test.TestCase):
             [dtypes.float32, dtypes.float32],
             shapes=[[], [128, 128]],
             names=['x', 'v'])
-        stage = stager.put(pi,{'x': x, 'v': v})
+        stage = stager.put(pi, {'x': x, 'v': v})
         key, ret = stager.get(gi)
         z = ret['x']
         y = ret['v']
@@ -128,8 +129,11 @@ class MapStageTest(test.TestCase):
         gi = array_ops.placeholder(dtypes.int64)
         p = array_ops.placeholder(dtypes.int32, name='p')
       with ops.device(test.gpu_device_name()):
-        stager = data_flow_ops.MapStagingArea([dtypes.int32, ], shapes=[[]])
-        stage = stager.put(pi,[x], [0])
+        stager = data_flow_ops.MapStagingArea(
+            [
+                dtypes.int32,
+            ], shapes=[[]])
+        stage = stager.put(pi, [x], [0])
         peek = stager.peek(gi)
         size = stager.size()
 
@@ -158,7 +162,7 @@ class MapStageTest(test.TestCase):
             [dtypes.float32, dtypes.float32],
             shapes=[[], [128, 128]],
             names=['x', 'v'])
-        stage = stager.put(pi,{'x': x, 'v': v})
+        stage = stager.put(pi, {'x': x, 'v': v})
         size = stager.size()
         clear = stager.clear()
 
@@ -172,7 +176,6 @@ class MapStageTest(test.TestCase):
       sess.run(clear)
       self.assertEqual(sess.run(size), 0)
 
-
   def testCapacity(self):
     capacity = 3
 
@@ -182,8 +185,10 @@ class MapStageTest(test.TestCase):
         pi = array_ops.placeholder(dtypes.int64, name='pi')
         gi = array_ops.placeholder(dtypes.int64, name='gi')
       with ops.device(test.gpu_device_name()):
-        stager = data_flow_ops.MapStagingArea([dtypes.int32, ],
-          capacity=capacity, shapes=[[]])
+        stager = data_flow_ops.MapStagingArea(
+            [
+                dtypes.int32,
+            ], capacity=capacity, shapes=[[]])
 
       stage = stager.put(pi, [x], [0])
       get = stager.get()
@@ -222,9 +227,8 @@ class MapStageTest(test.TestCase):
         self.fail("Expected to timeout on iteration '{}' "
                   "but instead timed out on iteration '{}' "
                   "Staging Area size is '{}' and configured "
-                  "capacity is '{}'.".format(capacity, i,
-                                            sess.run(size),
-                                            capacity))
+                  "capacity is '{}'.".format(capacity, i, sess.run(size),
+                                             capacity))
 
       # Should have capacity elements in the staging area
       self.assertTrue(sess.run(size) == capacity)
@@ -236,8 +240,8 @@ class MapStageTest(test.TestCase):
       self.assertTrue(sess.run(size) == 0)
 
   def testMemoryLimit(self):
-    memory_limit = 512*1024  # 512K
-    chunk = 200*1024 # 256K
+    memory_limit = 512 * 1024  # 512K
+    chunk = 200 * 1024  # 256K
     capacity = memory_limit // chunk
 
     with ops.Graph().as_default() as G:
@@ -246,8 +250,8 @@ class MapStageTest(test.TestCase):
         pi = array_ops.placeholder(dtypes.int64, name='pi')
         gi = array_ops.placeholder(dtypes.int64, name='gi')
       with ops.device(test.gpu_device_name()):
-        stager = data_flow_ops.MapStagingArea([dtypes.uint8],
-          memory_limit=memory_limit, shapes=[[]])
+        stager = data_flow_ops.MapStagingArea(
+            [dtypes.uint8], memory_limit=memory_limit, shapes=[[]])
         stage = stager.put(pi, [x], [0])
         get = stager.get()
         size = stager.size()
@@ -287,9 +291,8 @@ class MapStageTest(test.TestCase):
         self.fail("Expected to timeout on iteration '{}' "
                   "but instead timed out on iteration '{}' "
                   "Staging Area size is '{}' and configured "
-                  "capacity is '{}'.".format(capacity, i,
-                                            sess.run(size),
-                                            capacity))
+                  "capacity is '{}'.".format(capacity, i, sess.run(size),
+                                             capacity))
 
       # Should have capacity elements in the staging area
       self.assertTrue(sess.run(size) == capacity)
@@ -310,8 +313,10 @@ class MapStageTest(test.TestCase):
         pi = array_ops.placeholder(dtypes.int64, name='pi')
         gi = array_ops.placeholder(dtypes.int64, name='gi')
       with ops.device(test.gpu_device_name()):
-        stager = data_flow_ops.MapStagingArea([dtypes.int32, ],
-          shapes=[[]], ordered=True)
+        stager = data_flow_ops.MapStagingArea(
+            [
+                dtypes.int32,
+            ], shapes=[[]], ordered=True)
         stage = stager.put(pi, [x], [0])
         get = stager.get()
         size = stager.size()
@@ -349,7 +354,7 @@ class MapStageTest(test.TestCase):
         stager = data_flow_ops.MapStagingArea(
             [dtypes.float32, dtypes.float32, dtypes.float32],
             names=['x', 'v', 'f'])
-        stage_xf = stager.put(pi,{'x': x, 'f': f})
+        stage_xf = stager.put(pi, {'x': x, 'f': f})
         stage_v = stager.put(pi, {'v': v})
         key, ret = stager.get(gi)
         size = stager.size()
@@ -373,12 +378,13 @@ class MapStageTest(test.TestCase):
       self.assertTrue(sess.run([size, isize]) == [1, 1])
       # We can now obtain tuple associated with key 0
       self.assertTrue(
-          sess.run([key, ret],
-                   feed_dict={gi: 0}) == [0, {
-                       'x': 1,
-                       'f': 2,
-                       'v': 1
-                   }])
+          sess.run([key, ret], feed_dict={
+              gi: 0
+          }) == [0, {
+              'x': 1,
+              'f': 2,
+              'v': 1
+          }])
 
       # 0 complete and 1 incomplete entry
       self.assertTrue(sess.run([size, isize]) == [0, 1])
@@ -386,12 +392,13 @@ class MapStageTest(test.TestCase):
       sess.run(stage_v, feed_dict={pi: 1, v: 3})
       # We can now obtain tuple associated with key 1
       self.assertTrue(
-          sess.run([key, ret],
-                   feed_dict={gi: 1}) == [1, {
-                       'x': 1,
-                       'f': 2,
-                       'v': 3
-                   }])
+          sess.run([key, ret], feed_dict={
+              gi: 1
+          }) == [1, {
+              'x': 1,
+              'f': 2,
+              'v': 3
+          }])
 
   def testPartialIndexInsert(self):
     with ops.Graph().as_default() as G:
@@ -450,7 +457,7 @@ class MapStageTest(test.TestCase):
         stager = data_flow_ops.MapStagingArea(
             [dtypes.float32, dtypes.float32, dtypes.float32],
             names=['x', 'v', 'f'])
-        stage_xf = stager.put(pi,{'x': x, 'f': f})
+        stage_xf = stager.put(pi, {'x': x, 'f': f})
         stage_v = stager.put(pi, {'v': v})
         peek_xf = stager.peek(pei, ['x', 'f'])
         peek_v = stager.peek(pei, ['v'])
@@ -487,11 +494,12 @@ class MapStageTest(test.TestCase):
 
       # We can now obtain 'x' and 'f' values associated with key 0
       self.assertTrue(
-          sess.run([key_xf, get_xf],
-                   feed_dict={gi: 0}) == [0, {
-                       'x': 1,
-                       'f': 2
-                   }])
+          sess.run([key_xf, get_xf], feed_dict={
+              gi: 0
+          }) == [0, {
+              'x': 1,
+              'f': 2
+          }])
       # Still have 1 complete and 1 incomplete entry
       self.assertTrue(sess.run([size, isize]) == [1, 1])
 
@@ -499,14 +507,15 @@ class MapStageTest(test.TestCase):
       with self.assertRaises(errors.InvalidArgumentError) as cm:
         sess.run([key_xf, get_xf], feed_dict={gi: 0})
 
-      exc_str = ("Tensor at index '0' for key '0' "
-                "has already been removed.")
+      exc_str = ("Tensor at index '0' for key '0' " 'has already been removed.')
 
       self.assertTrue(exc_str in cm.exception.message)
 
       # Obtain 'v' value associated with key 0
       self.assertTrue(
-          sess.run([key_v, get_v], feed_dict={gi: 0}) == [0, {
+          sess.run([key_v, get_v], feed_dict={
+              gi: 0
+          }) == [0, {
               'v': 1
           }])
       # 0 complete and 1 incomplete entry
@@ -523,7 +532,9 @@ class MapStageTest(test.TestCase):
       self.assertTrue(sess.run([size, isize]) == [1, 0])
       # We can now obtain 'x' and 'f' values associated with key 1
       self.assertTrue(
-          sess.run([pop_key_v, pop_v], feed_dict={pi: 1}) == [1, {
+          sess.run([pop_key_v, pop_v], feed_dict={
+              pi: 1
+          }) == [1, {
               'v': 1
           }])
       # Nothing is left
@@ -557,18 +568,20 @@ class MapStageTest(test.TestCase):
       self.assertTrue(sess.run([size, isize]) == [1, 0])
 
       # Partial get using indices
-      self.assertTrue(sess.run([key_xf, get_xf],
-            feed_dict={gi: 0}) == [0, [1, 2]])
+      self.assertTrue(
+          sess.run([key_xf, get_xf], feed_dict={
+              gi: 0
+          }) == [0, [1, 2]])
 
       # Still some of key 0 left
       self.assertTrue(sess.run([size, isize]) == [1, 0])
 
       # Partial get of remaining index
-      self.assertTrue(sess.run([key_v, get_v],
-            feed_dict={gi: 0}) == [0, [3]])
+      self.assertTrue(sess.run([key_v, get_v], feed_dict={gi: 0}) == [0, [3]])
 
       # All gone
       self.assertTrue(sess.run([size, isize]) == [0, 0])
 
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/kernel_tests/matrix_band_part_op_test.py b/tensorflow/python/kernel_tests/matrix_band_part_op_test.py
index 317b8dc05beac7642c384bf89e6d154be50f6992..68d626de2c5cdd91ee332247c05ddce2a558a35e 100644
--- a/tensorflow/python/kernel_tests/matrix_band_part_op_test.py
+++ b/tensorflow/python/kernel_tests/matrix_band_part_op_test.py
@@ -21,6 +21,7 @@ import numpy as np
 
 from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes as dtypes_lib
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
@@ -54,9 +55,13 @@ def _GetMatrixBandPartTest(dtype_, batch_shape_, shape_):
           band_np = np.tril(band_np, upper)
         if batch_shape_ is not ():
           band_np = np.tile(band_np, batch_shape_ + (1, 1))
-        with self.test_session(use_gpu=False):
-          band = array_ops.matrix_band_part(batch_mat, lower, upper)
-          self.assertAllEqual(band_np, band.eval())
+        for index_dtype in [dtypes_lib.int32, dtypes_lib.int64]:
+          with self.test_session(use_gpu=False):
+            band = array_ops.matrix_band_part(
+                batch_mat,
+                constant_op.constant(lower, index_dtype),
+                constant_op.constant(upper, index_dtype))
+            self.assertAllEqual(band_np, band.eval())
 
   return Test
 
diff --git a/tensorflow/python/kernel_tests/partitioned_variables_test.py b/tensorflow/python/kernel_tests/partitioned_variables_test.py
index 56a07cb012f08dec750c5ee18cc73b3b127ef5dd..f5c6255c346961fec7245889229ea1c4b89fa388 100644
--- a/tensorflow/python/kernel_tests/partitioned_variables_test.py
+++ b/tensorflow/python/kernel_tests/partitioned_variables_test.py
@@ -50,8 +50,7 @@ class PartitionerCreatorsTest(test.TestCase):
     with self.test_session():
       partitioner = partitioned_variables.fixed_size_partitioner(4, axis=0)
       with variable_scope.variable_scope("root", partitioner=partitioner):
-        v0 = variable_scope.get_variable(
-            "v0", dtype=dtypes.int64, shape=[20])
+        v0 = variable_scope.get_variable("v0", dtype=dtypes.int64, shape=[20])
         v0_list = v0._get_variable_list()
         self.assertEqual(len(v0_list), 4)
 
@@ -169,8 +168,10 @@ class PartitionerCreatorsTest(test.TestCase):
           max_shards=2)
 
       # Use the partitioner with strings
-      partitioner_axis3_str = partitioned_variables.variable_axis_size_partitioner(
-          axis=3, max_shard_bytes=32768, bytes_per_string_element=8)
+      partitioner_axis3_str = partitioned_variables.variable_axis_size_partitioner(  # pylint: disable=line-too-long
+          axis=3,
+          max_shard_bytes=32768,
+          bytes_per_string_element=8)
 
       with variable_scope.variable_scope(
           "root", partitioner=partitioner_axis3_str):
@@ -423,8 +424,7 @@ class PartitionedVariablesTestCase(test.TestCase):
   def testRandomInitUnevenPartitions(self):
     with self.test_session():
       rnd = variables.Variable(
-          random_ops.random_uniform(
-              [20, 43], dtype=dtypes.float64))
+          random_ops.random_uniform([20, 43], dtype=dtypes.float64))
       var_lists = [
           partitioned_variables.create_partitioned_variables(
               rnd.get_shape(), [1, i], rnd.initialized_value())
diff --git a/tensorflow/python/kernel_tests/pooling_ops_test.py b/tensorflow/python/kernel_tests/pooling_ops_test.py
index 5c0ea8ec8edbd1a1f523630f61afbe28adf77a19..4466beeec96509b3761e34d885276e1510c62d10 100644
--- a/tensorflow/python/kernel_tests/pooling_ops_test.py
+++ b/tensorflow/python/kernel_tests/pooling_ops_test.py
@@ -159,8 +159,10 @@ class PoolingTest(test.TestCase):
       elif data_format == "NCHW":
         t = test_util.NCHWToNHWC(t)
       if v2:
-        actual = t.eval(feed_dict={ksize_placeholder: ksize,
-                                   strides_placeholder: strides})
+        actual = t.eval(feed_dict={
+            ksize_placeholder: ksize,
+            strides_placeholder: strides
+        })
       else:
         actual = t.eval()
         self.assertShapeEqual(actual, t)
@@ -195,8 +197,15 @@ class PoolingTest(test.TestCase):
       self._VerifyOneType(pool_func, input_sizes, ksize, strides, padding,
                           data_format, dtypes.float16, expected, use_gpu, v2)
 
-  def _VerifyValues(self, pool_func, input_sizes, ksize, strides, padding,
-                    expected, use_gpu, v2=False):
+  def _VerifyValues(self,
+                    pool_func,
+                    input_sizes,
+                    ksize,
+                    strides,
+                    padding,
+                    expected,
+                    use_gpu,
+                    v2=False):
     """Verifies the output values of the pooling function.
 
     Args:
@@ -1148,16 +1157,16 @@ class PoolingTest(test.TestCase):
   def _testMaxPoolGradSamePadding3_1(self, data_format, use_gpu):
     for pool_func in [gen_nn_ops._max_pool_v2, nn_ops.max_pool]:
       self._ConstructAndTestGradient(
-        pool_func,
-        input_sizes=[1, 7, 7, 1],
-        output_sizes=[1, 7, 7, 1],
-        window_rows=3,
-        window_cols=3,
-        row_stride=1,
-        col_stride=1,
-        padding="SAME",
-        data_format=data_format,
-        use_gpu=use_gpu)
+          pool_func,
+          input_sizes=[1, 7, 7, 1],
+          output_sizes=[1, 7, 7, 1],
+          window_rows=3,
+          window_cols=3,
+          row_stride=1,
+          col_stride=1,
+          padding="SAME",
+          data_format=data_format,
+          use_gpu=use_gpu)
 
   def testMaxPoolGrad(self):
     for (data_format, use_gpu) in GetTestConfigs():
@@ -1202,17 +1211,14 @@ class PoolingTest(test.TestCase):
     pool_func = gen_nn_ops._max_pool_v2 if v2 else nn_ops.max_pool
     with self.test_session(use_gpu=use_gpu):
       input_tensor = constant_op.constant(input_data, shape=input_sizes)
-      output_tensor = pool_func(input_tensor,
-                                [1, window_rows, window_cols, 1],
+      output_tensor = pool_func(input_tensor, [1, window_rows, window_cols, 1],
                                 [1, row_stride, col_stride, 1], padding)
       output_backprop_tensor = constant_op.constant(
           output_backprop, shape=output_sizes)
 
-      input_backprop_tensor = self._MaxPoolGrad(input_tensor, output_tensor,
-                                                output_backprop_tensor,
-                                                window_rows, window_cols,
-                                                row_stride, col_stride,
-                                                padding, v2)
+      input_backprop_tensor = self._MaxPoolGrad(
+          input_tensor, output_tensor, output_backprop_tensor, window_rows,
+          window_cols, row_stride, col_stride, padding, v2)
 
       actual_input_backprop = input_backprop_tensor.eval()
       self.assertShapeEqual(actual_input_backprop, input_backprop_tensor)
@@ -1414,13 +1420,15 @@ class PoolingTest(test.TestCase):
   def _testMaxPoolGradDirectWithNans2_2(self):
     input_data = [float("nan")] * 16
     output_backprop = [
-        float("nan"), 12.0, 13.0, 15.0, float("nan"), 17.0, 19.0, 20.0,
+        float("nan"), 12.0, 13.0, 15.0,
+        float("nan"), 17.0, 19.0, 20.0,
         float("nan")
     ]
     # Test the CPU implementation, which propagates diffs in case of NaN
     expected_input_backprop_tf_cpu = [
-        float("nan"), 12.0, 13.0, 0.0, 15.0, float("nan"), 17.0, 0.0, 19.0,
-        20.0, float("nan"), 0.0, 0.0, 0.0, 0.0, 0.0
+        float("nan"), 12.0, 13.0, 0.0, 15.0,
+        float("nan"), 17.0, 0.0, 19.0, 20.0,
+        float("nan"), 0.0, 0.0, 0.0, 0.0, 0.0
     ]
     for v2 in [True, False]:
       self._testMaxPoolGradDirect(
@@ -1636,10 +1644,9 @@ class PoolingTest(test.TestCase):
     Returns:
       A Tensor.
     """
-    return gen_nn_ops._max_pool_grad_grad(orig_input, orig_output, grad,
-                                          [1, window_rows, window_cols,
-                                           1], [1, row_stride, col_stride,
-                                                1], padding)
+    return gen_nn_ops._max_pool_grad_grad(
+        orig_input, orig_output, grad, [1, window_rows, window_cols, 1],
+        [1, row_stride, col_stride, 1], padding)
 
   def testAvgPoolGrad(self):
     for (data_format, use_gpu) in GetTestConfigs():
@@ -1793,8 +1800,7 @@ class PoolingTest(test.TestCase):
     ]:
       with self.assertRaises(ValueError):
         pool_func(
-            array_ops.placeholder(
-                dtypes.float32, shape=[1, 3]),
+            array_ops.placeholder(dtypes.float32, shape=[1, 3]),
             ksize=[1, 1, 1, 1],
             strides=[1, 1, 1, 1],
             padding="SAME")
@@ -1805,30 +1811,29 @@ class PoolingTest(test.TestCase):
       if test.is_gpu_available():
         pool_funcs.append(nn_ops.max_pool_with_argmax)
       for pool_func in pool_funcs:
-        # Illegal strides.
-        with self.assertRaisesRegexp(
-            errors_impl.UnimplementedError,
-            "Pooling is not yet supported on the batch"):
-          sess.run(
-              pool_func(
-                  array_ops.placeholder(dtypes.float32),
-                  ksize=[1, 1, 1, 1],
-                  strides=[2, 1, 1, 1],
-                  padding="SAME"))
+        if pool_func != nn_ops.max_pool:
+          # Illegal strides.
+          with self.assertRaisesRegexp(
+              errors_impl.UnimplementedError,
+              "Pooling is not yet supported on the batch"):
+            sess.run(
+                pool_func(
+                    array_ops.placeholder(dtypes.float32),
+                    ksize=[1, 1, 1, 1],
+                    strides=[2, 1, 1, 1],
+                    padding="SAME"))
 
         # Filter too large.
         with self.assertRaisesRegexp(ValueError, "Negative dimension size"):
           sess.run(
               pool_func(
-                  array_ops.placeholder(
-                      dtypes.float32, shape=[32, 20, 20, 3]),
+                  array_ops.placeholder(dtypes.float32, shape=[32, 20, 20, 3]),
                   ksize=[1, 20, 21, 1],
                   strides=[1, 1, 1, 1],
                   padding="VALID"))
         with self.assertRaisesRegexp(ValueError, "Negative dimension size"):
           pool_func(
-              array_ops.placeholder(
-                  dtypes.float32, shape=[32, 20, 20, 3]),
+              array_ops.placeholder(dtypes.float32, shape=[32, 20, 20, 3]),
               ksize=[1, 21, 20, 1],
               strides=[1, 1, 1, 1],
               padding="VALID")
diff --git a/tensorflow/python/kernel_tests/py_func_test.py b/tensorflow/python/kernel_tests/py_func_test.py
index 92fb68820e04c3db1385296d91d956134b8ff2d4..c7181497d891f6d35a788c90bf59a0ce5a536328 100644
--- a/tensorflow/python/kernel_tests/py_func_test.py
+++ b/tensorflow/python/kernel_tests/py_func_test.py
@@ -396,66 +396,66 @@ class PyFuncTest(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes()
   def testEagerSingleOutputFloat32(self):
-    a = array_ops.ones((3, 3), dtype=dtypes.float32)
-    x = array_ops.ones((3, 1), dtype=dtypes.float32)
-    output = script_ops.eager_py_func(matmul, inp=[a, x], Tout=dtypes.float32)
-    with self.test_session():
+    with test_util.device(use_gpu=True):
+      a = array_ops.ones((3, 3), dtype=dtypes.float32)
+      x = array_ops.ones((3, 1), dtype=dtypes.float32)
+      output = script_ops.eager_py_func(matmul, inp=[a, x], Tout=dtypes.float32)
       ret = self.evaluate(output)
       self.assertAllClose(ret, [[3.0], [3.0], [3.0]])
 
   @test_util.run_in_graph_and_eager_modes()
   def testEagerArrayOutput(self):
-    a = array_ops.ones((3, 3), dtype=dtypes.int32)
-    x = array_ops.ones((3, 1), dtype=dtypes.int32)
-    output = script_ops.eager_py_func(
-        lambda a, x: [matmul(a, x)], inp=[a, x], Tout=[dtypes.int32])
-
-    with self.test_session():
+    with test_util.device(use_gpu=True):
+      a = array_ops.ones((3, 3), dtype=dtypes.float32)
+      x = array_ops.ones((3, 1), dtype=dtypes.float32)
+      output = script_ops.eager_py_func(
+          lambda a, x: [matmul(a, x)], inp=[a, x], Tout=[dtypes.float32])
       ret = self.evaluate(output)
-      self.assertAllEqual(ret, [[[3], [3], [3]]])
+      self.assertAllEqual(ret, [[[3.0], [3.0], [3.0]]])
 
   @test_util.run_in_graph_and_eager_modes()
   def testEagerReturnNone(self):
+    with test_util.device(use_gpu=True):
+      def no_return_value():
+        return
 
-    def no_return_value():
-      return
-
-    output = script_ops.eager_py_func(no_return_value, inp=[], Tout=[])
-    ret = self.evaluate(output)
-    if context.in_eager_mode():
-      self.assertEquals(len(ret), 0)
-    else:
-      self.assertIsNone(ret)
+      output = script_ops.eager_py_func(no_return_value, inp=[], Tout=[])
+      ret = self.evaluate(output)
+      if context.in_eager_mode():
+        self.assertEquals(len(ret), 0)
+      else:
+        self.assertIsNone(ret)
 
   @test_util.run_in_graph_and_eager_modes()
   def testEagerPyFuncInDefun(self):
+    with test_util.device(use_gpu=True):
+      def wrapper():
+        a = array_ops.ones((3, 3), dtype=dtypes.float32)
+        x = array_ops.ones((3, 1), dtype=dtypes.float32)
+        return script_ops.eager_py_func(matmul, inp=[a, x], Tout=dtypes.float32)
 
-    def wrapper():
-      a = array_ops.ones((3, 3), dtype=dtypes.int32)
-      x = array_ops.ones((3, 1), dtype=dtypes.int32)
-      return script_ops.eager_py_func(matmul, inp=[a, x], Tout=dtypes.int32)
-
-    wrapped = function.defun(wrapper)
-    ret = self.evaluate(wrapped())
-    self.assertAllEqual(ret, [[3], [3], [3]])
+      wrapped = function.defun(wrapper)
+      ret = self.evaluate(wrapped())
+      self.assertAllEqual(ret, [[3.0], [3.0], [3.0]])
 
   @test_util.run_in_graph_and_eager_modes()
   def testEagerExceptionHandling(self):
-    self._testExceptionHandling(
-        ValueError, errors.InvalidArgumentError, eager=True)
-    self._testExceptionHandling(
-        TypeError, errors.InvalidArgumentError, eager=True)
-    self._testExceptionHandling(
-        StopIteration, errors.OutOfRangeError, eager=True)
-    self._testExceptionHandling(
-        MemoryError, errors.ResourceExhaustedError, eager=True)
-    self._testExceptionHandling(
-        NotImplementedError, errors.UnimplementedError, eager=True)
-
-    class WeirdError(Exception):
-      pass
-
-    self._testExceptionHandling(WeirdError, errors.UnknownError, eager=True)
+    with test_util.device(use_gpu=True):
+      self._testExceptionHandling(
+          ValueError, errors.InvalidArgumentError, eager=True)
+      self._testExceptionHandling(
+          TypeError, errors.InvalidArgumentError, eager=True)
+      self._testExceptionHandling(
+          StopIteration, errors.OutOfRangeError, eager=True)
+      self._testExceptionHandling(
+          MemoryError, errors.ResourceExhaustedError, eager=True)
+      self._testExceptionHandling(
+          NotImplementedError, errors.UnimplementedError, eager=True)
+
+      class WeirdError(Exception):
+        pass
+
+      self._testExceptionHandling(WeirdError, errors.UnknownError, eager=True)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/kernel_tests/random/random_ops_test.py b/tensorflow/python/kernel_tests/random/random_ops_test.py
index 5a2903a4234202c828168b6538baf320b961c776..df37dd98ece57ae7c3835ab63b720b29fc19c975 100644
--- a/tensorflow/python/kernel_tests/random/random_ops_test.py
+++ b/tensorflow/python/kernel_tests/random/random_ops_test.py
@@ -203,7 +203,8 @@ class RandomUniformTest(test.TestCase):
     return func
 
   def testRange(self):
-    for dt in dtypes.float16, dtypes.float32, dtypes.float64, dtypes.int32, dtypes.int64:
+    for dt in (dtypes.float16, dtypes.float32, dtypes.float64, dtypes.int32,
+               dtypes.int64):
       sampler = self._Sampler(1000, minv=-2, maxv=8, dtype=dt, use_gpu=True)
       x = sampler()
       self.assertTrue(-2 <= np.min(x))
@@ -213,7 +214,8 @@ class RandomUniformTest(test.TestCase):
   # to see the same sequence of values. Will catch buggy
   # implementations which uses the same random number seed.
   def testDistinct(self):
-    for dt in dtypes.float16, dtypes.float32, dtypes.float64, dtypes.int32, dtypes.int64:
+    for dt in (dtypes.float16, dtypes.float32, dtypes.float64, dtypes.int32,
+               dtypes.int64):
       maxv = 1.0 if dt.is_floating else 1 << 30
       sampler = self._Sampler(1000, minv=0, maxv=maxv, dtype=dt, use_gpu=True)
       x = sampler()
@@ -251,7 +253,8 @@ class RandomUniformTest(test.TestCase):
   # Checks that the CPU and GPU implementation returns the same results,
   # given the same random seed
   def testCPUGPUMatch(self):
-    for dt in dtypes.float16, dtypes.float32, dtypes.float64, dtypes.int32, dtypes.int64:
+    for dt in (dtypes.float16, dtypes.float32, dtypes.float64, dtypes.int32,
+               dtypes.int64):
       maxv = 1.0 if dt.is_floating else 17
       results = {}
       for use_gpu in False, True:
@@ -261,7 +264,8 @@ class RandomUniformTest(test.TestCase):
       self.assertAllEqual(results[False], results[True])
 
   def testSeed(self):
-    for dt in dtypes.float16, dtypes.float32, dtypes.float64, dtypes.int32, dtypes.int64:
+    for dt in (dtypes.float16, dtypes.float32, dtypes.float64, dtypes.int32,
+               dtypes.int64):
       for seed in [345, 2**100, -2**100]:
         sx = self._Sampler(1000, 0, 17, dtype=dt, use_gpu=True, seed=seed)
         sy = self._Sampler(1000, 0, 17, dtype=dt, use_gpu=True, seed=seed)
@@ -285,8 +289,7 @@ class RandomShapeTest(test.TestCase):
     self.assertEqual([1, 2, 3], rnd1.get_shape())
     # Partially known shape.
     rnd2 = random_ops.truncated_normal(
-        array_ops.placeholder(
-            dtypes.int32, shape=(3,)))
+        array_ops.placeholder(dtypes.int32, shape=(3,)))
     self.assertEqual([None, None, None], rnd2.get_shape().as_list())
     # Unknown shape.
     rnd3 = random_ops.truncated_normal(array_ops.placeholder(dtypes.int32))
@@ -298,8 +301,7 @@ class RandomShapeTest(test.TestCase):
     self.assertEqual([1, 2, 3], rnd1.get_shape())
     # Partially known shape.
     rnd2 = random_ops.random_normal(
-        array_ops.placeholder(
-            dtypes.int32, shape=(3,)))
+        array_ops.placeholder(dtypes.int32, shape=(3,)))
     self.assertEqual([None, None, None], rnd2.get_shape().as_list())
     # Unknown shape.
     rnd3 = random_ops.random_normal(array_ops.placeholder(dtypes.int32))
@@ -311,8 +313,7 @@ class RandomShapeTest(test.TestCase):
     self.assertEqual([1, 2, 3], rnd1.get_shape())
     # Partially known shape.
     rnd2 = random_ops.random_uniform(
-        array_ops.placeholder(
-            dtypes.int32, shape=(3,)))
+        array_ops.placeholder(dtypes.int32, shape=(3,)))
     self.assertEqual([None, None, None], rnd2.get_shape().as_list())
     # Unknown shape.
     rnd3 = random_ops.random_uniform(array_ops.placeholder(dtypes.int32))
diff --git a/tensorflow/python/kernel_tests/reader_ops_test.py b/tensorflow/python/kernel_tests/reader_ops_test.py
index 223a4b2c8726d957f014e65ea9f87c0fb61e65bb..82a27eebeef16c9dacaf1b900f0398a56533cd2d 100644
--- a/tensorflow/python/kernel_tests/reader_ops_test.py
+++ b/tensorflow/python/kernel_tests/reader_ops_test.py
@@ -428,7 +428,7 @@ class FixedLengthRecordReaderTest(test.TestCase):
     for i in range(self._num_files):
       fn = os.path.join(self.get_temp_dir(), "fixed_length_record.%d.txt" % i)
       filenames.append(fn)
-      with open(fn+".tmp", "wb") as f:
+      with open(fn + ".tmp", "wb") as f:
         f.write(b"H" * self._header_bytes)
         if num_records > 0:
           f.write(self._Record(i, 0))
@@ -437,7 +437,7 @@ class FixedLengthRecordReaderTest(test.TestCase):
             f.write(b"G" * gap_bytes)
           f.write(self._Record(i, j))
         f.write(b"F" * self._footer_bytes)
-      with open(fn+".tmp", "rb") as f:
+      with open(fn + ".tmp", "rb") as f:
         cdata = zlib.compress(f.read())
         with open(fn, "wb") as zf:
           zf.write(cdata)
@@ -455,7 +455,7 @@ class FixedLengthRecordReaderTest(test.TestCase):
           all_records_str = "".join([
               str(i)[0]
               for i in range(self._record_bytes + self._hop_bytes *
-                           (num_overlapped_records - 1))
+                             (num_overlapped_records - 1))
           ])
           f.write(compat.as_bytes(all_records_str))
         f.write(b"F" * self._footer_bytes)
@@ -467,7 +467,7 @@ class FixedLengthRecordReaderTest(test.TestCase):
       fn = os.path.join(self.get_temp_dir(),
                         "fixed_length_overlapped_record.%d.txt" % i)
       filenames.append(fn)
-      with open(fn+".tmp", "wb") as f:
+      with open(fn + ".tmp", "wb") as f:
         f.write(b"H" * self._header_bytes)
         if num_overlapped_records > 0:
           all_records_str = "".join([
@@ -477,7 +477,7 @@ class FixedLengthRecordReaderTest(test.TestCase):
           ])
           f.write(compat.as_bytes(all_records_str))
         f.write(b"F" * self._footer_bytes)
-      with open(fn+".tmp", "rb") as f:
+      with open(fn + ".tmp", "rb") as f:
         cdata = zlib.compress(f.read())
         with open(fn, "wb") as zf:
           zf.write(cdata)
@@ -509,7 +509,10 @@ class FixedLengthRecordReaderTest(test.TestCase):
                                     "\\(requested 1, current size 0\\)"):
         k, v = sess.run([key, value])
 
-  def _TestOneEpochWithHopBytes(self, files, num_overlapped_records, encoding=None):
+  def _TestOneEpochWithHopBytes(self,
+                                files,
+                                num_overlapped_records,
+                                encoding=None):
     with self.test_session() as sess:
       reader = io_ops.FixedLengthRecordReader(
           header_bytes=self._header_bytes,
@@ -565,13 +568,15 @@ class FixedLengthRecordReaderTest(test.TestCase):
 
   def testGzipOneEpochWithHopBytes(self):
     for num_overlapped_records in [0, 2]:
-      files = self._CreateGzipOverlappedRecordFiles(num_overlapped_records, )
-      self._TestOneEpochWithHopBytes(files, num_overlapped_records, encoding="GZIP")
+      files = self._CreateGzipOverlappedRecordFiles(num_overlapped_records,)
+      self._TestOneEpochWithHopBytes(
+          files, num_overlapped_records, encoding="GZIP")
 
   def testZlibOneEpochWithHopBytes(self):
     for num_overlapped_records in [0, 2]:
       files = self._CreateZlibOverlappedRecordFiles(num_overlapped_records)
-      self._TestOneEpochWithHopBytes(files, num_overlapped_records, encoding="ZLIB")
+      self._TestOneEpochWithHopBytes(
+          files, num_overlapped_records, encoding="ZLIB")
 
 
 class TFRecordReaderTest(test.TestCase):
diff --git a/tensorflow/python/kernel_tests/relu_op_test.py b/tensorflow/python/kernel_tests/relu_op_test.py
index dd11ba700d518ab230c1160d17f4cc0833a79198..6b4091ae5d3c6e469a9cd5237b978eae4c75485f 100644
--- a/tensorflow/python/kernel_tests/relu_op_test.py
+++ b/tensorflow/python/kernel_tests/relu_op_test.py
@@ -48,8 +48,8 @@ class ReluTest(test.TestCase):
     self.assertAllClose(
         np.array([[0.0, 0.7, 0.0, 0.3, 0.0], [0.1, 0.0, 0.5, 0.0, 0.9]]),
         self._npRelu(
-            np.array([[-0.9, 0.7, -0.5, 0.3, -0.1], [0.1, -0.3, 0.5, -0.7, 0.9]
-                     ])))
+            np.array([[-0.9, 0.7, -0.5, 0.3, -0.1], [0.1, -0.3, 0.5, -0.7,
+                                                     0.9]])))
 
   def _testRelu(self, np_features, use_gpu=False):
     np_relu = self._npRelu(np_features)
@@ -163,8 +163,8 @@ class Relu6Test(test.TestCase):
     self.assertAllClose(
         np.array([[0.0, 0.7, 0.0, 0.3, 6.0], [0.1, 0.0, 6.0, 0.0, 0.9]]),
         self._npRelu6(
-            np.array([[-0.9, 0.7, -0.5, 0.3, 6.0], [0.1, -0.3, 6.5, -0.7, 0.9]
-                     ])))
+            np.array([[-0.9, 0.7, -0.5, 0.3, 6.0], [0.1, -0.3, 6.5, -0.7,
+                                                    0.9]])))
 
   def _testRelu6(self, np_features, use_gpu=False):
     np_relu6 = self._npRelu6(np_features)
@@ -231,8 +231,8 @@ class EluTest(test.TestCase):
         np.array([[-0.59343034025, 0.7, -0.39346934028, 0.3, -0.09516258196],
                   [0.1, -0.25918177931, 0.5, -0.5034146962, 0.9]]),
         self._npElu(
-            np.array([[-0.9, 0.7, -0.5, 0.3, -0.1], [0.1, -0.3, 0.5, -0.7, 0.9]
-                     ])))
+            np.array([[-0.9, 0.7, -0.5, 0.3, -0.1], [0.1, -0.3, 0.5, -0.7,
+                                                     0.9]])))
 
   def _testElu(self, np_features, use_gpu=False):
     np_elu = self._npElu(np_features)
@@ -330,11 +330,11 @@ class SeluTest(test.TestCase):
 
   def testNpSelu(self):
     self.assertAllClose(
-        np.array([[-1.0433095, 0.73549069, -0.6917582, 0.3152103 , -0.16730527],
-                 [0.1050701 , -0.45566732, 0.5253505, -0.88505305, 0.9456309]]),
+        np.array([[-1.0433095, 0.73549069, -0.6917582, 0.3152103, -0.16730527],
+                  [0.1050701, -0.45566732, 0.5253505, -0.88505305, 0.9456309]]),
         self._npSelu(
-            np.array([[-0.9, 0.7, -0.5, 0.3, -0.1], [0.1, -0.3, 0.5, -0.7, 0.9]
-                     ])))
+            np.array([[-0.9, 0.7, -0.5, 0.3, -0.1], [0.1, -0.3, 0.5, -0.7,
+                                                     0.9]])))
 
   def _testSelu(self, np_features, use_gpu=False):
     np_selu = self._npSelu(np_features)
diff --git a/tensorflow/python/kernel_tests/resource_variable_ops_test.py b/tensorflow/python/kernel_tests/resource_variable_ops_test.py
index 7b131a5b8ca46cc205ec29d5a48cd704b1c67b04..b4b555591d054226210eb6af20036967b240928f 100644
--- a/tensorflow/python/kernel_tests/resource_variable_ops_test.py
+++ b/tensorflow/python/kernel_tests/resource_variable_ops_test.py
@@ -38,6 +38,7 @@ from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 
 
+@test_util.with_c_api
 class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
 
   def tearDown(self):
@@ -342,14 +343,14 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
       v = resource_variable_ops.ResourceVariable(
           2.0, caching_device="/job:localhost")
       self.assertEqual("/job:localhost", v.value().device)
-      with self.assertRaisesRegexp(ValueError, "No attr named '_class'"):
+      with self.assertRaises(ValueError):
         _ = v.value().op.get_attr("_class")
 
     with ops.colocate_with(v.op):
       w = resource_variable_ops.ResourceVariable(
           2.0, caching_device="/job:localhost")
       self.assertEqual("/job:localhost", w.value().device)
-      with self.assertRaisesRegexp(ValueError, "No attr named '_class'"):
+      with self.assertRaises(ValueError):
         _ = w.value().op.get_attr("_class")
 
   def testSharedName(self):
diff --git a/tensorflow/python/kernel_tests/rnn_test.py b/tensorflow/python/kernel_tests/rnn_test.py
index 0c77d1db921566000c2a52e6ddb9d3dddd9b193c..a86b65affec9e491fec13577ad6dc3db610df797 100644
--- a/tensorflow/python/kernel_tests/rnn_test.py
+++ b/tensorflow/python/kernel_tests/rnn_test.py
@@ -23,6 +23,7 @@ import timeit
 
 import numpy as np
 
+from six.moves import xrange
 from tensorflow.contrib import rnn as contrib_rnn
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session
diff --git a/tensorflow/python/kernel_tests/scalar_test.py b/tensorflow/python/kernel_tests/scalar_test.py
index b34426cc21590d585bf7ef7b24b778adbf0cd084..e65241981eac2d42207c1de7a261f7936e588f2a 100644
--- a/tensorflow/python/kernel_tests/scalar_test.py
+++ b/tensorflow/python/kernel_tests/scalar_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_io_ops
 from tensorflow.python.ops import math_ops
@@ -30,6 +31,7 @@ import tensorflow.python.ops.nn_grad  # pylint: disable=unused-import
 from tensorflow.python.platform import test
 
 
+@test_util.with_c_api
 class ScalarTest(test.TestCase):
 
   def check(self, op, args, error, correct=None):
@@ -51,7 +53,7 @@ class ScalarTest(test.TestCase):
     # Test various GraphDef versions
     for version in strict + lenient:
       with ops.Graph().as_default() as g:
-        g.graph_def_versions.producer = version
+        test_util.set_producer_version(g, version)
         with self.test_session(graph=g) as sess:
           feed = {}
           xs = placeholders(args, feed)
diff --git a/tensorflow/python/kernel_tests/softmax_op_test.py b/tensorflow/python/kernel_tests/softmax_op_test.py
index be72c1940723ea9f1e22a3b81d2b34ad67a57f4f..bb3f6970e4f18ce174062e2fce488af738b93a50 100644
--- a/tensorflow/python/kernel_tests/softmax_op_test.py
+++ b/tensorflow/python/kernel_tests/softmax_op_test.py
@@ -25,11 +25,13 @@ import numpy as np
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.platform import test
 
 
+@test_util.with_c_api
 class SoftmaxTest(test.TestCase):
 
   def _npSoftmax(self, features, dim=-1, log=False):
@@ -174,8 +176,11 @@ class SoftmaxTest(test.TestCase):
 
   def testDimTooLarge(self):
     with self.test_session():
+      # Use placeholder to make sure we get runtime error instead of shape
+      # inference error.
+      dim = array_ops.placeholder_with_default(100, shape=[])
       with self.assertRaises(errors_impl.InvalidArgumentError):
-        nn_ops.softmax([1., 2., 3., 4.], dim=100).eval()
+        nn_ops.softmax([1., 2., 3., 4.], dim=dim).eval()
 
   def testLargeDims(self):
     # Make sure that we properly handle large inputs. See
diff --git a/tensorflow/python/kernel_tests/sparse_slice_op_test.py b/tensorflow/python/kernel_tests/sparse_slice_op_test.py
index 762e400447c7e6e89ca4c0b480662aa91e287c26..da116601f833cc6b471e383e030c5fbe93b52ac5 100644
--- a/tensorflow/python/kernel_tests/sparse_slice_op_test.py
+++ b/tensorflow/python/kernel_tests/sparse_slice_op_test.py
@@ -32,11 +32,12 @@ class SparseSliceOpTest(test.TestCase):
     # [  |11|  |13|14|  ]
     # [20|  |  |23|  |25]
     # [30|  |32|33|  |35]
-    ind = np.array([[0, 0], [0, 2], [0, 4], [0, 5], [1, 1], [1, 3], [1, 4],
-                    [2, 0], [2, 3], [2, 5], [3, 0], [3, 2], [3, 3],
-                    [3, 5]]).astype(np.int64)
-    val = np.array(
-        [0, 2, 4, 5, 11, 13, 14, 20, 23, 25, 30, 32, 33, 35]).astype(np.int64)
+    ind = np.array([[0, 0], [0, 2], [0, 4], [0, 5], [1, 1], [1, 3], [1,
+                                                                     4], [2, 0],
+                    [2, 3], [2, 5], [3, 0], [3, 2], [3, 3], [3, 5]]).astype(
+                        np.int64)
+    val = np.array([0, 2, 4, 5, 11, 13, 14, 20, 23, 25, 30, 32, 33, 35]).astype(
+        np.int64)
     shape = np.array([4, 6]).astype(np.int64)
     return sparse_tensor.SparseTensor(ind, val, shape)
 
@@ -65,50 +66,49 @@ class SparseSliceOpTest(test.TestCase):
     #  [    |'c1'|    |'d1']
     #  [    |    |'e1'|    ]
     ind = np.array([[0, 0, 0], [0, 0, 1], [0, 2, 0], [0, 2, 1], [1, 1, 0],
-                    [1, 1, 1], [1, 3, 0], [1, 3, 1], [2, 2, 0],
-                    [2, 2, 1]]).astype(np.int64)
+                    [1, 1, 1], [1, 3, 0], [1, 3, 1], [2, 2, 0], [2, 2,
+                                                                 1]]).astype(
+                                                                     np.int64)
     val = np.array(['a0', 'a1', 'b0', 'b1', 'c0', 'c1', 'd0', 'd1', 'e0', 'e1'])
     shape = np.array([3, 4, 2]).astype(np.int64)
     return sparse_tensor.SparseTensorValue(ind, val, shape)
 
   def _SparseTensor_3x4x2(self):
-    return sparse_tensor.SparseTensor.from_value(self._SparseTensorValue_3x4x2(
-    ))
+    return sparse_tensor.SparseTensor.from_value(
+        self._SparseTensorValue_3x4x2())
 
   def testSliceMatrixRows(self):
     with self.test_session(use_gpu=False):
-      sp_input=self._SparseTensor_4x6()
+      sp_input = self._SparseTensor_4x6()
       sp_tensor0 = sparse_ops.sparse_slice(sp_input, [0, 0], [2, 6])
       sp_tensor1 = sparse_ops.sparse_slice(sp_input, [2, 0], [3, 7])
-      self.assertAllEqual(sp_tensor0.indices.eval(), [[0, 0], [0, 2], [0, 4],
-                                                      [0, 5], [1, 1], [1, 3],
-                                                      [1, 4]])
+      self.assertAllEqual(
+          sp_tensor0.indices.eval(),
+          [[0, 0], [0, 2], [0, 4], [0, 5], [1, 1], [1, 3], [1, 4]])
       self.assertAllEqual(sp_tensor0.values.eval(), [0, 2, 4, 5, 11, 13, 14])
       self.assertAllEqual(sp_tensor0.dense_shape.eval(), [2, 6])
-      self.assertAllEqual(sp_tensor1.indices.eval(), [[0, 0], [0, 3], [0, 5],
-                                                      [1, 0], [1, 2], [1, 3],
-                                                      [1, 5]])
+      self.assertAllEqual(
+          sp_tensor1.indices.eval(),
+          [[0, 0], [0, 3], [0, 5], [1, 0], [1, 2], [1, 3], [1, 5]])
       self.assertAllEqual(sp_tensor1.values.eval(),
                           [20, 23, 25, 30, 32, 33, 35])
       self.assertAllEqual(sp_tensor1.dense_shape.eval(), [2, 6])
 
   def testSliceMatrixUnevenCols(self):
     with self.test_session(use_gpu=False):
-      sp_input=self._SparseTensor_5x7()
+      sp_input = self._SparseTensor_5x7()
       sp_tensor0 = sparse_ops.sparse_slice(sp_input, [0, 0], [5, 3])
       sp_tensor1 = sparse_ops.sparse_slice(sp_input, [0, 3], [5, 2])
       sp_tensor2 = sparse_ops.sparse_slice(sp_input, [0, 5], [5, 2])
 
-      self.assertAllEqual(sp_tensor0.indices.eval(),
-                          [[0, 0], [0, 2], [1, 1], [2, 0], [3, 0], [3, 2],
-                           [4, 1]])
-      self.assertAllEqual(sp_tensor0.values.eval(),
-                          [0, 2, 11, 20, 30, 32, 41])
+      self.assertAllEqual(
+          sp_tensor0.indices.eval(),
+          [[0, 0], [0, 2], [1, 1], [2, 0], [3, 0], [3, 2], [4, 1]])
+      self.assertAllEqual(sp_tensor0.values.eval(), [0, 2, 11, 20, 30, 32, 41])
       self.assertAllEqual(sp_tensor0.dense_shape.eval(), [5, 3])
       self.assertAllEqual(sp_tensor1.indices.eval(),
                           [[0, 1], [1, 0], [1, 1], [2, 0], [3, 0], [4, 1]])
-      self.assertAllEqual(sp_tensor1.values.eval(),
-                          [4, 13, 14, 23, 33, 44])
+      self.assertAllEqual(sp_tensor1.values.eval(), [4, 13, 14, 23, 33, 44])
       self.assertAllEqual(sp_tensor1.dense_shape.eval(), [5, 2])
       self.assertAllEqual(sp_tensor2.indices.eval(),
                           [[0, 0], [1, 1], [2, 0], [3, 0], [4, 1]])
@@ -137,7 +137,7 @@ class SparseSliceOpTest(test.TestCase):
 
   def testSliceMatrixUnevenRows(self):
     with self.test_session(use_gpu=False):
-      sp_input=self._SparseTensor_5x7()
+      sp_input = self._SparseTensor_5x7()
       sp_tensor0 = sparse_ops.sparse_slice(sp_input, [0, 0], [3, 7])
       sp_tensor1 = sparse_ops.sparse_slice(sp_input, [3, 0], [3, 7])
       self.assertAllEqual(sp_tensor0.indices.eval(),
@@ -146,9 +146,9 @@ class SparseSliceOpTest(test.TestCase):
       self.assertAllEqual(sp_tensor0.values.eval(),
                           [0, 2, 4, 5, 11, 13, 14, 16, 20, 23, 25])
       self.assertAllEqual(sp_tensor0.dense_shape.eval(), [3, 7])
-      self.assertAllEqual(sp_tensor1.indices.eval(),
-                          [[0, 0], [0, 2], [0, 3], [0, 5], [1, 1], [1, 4],
-                           [1, 6]])
+      self.assertAllEqual(
+          sp_tensor1.indices.eval(),
+          [[0, 0], [0, 2], [0, 3], [0, 5], [1, 1], [1, 4], [1, 6]])
       self.assertAllEqual(sp_tensor1.values.eval(),
                           [30, 32, 33, 35, 41, 44, 46])
       self.assertAllEqual(sp_tensor1.dense_shape.eval(), [2, 7])
@@ -156,9 +156,9 @@ class SparseSliceOpTest(test.TestCase):
       sp_tensor0 = sparse_ops.sparse_slice(sp_input, [0, 0], [2, 7])
       sp_tensor1 = sparse_ops.sparse_slice(sp_input, [2, 0], [2, 7])
       sp_tensor2 = sparse_ops.sparse_slice(sp_input, [4, 0], [2, 7])
-      self.assertAllEqual(sp_tensor0.indices.eval(),
-                          [[0, 0], [0, 2], [0, 4], [0, 5], [1, 1], [1, 3],
-                           [1, 4], [1, 6]])
+      self.assertAllEqual(
+          sp_tensor0.indices.eval(),
+          [[0, 0], [0, 2], [0, 4], [0, 5], [1, 1], [1, 3], [1, 4], [1, 6]])
       self.assertAllEqual(sp_tensor0.values.eval(),
                           [0, 2, 4, 5, 11, 13, 14, 16])
       self.assertAllEqual(sp_tensor0.dense_shape.eval(), [2, 7])
@@ -166,45 +166,42 @@ class SparseSliceOpTest(test.TestCase):
       self.assertAllEqual(sp_tensor1.values.eval(),
                           [20, 23, 25, 30, 32, 33, 35])
       self.assertAllEqual(sp_tensor1.dense_shape.eval(), [2, 7])
-      self.assertAllEqual(sp_tensor2.indices.eval(), [[0, 1], [0, 4],
-                                                           [0, 6]])
+      self.assertAllEqual(sp_tensor2.indices.eval(), [[0, 1], [0, 4], [0, 6]])
       self.assertAllEqual(sp_tensor2.values.eval(), [41, 44, 46])
       self.assertAllEqual(sp_tensor2.dense_shape.eval(), [1, 7])
     return
 
   def testSliceAllRows(self):
     with self.test_session(use_gpu=False):
-      sp_input=self._SparseTensor_4x6()
+      sp_input = self._SparseTensor_4x6()
       sp_tensor0 = sparse_ops.sparse_slice(sp_input, [0, 0], [1, 6])
       sp_tensor1 = sparse_ops.sparse_slice(sp_input, [1, 0], [1, 6])
       sp_tensor2 = sparse_ops.sparse_slice(sp_input, [2, 0], [1, 7])
       sp_tensor3 = sparse_ops.sparse_slice(sp_input, [3, 0], [2, 7])
-      self.assertAllEqual(sp_tensor0.indices.eval(), [[0, 0], [0, 2], [0, 4],
-                                                         [0, 5]])
+      self.assertAllEqual(sp_tensor0.indices.eval(),
+                          [[0, 0], [0, 2], [0, 4], [0, 5]])
       self.assertAllEqual(sp_tensor0.values.eval(), [0, 2, 4, 5])
       self.assertAllEqual(sp_tensor0.dense_shape.eval(), [1, 6])
-      self.assertAllEqual(sp_tensor1.indices.eval(), [[0, 1], [0, 3], [0,
-                                                                          4]])
+      self.assertAllEqual(sp_tensor1.indices.eval(), [[0, 1], [0, 3], [0, 4]])
       self.assertAllEqual(sp_tensor1.values.eval(), [11, 13, 14])
       self.assertAllEqual(sp_tensor1.dense_shape.eval(), [1, 6])
-      self.assertAllEqual(sp_tensor2.indices.eval(), [[0, 0], [0, 3], [0,
-                                                                          5]])
+      self.assertAllEqual(sp_tensor2.indices.eval(), [[0, 0], [0, 3], [0, 5]])
       self.assertAllEqual(sp_tensor2.values.eval(), [20, 23, 25])
       self.assertAllEqual(sp_tensor2.dense_shape.eval(), [1, 6])
-      self.assertAllEqual(sp_tensor3.indices.eval(), [[0, 0], [0, 2], [0, 3],
-                                                         [0, 5]])
+      self.assertAllEqual(sp_tensor3.indices.eval(),
+                          [[0, 0], [0, 2], [0, 3], [0, 5]])
       self.assertAllEqual(sp_tensor3.values.eval(), [30, 32, 33, 35])
       self.assertAllEqual(sp_tensor3.dense_shape.eval(), [1, 6])
 
   def testSliceColumns(self):
     with self.test_session(use_gpu=False):
-      sp_input=self._SparseTensor_4x6()
+      sp_input = self._SparseTensor_4x6()
       sparse_tensor0 = sparse_ops.sparse_slice(sp_input, [0, 0], [4, 2])
       sparse_tensor1 = sparse_ops.sparse_slice(sp_input, [0, 2], [5, 2])
       sparse_tensor2 = sparse_ops.sparse_slice(sp_input, [0, 4], [5, 3])
 
-      self.assertAllEqual(sparse_tensor0.indices.eval(), [[0, 0], [1, 1],
-                                                             [2, 0], [3, 0]])
+      self.assertAllEqual(sparse_tensor0.indices.eval(),
+                          [[0, 0], [1, 1], [2, 0], [3, 0]])
       self.assertAllEqual(sparse_tensor0.values.eval(), [0, 11, 20, 30])
       self.assertAllEqual(sparse_tensor0.dense_shape.eval(), [4, 2])
       self.assertAllEqual(sparse_tensor1.indices.eval(),
@@ -218,15 +215,15 @@ class SparseSliceOpTest(test.TestCase):
 
   def testSliceAllColumns(self):
     with self.test_session(use_gpu=False):
-      sp_input=self._SparseTensor_4x6()
+      sp_input = self._SparseTensor_4x6()
       sparse_tensor0 = sparse_ops.sparse_slice(sp_input, [0, 0], [4, 1])
       sparse_tensor1 = sparse_ops.sparse_slice(sp_input, [0, 1], [4, 1])
       sparse_tensor2 = sparse_ops.sparse_slice(sp_input, [0, 2], [4, 1])
       sparse_tensor3 = sparse_ops.sparse_slice(sp_input, [0, 3], [4, 1])
       sparse_tensor4 = sparse_ops.sparse_slice(sp_input, [0, 4], [5, 1])
       sparse_tensor5 = sparse_ops.sparse_slice(sp_input, [0, 5], [6, 3])
-      self.assertAllEqual(sparse_tensor0.indices.eval(), [[0, 0], [2, 0],
-                                                             [3, 0]])
+      self.assertAllEqual(sparse_tensor0.indices.eval(),
+                          [[0, 0], [2, 0], [3, 0]])
       self.assertAllEqual(sparse_tensor0.values.eval(), [0, 20, 30])
       self.assertAllEqual(sparse_tensor0.dense_shape.eval(), [4, 1])
       self.assertAllEqual(sparse_tensor1.indices.eval(), [[1, 0]])
@@ -235,17 +232,18 @@ class SparseSliceOpTest(test.TestCase):
       self.assertAllEqual(sparse_tensor2.indices.eval(), [[0, 0], [3, 0]])
       self.assertAllEqual(sparse_tensor2.values.eval(), [2, 32])
       self.assertAllEqual(sparse_tensor2.dense_shape.eval(), [4, 1])
-      self.assertAllEqual(sparse_tensor3.indices.eval(), [[1, 0], [2, 0],
-                                                             [3, 0]])
+      self.assertAllEqual(sparse_tensor3.indices.eval(),
+                          [[1, 0], [2, 0], [3, 0]])
       self.assertAllEqual(sparse_tensor3.dense_shape.eval(), [4, 1])
       self.assertAllEqual(sparse_tensor3.values.eval(), [13, 23, 33])
       self.assertAllEqual(sparse_tensor4.indices.eval(), [[0, 0], [1, 0]])
       self.assertAllEqual(sparse_tensor4.values.eval(), [4, 14])
       self.assertAllEqual(sparse_tensor4.dense_shape.eval(), [4, 1])
-      self.assertAllEqual(sparse_tensor5.indices.eval(), [[0, 0], [2, 0],
-                                                             [3, 0]])
+      self.assertAllEqual(sparse_tensor5.indices.eval(),
+                          [[0, 0], [2, 0], [3, 0]])
       self.assertAllEqual(sparse_tensor5.values.eval(), [5, 25, 35])
       self.assertAllEqual(sparse_tensor5.dense_shape.eval(), [4, 1])
 
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/kernel_tests/stage_op_test.py b/tensorflow/python/kernel_tests/stage_op_test.py
index 64b3388c5c0fd16436fa77ac5d8d0e8f9a859c32..dd06d303912813733886b9cf20590513760e67f1 100644
--- a/tensorflow/python/kernel_tests/stage_op_test.py
+++ b/tensorflow/python/kernel_tests/stage_op_test.py
@@ -25,8 +25,8 @@ from tensorflow.python.platform import test
 
 TIMEOUT = 1
 
-class StageTest(test.TestCase):
 
+class StageTest(test.TestCase):
 
   def testSimple(self):
     with ops.Graph().as_default() as G:
@@ -116,7 +116,10 @@ class StageTest(test.TestCase):
         x = array_ops.placeholder(dtypes.int32, name='x')
         p = array_ops.placeholder(dtypes.int32, name='p')
       with ops.device(test.gpu_device_name()):
-        stager = data_flow_ops.StagingArea([dtypes.int32, ], shapes=[[]])
+        stager = data_flow_ops.StagingArea(
+            [
+                dtypes.int32,
+            ], shapes=[[]])
         stage = stager.put([x])
         peek = stager.peek(p)
         ret = stager.get()
@@ -162,8 +165,10 @@ class StageTest(test.TestCase):
       with ops.device('/cpu:0'):
         x = array_ops.placeholder(dtypes.int32, name='x')
       with ops.device(test.gpu_device_name()):
-        stager = data_flow_ops.StagingArea([dtypes.int32, ],
-          capacity=capacity, shapes=[[]])
+        stager = data_flow_ops.StagingArea(
+            [
+                dtypes.int32,
+            ], capacity=capacity, shapes=[[]])
         stage = stager.put([x])
         ret = stager.get()
         size = stager.size()
@@ -201,9 +206,8 @@ class StageTest(test.TestCase):
         self.fail("Expected to timeout on iteration '{}' "
                   "but instead timed out on iteration '{}' "
                   "Staging Area size is '{}' and configured "
-                  "capacity is '{}'.".format(capacity, i,
-                                            sess.run(size),
-                                            capacity))
+                  "capacity is '{}'.".format(capacity, i, sess.run(size),
+                                             capacity))
 
       # Should have capacity elements in the staging area
       self.assertTrue(sess.run(size) == capacity)
@@ -216,16 +220,18 @@ class StageTest(test.TestCase):
       self.assertTrue(sess.run(size) == 0)
 
   def testMemoryLimit(self):
-    memory_limit = 512*1024  # 512K
-    chunk = 200*1024 # 256K
+    memory_limit = 512 * 1024  # 512K
+    chunk = 200 * 1024  # 256K
     capacity = memory_limit // chunk
 
     with ops.Graph().as_default() as G:
       with ops.device('/cpu:0'):
         x = array_ops.placeholder(dtypes.uint8, name='x')
       with ops.device(test.gpu_device_name()):
-        stager = data_flow_ops.StagingArea([dtypes.uint8, ],
-          memory_limit=memory_limit, shapes=[[]])
+        stager = data_flow_ops.StagingArea(
+            [
+                dtypes.uint8,
+            ], memory_limit=memory_limit, shapes=[[]])
         stage = stager.put([x])
         ret = stager.get()
         size = stager.size()
@@ -264,9 +270,8 @@ class StageTest(test.TestCase):
         self.fail("Expected to timeout on iteration '{}' "
                   "but instead timed out on iteration '{}' "
                   "Staging Area size is '{}' and configured "
-                  "capacity is '{}'.".format(capacity, i,
-                                            sess.run(size),
-                                            capacity))
+                  "capacity is '{}'.".format(capacity, i, sess.run(size),
+                                             capacity))
 
       # Should have capacity elements in the staging area
       self.assertTrue(sess.run(size) == capacity)
@@ -277,5 +282,6 @@ class StageTest(test.TestCase):
 
       self.assertTrue(sess.run(size) == 0)
 
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/kernel_tests/tensordot_op_test.py b/tensorflow/python/kernel_tests/tensordot_op_test.py
index 38205518b528b44313b1de83d06707b4498f061d..8ad29afd0a0f2e7fbaaf2bde956326e578466b1d 100644
--- a/tensorflow/python/kernel_tests/tensordot_op_test.py
+++ b/tensorflow/python/kernel_tests/tensordot_op_test.py
@@ -56,9 +56,11 @@ class TensordotTest(test_lib.TestCase):
         axes_ph = array_ops.placeholder(dtypes.int32)
         output = math_ops.tensordot(a_ph, b_ph, axes_ph)
         _ = sess.run(
-            [output], feed_dict={a_ph: a,
-                                 b_ph: b,
-                                 axes_ph: (a_axes, b_axes)})
+            [output], feed_dict={
+                a_ph: a,
+                b_ph: b,
+                axes_ph: (a_axes, b_axes)
+            })
 
   def test_invalid_axes(self):
     a = [[1, 2], [3, 4]]
@@ -81,28 +83,29 @@ class TensordotTest(test_lib.TestCase):
       with self.test_session() as sess:
         with self.assertRaises(errors_impl.InvalidArgumentError):
           _ = sess.run(
-              [output], feed_dict={a_ph: a,
-                                   b_ph: b,
-                                   axes_ph: axes_value})
+              [output], feed_dict={
+                  a_ph: a,
+                  b_ph: b,
+                  axes_ph: axes_value
+              })
 
   # Test case for 11950
   def test_valid_axis(self):
     for axes_value in [1, 2], [[1], [2]], [[], []], 0:
       with self.test_session() as sess:
-        np_a = np.ones((3,3))
+        np_a = np.ones((3, 3))
         np_b = np.array([2, 3, 1])[None, None]
         np_ans = np.tensordot(np_a, np_b, axes_value)
 
-        tf_a = array_ops.ones((3,3), dtype=dtypes.float32)
+        tf_a = array_ops.ones((3, 3), dtype=dtypes.float32)
         tf_b = constant_op.constant([2, 3, 1], dtype=dtypes.float32)[None, None]
         tf_ans = math_ops.tensordot(tf_a, tf_b, axes_value).eval()
 
         self.assertAllEqual(tf_ans.shape, np_ans.shape)
         self.assertAllEqual(tf_ans, np_ans)
 
-
   def test_partial_shape_inference(self):
-    for axes in ([1],[0]), 1:
+    for axes in ([1], [0]), 1:
       a = array_ops.placeholder(dtypes.float32)
       b = array_ops.placeholder(dtypes.float32)
       output = math_ops.tensordot(a, b, axes)
@@ -169,9 +172,11 @@ def _get_tensordot_tests(dtype_, rank_a_, rank_b_, num_dims_, dynamic_shape_):
           axes = array_ops.placeholder(dtypes.int32)
           c = math_ops.tensordot(a, b, axes)
           tf_ans = sess.run(
-              c, feed_dict={a: a_np,
-                            b: b_np,
-                            axes: (a_dims_np, b_dims_np)})
+              c, feed_dict={
+                  a: a_np,
+                  b: b_np,
+                  axes: (a_dims_np, b_dims_np)
+              })
         else:
           tf_ans = math_ops.tensordot(a_np, b_np, (a_dims_np, b_dims_np)).eval()
       self.assertAllClose(tf_ans, np_ans, rtol=tol, atol=tol)
diff --git a/tensorflow/python/kernel_tests/topk_op_test.py b/tensorflow/python/kernel_tests/topk_op_test.py
index efb5b9f3641ceaebf1fd5285486b4a9bb93615cf..6ab931fdb97a8945ab610fda27a036693f0291e5 100644
--- a/tensorflow/python/kernel_tests/topk_op_test.py
+++ b/tensorflow/python/kernel_tests/topk_op_test.py
@@ -58,7 +58,7 @@ class TopKTest(test.TestCase):
         # Do some special casing of equality of indices: if indices
         # are not the same, but values are floating type, ensure that
         # the values are within epsilon of each other.
-        if not np.issubdtype(np_expected_values.dtype, np.float):
+        if not np.issubdtype(np_expected_values.dtype, np.floating):
           # Values are not floating point type; check indices exactly
           self.assertAllEqual(np_expected_indices, indices)
         else:
diff --git a/tensorflow/python/kernel_tests/variable_scope_test.py b/tensorflow/python/kernel_tests/variable_scope_test.py
index 238d4b58d500f3f736df96c79bf04b0943837bb5..8527f116f9541942e52ba2ab635ca1212ea38583 100644
--- a/tensorflow/python/kernel_tests/variable_scope_test.py
+++ b/tensorflow/python/kernel_tests/variable_scope_test.py
@@ -131,6 +131,30 @@ class VariableScopeTest(test.TestCase):
       self.assertFalse(v in store.non_trainable_variables())
       self.assertTrue(w in store.non_trainable_variables())
 
+      # Test copying.
+      new_store = store.copy()
+      with new_store.as_default():
+        new_v = variable_scope.get_variable("v")
+        new_w = variable_scope.get_variable("w")
+      self.assertEqual(new_v.numpy(), v.numpy())
+      self.assertEqual(new_w.numpy(), w.numpy())
+      self.assertTrue(new_v in new_store.variables())
+      self.assertTrue(new_w in new_store.variables())
+      self.assertTrue(new_v in new_store.trainable_variables())
+      self.assertFalse(new_w in new_store.trainable_variables())
+      self.assertFalse(new_v in new_store.non_trainable_variables())
+      self.assertTrue(new_w in new_store.non_trainable_variables())
+
+      # Check that variables are separate instances.
+      for v in store.variables():
+        v.assign(-1)
+      for v in new_store.variables():
+        v.assign(1)
+      for v in store.variables():
+        self.assertEqual(v.numpy(), -1)
+      for v in new_store.variables():
+        self.assertEqual(v.numpy(), 1)
+
   @test_util.run_in_graph_and_eager_modes()
   def testInitFromNonTensorValue(self):
     v = variable_scope.get_variable("v4", initializer=4, dtype=dtypes.int32)
diff --git a/tensorflow/python/kernel_tests/variables_test.py b/tensorflow/python/kernel_tests/variables_test.py
index f60ebf58f6fe81bf75fa4db166449843e5595c7d..b16c8c002c98a0351d1fc55fce061695327a18c9 100644
--- a/tensorflow/python/kernel_tests/variables_test.py
+++ b/tensorflow/python/kernel_tests/variables_test.py
@@ -22,6 +22,7 @@ import operator
 
 import numpy as np
 
+from tensorflow.python.eager import function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
@@ -509,6 +510,15 @@ class VariablesTestCase(test.TestCase):
         "<tf.Variable 'noop:0' shape=(5, 5) dtype=float32_ref>",
         repr(var))
 
+  def testVariableNamesPreserveNameScopesWithDefun(self):
+    @function.defun
+    def create_variable():
+      with ops.name_scope("foo"):
+        v = variables.Variable(0.0, name="bar")
+      self.assertEqual(v.name, "foo/bar:0")
+    with ops.get_default_graph().as_default():
+      create_variable()
+
 
 class IsInitializedTest(test.TestCase):
 
diff --git a/tensorflow/python/kernel_tests/xent_op_test.py b/tensorflow/python/kernel_tests/xent_op_test.py
index c6c7c4e26cb5e4eff22d1bb9d3e32c227c1c838f..e152f02d8e983364603053dc5c8d14b5dfaf3605 100644
--- a/tensorflow/python/kernel_tests/xent_op_test.py
+++ b/tensorflow/python/kernel_tests/xent_op_test.py
@@ -38,9 +38,8 @@ class XentTest(test.TestCase):
       dim = len(features.shape) - 1
     one_only_on_dim = list(features.shape)
     one_only_on_dim[dim] = 1
-    e = np.exp(features - np.reshape(
-        np.amax(
-            features, axis=dim), one_only_on_dim))
+    e = np.exp(
+        features - np.reshape(np.amax(features, axis=dim), one_only_on_dim))
     probs = e / np.reshape(np.sum(e, axis=dim), one_only_on_dim)
     bp = (probs - labels)
     l = -np.sum(labels * np.log(probs + 1.0e-20), axis=dim)
@@ -85,10 +84,10 @@ class XentTest(test.TestCase):
 
   def testRankTooLarge(self):
     for dtype in np.float16, np.float32:
-      np_features = np.array(
-          [[[1., 1., 1., 1.]], [[1., 2., 3., 4.]]]).astype(dtype)
-      np_labels = np.array(
-          [[[0., 0., 0., 1.]], [[0., .5, .5, 0.]]]).astype(dtype)
+      np_features = np.array([[[1., 1., 1., 1.]], [[1., 2., 3.,
+                                                    4.]]]).astype(dtype)
+      np_labels = np.array([[[0., 0., 0., 1.]], [[0., .5, .5,
+                                                  0.]]]).astype(dtype)
       self.assertRaisesRegexp(ValueError, "must be rank 2",
                               gen_nn_ops._softmax_cross_entropy_with_logits,
                               np_features, np_labels)
@@ -121,8 +120,8 @@ class XentTest(test.TestCase):
     # = [1.3862, 1.9401]
     np_loss, np_backprop = self._npXent(np.array(features), np.array(labels))
     self.assertAllClose(
-        np.array([[0.25, 0.25, 0.25, -0.75],
-                  [0.0321, -0.4129, -0.2632, 0.6439]]),
+        np.array([[0.25, 0.25, 0.25, -0.75], [0.0321, -0.4129, -0.2632,
+                                              0.6439]]),
         np_backprop,
         rtol=1.e-3,
         atol=1.e-3)
@@ -168,15 +167,17 @@ class XentTest(test.TestCase):
           shape=[3, 4],
           dtype=dtypes.float64,
           name="f")
-      x = nn_ops.softmax_cross_entropy_with_logits(labels=l, logits=f,
-                                                   name="xent")
+      x = nn_ops.softmax_cross_entropy_with_logits(
+          labels=l, logits=f, name="xent")
       err = gradient_checker.compute_gradient_error(f, [3, 4], x, [3])
 
       # Check that no extra computation performed. When only first derivative is requested,
       # second derivative must not be computed. So when there is no second derivative,
       # there is no `BatchMatMul` op in the graph.
-      op_names = [op.op_def.name for op in sess.graph.get_operations() if op.op_def]
-      self.assertNotIn('BatchMatMul', op_names)
+      op_names = [
+          op.op_def.name for op in sess.graph.get_operations() if op.op_def
+      ]
+      self.assertNotIn("BatchMatMul", op_names)
 
     print("cross entropy gradient err = ", err)
     self.assertLess(err, 5e-8)
@@ -193,24 +194,29 @@ class XentTest(test.TestCase):
           shape=[3, 4],
           dtype=dtypes.float64,
           name="f")
-      x = nn_ops.softmax_cross_entropy_with_logits_v2(labels=l, logits=f,
-                                                      name="xent")
+      x = nn_ops.softmax_cross_entropy_with_logits_v2(
+          labels=l, logits=f, name="xent")
       err = gradient_checker.compute_gradient_error(l, [3, 4], x, [3])
 
     self.assertLess(err, 5e-8)
 
   def testSecondGradient(self):
     with self.test_session() as sess:
-      l = constant_op.constant([0.0, 0.0, 1.0/3, 0.0,
-                                1.0/3, 0.0, 0.0, 0.0,
-                                0.0, 0.5/3, 0.0, 0.5/3], shape=[12],
-                               dtype=dtypes.float64, name="l")
-      f = constant_op.constant([0.1, 0.2, 0.3, 0.4,
-                                0.1, 0.4, 0.9, 1.6,
-                                0.1, 0.8, 2.7, 6.4], shape=[12],
-                               dtype=dtypes.float64, name="f")
-      x = nn_ops.softmax_cross_entropy_with_logits(labels=l, logits=f,
-                                                   name="xent")
+      l = constant_op.constant(
+          [
+              0.0, 0.0, 1.0 / 3, 0.0, 1.0 / 3, 0.0, 0.0, 0.0, 0.0, 0.5 / 3, 0.0,
+              0.5 / 3
+          ],
+          shape=[12],
+          dtype=dtypes.float64,
+          name="l")
+      f = constant_op.constant(
+          [0.1, 0.2, 0.3, 0.4, 0.1, 0.4, 0.9, 1.6, 0.1, 0.8, 2.7, 6.4],
+          shape=[12],
+          dtype=dtypes.float64,
+          name="f")
+      x = nn_ops.softmax_cross_entropy_with_logits(
+          labels=l, logits=f, name="xent")
       loss = math_ops.reduce_sum(x)
 
       gradients = gradients_impl.gradients(loss, [f])[0]
@@ -219,20 +225,23 @@ class XentTest(test.TestCase):
 
       # Check that second derivative is calculated.
       # (it is equivalent to being `BatchMatMul` op in the graph because of implementation of xentropy grad)
-      op_names = [op.op_def.name for op in sess.graph.get_operations() if op.op_def]
-      self.assertIn('BatchMatMul', op_names)
+      op_names = [
+          op.op_def.name for op in sess.graph.get_operations() if op.op_def
+      ]
+      self.assertIn("BatchMatMul", op_names)
 
     print("cross entropy hessian err = ", err)
     self.assertLess(err, 5e-8)
 
   def testWrapper(self):
-    features = np.array(
-        [[[1., 1., 1., 1.], [1., 2., 3., 4.]],
-         [[2., 3., 4., 5.], [6., 7., 8., 9.]],
-         [[5., 4., 3., 2.], [1., 2., 3., 4.]]]).astype(np.float32)
+    features = np.array([[[1., 1., 1., 1.], [1., 2., 3., 4.]],
+                         [[2., 3., 4., 5.], [6., 7., 8., 9.]],
+                         [[5., 4., 3., 2.], [1., 2., 3., 4.]]]).astype(
+                             np.float32)
     labels = np.array([[[0., 0., 0., 1.], [0., 1., 0., 0.]],
                        [[0., 0.5, 0.5, 0.], [0.5, 0.5, 0., 0.]],
-                       [[0., 1., 0., 0.], [0., 0., 1., 0.]]]).astype(np.float32)
+                       [[0., 1., 0., 0.], [0., 0., 1., 0.]]]).astype(
+                           np.float32)
     self._testXentWrapper(features, labels, dim=0, use_gpu=False)
     self._testXentWrapper(features, labels, dim=0, use_gpu=True)
     self._testXentWrapper(features, labels, dim=1, use_gpu=False)
diff --git a/tensorflow/python/layers/base.py b/tensorflow/python/layers/base.py
index 00faf3faa1004ddbb310137500dbec0db4a52196..5dea732cbaa43a40f6a1bc4beef729f3b84dad5c 100644
--- a/tensorflow/python/layers/base.py
+++ b/tensorflow/python/layers/base.py
@@ -31,6 +31,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.layers import utils as layers_util
+from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.ops import variables as tf_variables
@@ -99,8 +100,16 @@ class Layer(object):
         raise TypeError('Keyword argument not understood:', kwarg)
 
     # Mutable properties
+    # Indicates whether the layer's weights are updated during training
+    # and whether the layer's updates are run during training
     self.trainable = trainable
+    # A stateful layer is a layer whose updates are run during inference too,
+    # for instance stateful RNNs.
+    self.stateful = False
+    # Indicates whether `build` needs to be called upon layer call, to create
+    # the layer's weights.
     self.built = False
+    # Provides information about which inputs are compatible with the layer.
     self.input_spec = None
 
     if activity_regularizer and context.in_eager_mode():
@@ -131,9 +140,6 @@ class Layer(object):
 
     self._init_set_name(name)
 
-    # Holds functions for creating regularizer ops.
-    self._regularizer_factories = []
-
     # Determine variable scope.
     scope = kwargs.get('_scope')
     if scope:
@@ -223,6 +229,8 @@ class Layer(object):
   def updates(self):
     if context.in_eager_mode():
       raise RuntimeError('Layer.updates not supported in Eager mode.')
+    if not self.trainable and not self.stateful:
+      return []
     return self._updates
 
   def add_update(self, updates, inputs=None):
@@ -284,6 +292,8 @@ class Layer(object):
     """
     if context.in_eager_mode():
       raise RuntimeError('Layer.get_updates_for not supported in Eager mode.')
+    if not self.trainable and not self.stateful:
+      return []
     if inputs is not None:
       inputs = nest.flatten(inputs)
     if not inputs:
@@ -294,22 +304,6 @@ class Layer(object):
       inputs_hash = None
     return self._per_input_updates.get(inputs_hash, [])
 
-  def _get_regularizer_factories(self):
-    try:
-      # Some subclasses of Layer do not use its constructor.
-      return self._regularizer_factories
-    except AttributeError:
-      self._regularizer_factories = []
-      return self._regularizer_factories
-
-  def _maybe_create_variable_regularizers(self):
-    """Creates added but uninstantiated regularizers."""
-    factories = self._get_regularizer_factories()
-    if factories:
-      for factory in factories:
-        factory()
-      factories[:] = []
-
   @property
   def losses(self):
     """Losses which are associated with this `Layer`.
@@ -321,7 +315,6 @@ class Layer(object):
     Returns:
       A list of tensors.
     """
-    self._maybe_create_variable_regularizers()
     if context.in_eager_mode():
       # _losses may only contain variable regularization losses when executing
       # eagerly, and they have been saved as lambdas to be executed when
@@ -405,7 +398,6 @@ class Layer(object):
       inputs_hash = layers_util.object_list_uid(inputs)
     else:
       inputs_hash = None
-    self._maybe_create_variable_regularizers()
     return self._per_input_losses.get(inputs_hash, [])
 
   def build(self, _):
@@ -500,13 +492,30 @@ class Layer(object):
       instance is returned.
 
     Raises:
-      RuntimeError: If called in Eager mode with partioned variable
-        regularization.
+      RuntimeError: If called with partioned variable regularization and
+        eager execution is enabled.
     """
 
-    in_graph_mode = context.in_graph_mode()
-    if in_graph_mode:
-      existing_variables = set(tf_variables.global_variables())
+    # `init_graph` should point to the graph in which variable initialization
+    # will occur; it should be None if and only if initialization will take
+    # place in the eager context.
+    init_graph = None
+    if context.in_graph_mode():
+      default_graph = ops.get_default_graph()
+      if default_graph.building_function:
+        with ops.init_scope():
+          # Retrieve the variables from the graph into which variables
+          # will be lifted; if initialization ops will be lifted into
+          # the eager context, then there is nothing to retrieve, since variable
+          # collections are not supported when eager execution is enabled.
+          if context.in_graph_mode():
+            init_graph = ops.get_default_graph()
+            existing_variables = set(tf_variables.global_variables())
+      else:
+        # Initialization ops will not be lifted out of the default graph.
+        init_graph = default_graph
+        existing_variables = set(tf_variables.global_variables())
+
     if dtype is None:
       dtype = self.dtype or dtypes.float32
 
@@ -523,54 +532,51 @@ class Layer(object):
                                    trainable=trainable and self.trainable,
                                    partitioner=partitioner)
 
-        if in_graph_mode:
-          if (trainable and self.trainable
-              and variable not in tf_variables.trainable_variables()):
-            # A custom getter / variable scope overrode the trainable flag.
-            trainable = False
+        if init_graph is not None:  # pylint: disable=protected-access
+          # The variable was created and initialized in a graph.
+
           if variable in existing_variables:
             # To match the behavior of tf.get_variable(), we only apply
             # regularization if the variable is newly created.
             return variable
 
-        if regularizer:
-          def regularizer_factory():
-            if context.in_graph_mode():
-              with vs.variable_scope(scope, reuse=reuse,
-                                     auxiliary_name_scope=False):
-                with ops.name_scope(self._name_scope_name(scope)):
-                  if isinstance(variable, tf_variables.PartitionedVariable):
-                    for v in variable:
-                      with ops.colocate_with(v.op):
-                        with ops.name_scope(name + '/Regularizer'):
-                          regularization = regularizer(v)
-                      if regularization is not None:
-                        self.add_loss(regularization)
-                  else:
-                    with ops.colocate_with(variable.op):
-                      with ops.name_scope(name + '/Regularizer'):
-                        regularization = regularizer(variable)
-                    if regularization is not None:
-                      self.add_loss(regularization)
+          with init_graph.as_default():
+            trainable_variables = tf_variables.trainable_variables()
+          if (trainable and self.trainable and
+              variable not in trainable_variables):
+            # A custom getter / variable scope overrode the trainable flag.
+            trainable = False
+
+          if regularizer:
+            if isinstance(variable, tf_variables.PartitionedVariable):
+              for v in variable:
+                with ops.colocate_with(v.op):
+                  with ops.name_scope(name + '/Regularizer'):
+                    regularization = regularizer(v)
+                if regularization is not None:
+                  self.add_loss(regularization)
             else:
-              if isinstance(variable, tf_variables.PartitionedVariable):
-                raise RuntimeError(
-                    'Partitioned variable regularization is not yet '
-                    'supported when executing eagerly. File a feature request'
-                    'if this is important to you.')
-              # Save a zero-argument lambda which runs the regularizer on the
-              # variable, to be executed when `Layer.losses` is requested.
-              # This makes losses responsive to variable updates when
-              # executing eagerly.
-              self._losses.append(lambda: regularizer(variable))
-
-          if hasattr(self, '_defer_regularizers') and self._defer_regularizers:
-            # _defer_regularizers exists and is set to True if `build` was
-            # invoked in `__call__`: deferring regularizer construction
-            # prevents the regularizer from being created in an `init_scope`.
-            self._get_regularizer_factories().append(regularizer_factory)
-          else:
-            regularizer_factory()
+              with ops.colocate_with(variable.op):
+                with ops.name_scope(name + '/Regularizer'):
+                  regularization = regularizer(variable)
+              if regularization is not None:
+                self.add_loss(regularization)
+        elif regularizer:  # and initialization took place in an eager context
+          if isinstance(variable, tf_variables.PartitionedVariable):
+            raise RuntimeError(
+                'Partitioned variable regularization is not yet '
+                'supported when executing eagerly. File a feature request'
+                'if this is important to you.')
+          # Save a zero-argument lambda which runs the regularizer on the
+          # variable, to be executed when `Layer.losses` is requested.
+          # This makes losses responsive to variable updates when executing
+          # eagerly.
+          #
+          # TODO(akshayka): Do the same for graphs as well, so that losses
+          # collected in a while_loop can be run outside its control flow
+          # context and so that losses won't be swallowed up by graph functions
+          # (i.e., `.losses()` should always create regularizers).
+          self._losses.append(lambda: regularizer(variable))
 
     if trainable:
       self._trainable_weights.append(variable)
@@ -644,6 +650,7 @@ class Layer(object):
     else:
       scope_context_manager = vs.variable_scope(
           self._scope, reuse=self._reuse, auxiliary_name_scope=False)
+    input_shapes = None
     with scope_context_manager as scope:
       with ops.name_scope(self._name_scope_name(scope)):
         if not self.built:
@@ -670,15 +677,7 @@ class Layer(object):
             except AttributeError:
               pass
           input_shapes = nest.map_structure(lambda x: x.get_shape(), inputs)
-
-          # Signal to `add_variable` that regularizer construction should be
-          # deferred.
-          self._defer_regularizers = True
-          with ops.init_scope():
-            self.build(input_shapes)
-          # Create any regularizers added by `build`.
-          self._maybe_create_variable_regularizers()
-          self._defer_regularizers = False
+          self.build(input_shapes)
         try:
           # Note: not all sub-classes of Layer call Layer.__init__ (especially
           # the ones under tensorflow/python/keras). Hence we recompute this
@@ -701,6 +700,9 @@ class Layer(object):
         else:
           # Deferred mode behavior: use `compute_output_shape` to
           # infer the number of outputs of the layer and their shapes.
+          if input_shapes is None:
+            input_shapes = nest.map_structure(lambda x: x.get_shape(), inputs)
+
           output_shapes = self.compute_output_shape(input_shapes)
           output_shapes = nest.flatten(output_shapes)
           outputs = [
@@ -1263,6 +1265,15 @@ class InputSpec(object):
     self.min_ndim = min_ndim
     self.axes = axes or {}
 
+  def __repr__(self):
+    spec = [('dtype=' + str(self.dtype)) if self.dtype else '',
+            ('shape=' + str(self.shape)) if self.shape else '',
+            ('ndim=' + str(self.ndim)) if self.ndim else '',
+            ('max_ndim=' + str(self.max_ndim)) if self.max_ndim else '',
+            ('min_ndim=' + str(self.min_ndim)) if self.min_ndim else '',
+            ('axes=' + str(self.axes)) if self.axes else '']
+    return 'InputSpec(%s)' % ', '.join(x for x in spec if x)
+
 
 class Node(object):
   """A `Node` describes the connectivity between two layers.
@@ -1387,7 +1398,10 @@ class _DeferredTensor(object):
 
   def __init__(self, shape, dtype, name=None):
     self.shape = tensor_shape.TensorShape(shape)
-    self.dtype = dtypes.as_dtype(dtype)
+    if dtype is None:
+      self.dtype = dtypes.as_dtype(np.float32)
+    else:
+      self.dtype = dtypes.as_dtype(dtype)
     self.name = name
 
   def get_shape(self):
diff --git a/tensorflow/python/layers/convolutional.py b/tensorflow/python/layers/convolutional.py
index d5147b237b3ce666d735548386faf8572cca1cac..e8dba3cea321a415b84e1ec89fd7b021e2b272d0 100644
--- a/tensorflow/python/layers/convolutional.py
+++ b/tensorflow/python/layers/convolutional.py
@@ -1904,6 +1904,7 @@ class Conv3DTranspose(Conv3D):
           dtype=self.dtype)
     else:
       self.bias = None
+    self.built = True
 
   def call(self, inputs):
     inputs_shape = array_ops.shape(inputs)
@@ -1974,6 +1975,8 @@ class Conv3DTranspose(Conv3D):
 
     if self.use_bias:
       outputs_shape = outputs.shape.as_list()
+      if outputs_shape[0] is None:
+        outputs_shape[0] = -1
       if self.data_format == 'channels_first':
         outputs_4d = array_ops.reshape(outputs, [
             outputs_shape[0], outputs_shape[1],
@@ -2007,11 +2010,11 @@ class Conv3DTranspose(Conv3D):
 
     output_shape[c_axis] = self.filters
     output_shape[d_axis] = utils.deconv_output_length(
-        output_shape[d_axis], stride_d, kernel_d, self.padding)
+        output_shape[d_axis], kernel_d, self.padding, stride_d)
     output_shape[h_axis] = utils.deconv_output_length(
-        output_shape[h_axis], stride_h, kernel_h, self.padding)
+        output_shape[h_axis], kernel_h, self.padding, stride_h)
     output_shape[w_axis] = utils.deconv_output_length(
-        output_shape[w_axis], stride_w, kernel_w, self.padding)
+        output_shape[w_axis], kernel_w, self.padding, stride_w)
     return tensor_shape.TensorShape(output_shape)
 
 
diff --git a/tensorflow/python/layers/core.py b/tensorflow/python/layers/core.py
index e5b93a54f79bef68d96ab7efccc883033e7001c7..7bf62d45b8e97aaa027467b6c9862ca2a4699fc1 100644
--- a/tensorflow/python/layers/core.py
+++ b/tensorflow/python/layers/core.py
@@ -49,9 +49,6 @@ class Dense(base.Layer):
   and `bias` is a bias vector created by the layer
   (only if `use_bias` is `True`).
 
-  Note: if the input to the layer has a rank greater than 2, then it is
-  flattened prior to the initial matrix multiply by `kernel`.
-
   Arguments:
     units: Integer or Long, dimensionality of the output space.
     activation: Activation function (callable). Set it to None to maintain a
@@ -199,9 +196,6 @@ def dense(
   and `bias` is a bias vector created by the layer
   (only if `use_bias` is `True`).
 
-  Note: if the `inputs` tensor has a rank greater than 2, then it is
-  flattened prior to the initial matrix multiply by `kernel`.
-
   Arguments:
     inputs: Tensor input.
     units: Integer or Long, dimensionality of the output space.
@@ -230,7 +224,8 @@ def dense(
       by the same name.
 
   Returns:
-    Output tensor.
+    Output tensor the same shape as `inputs` except the last dimension is of
+    size `units`.
 
   Raises:
     ValueError: if eager execution is enabled.
diff --git a/tensorflow/python/layers/maxout.py b/tensorflow/python/layers/maxout.py
index ed048845a0b88344b357836a838231677cbf40ce..20ce6c9770087f9cfb90d40235955dfe1b7ee98b 100644
--- a/tensorflow/python/layers/maxout.py
+++ b/tensorflow/python/layers/maxout.py
@@ -31,15 +31,18 @@ from tensorflow.python.layers import base
 def maxout(inputs, num_units, axis=-1, name=None):
   """Adds a maxout op from https://arxiv.org/abs/1302.4389
 
-  "Maxout Networks" Ian J. Goodfellow, David Warde-Farley, Mehdi Mirza, Aaron Courville,
+  "Maxout Networks" Ian J. Goodfellow, David Warde-Farley, Mehdi Mirza, Aaron
+  Courville,
    Yoshua Bengio
 
-   Usually the operation is performed in the filter/channel dimension. This can also be
+   Usually the operation is performed in the filter/channel dimension. This can
+   also be
    used after fully-connected layers to reduce number of features.
 
    Arguments:
    inputs: Tensor input
-   num_units: Specifies how many features will remain after maxout in the `axis` dimension
+   num_units: Specifies how many features will remain after maxout in the `axis`
+     dimension
          (usually channel). This must be multiple of number of `axis`.
    axis: The dimension where max pooling will be performed. Default is the
    last dimension.
@@ -57,15 +60,18 @@ def maxout(inputs, num_units, axis=-1, name=None):
 class MaxOut(base.Layer):
   """Adds a maxout op from https://arxiv.org/abs/1302.4389
 
-  "Maxout Networks" Ian J. Goodfellow, David Warde-Farley, Mehdi Mirza, Aaron Courville, Yoshua
+  "Maxout Networks" Ian J. Goodfellow, David Warde-Farley, Mehdi Mirza, Aaron
+  Courville, Yoshua
   Bengio
 
-  Usually the operation is performed in the filter/channel dimension. This can also be
+  Usually the operation is performed in the filter/channel dimension. This can
+  also be
   used after fully-connected layers to reduce number of features.
 
   Arguments:
     inputs: Tensor input
-    num_units: Specifies how many features will remain after maxout in the `axis` dimension
+    num_units: Specifies how many features will remain after maxout in the
+      `axis` dimension
          (usually channel).
     This must be multiple of number of `axis`.
     axis: The dimension where max pooling will be performed. Default is the
@@ -79,13 +85,8 @@ class MaxOut(base.Layer):
     ValueError: if num_units is not multiple of number of features.
   """
 
-  def __init__(self,
-         num_units,
-         axis=-1,
-         name=None,
-         **kwargs):
-    super(MaxOut, self).__init__(
-      name=name, trainable=False, **kwargs)
+  def __init__(self, num_units, axis=-1, name=None, **kwargs):
+    super(MaxOut, self).__init__(name=name, trainable=False, **kwargs)
     self.axis = axis
     self.num_units = num_units
 
@@ -95,8 +96,8 @@ class MaxOut(base.Layer):
     num_channels = shape[self.axis]
     if num_channels % self.num_units:
       raise ValueError('number of features({}) is not '
-               'a multiple of num_units({})'
-               .format(num_channels, self.num_units))
+                       'a multiple of num_units({})'.format(
+                           num_channels, self.num_units))
     shape[self.axis] = -1
     shape += [num_channels // self.num_units]
 
@@ -104,6 +105,7 @@ class MaxOut(base.Layer):
     for i in range(len(shape)):
       if shape[i] is None:
         shape[i] = gen_array_ops.shape(inputs)[i]
-    outputs = math_ops.reduce_max(gen_array_ops.reshape(inputs, shape), -1, keep_dims=False)
+    outputs = math_ops.reduce_max(
+        gen_array_ops.reshape(inputs, shape), -1, keep_dims=False)
 
     return outputs
diff --git a/tensorflow/python/layers/network.py b/tensorflow/python/layers/network.py
index ade57da411d67241e027e0bb559e49bc3c077e6d..745843975c487c34ab854ad2bb52b93e617fcaec 100644
--- a/tensorflow/python/layers/network.py
+++ b/tensorflow/python/layers/network.py
@@ -574,6 +574,11 @@ class GraphNetwork(base.Layer):
         return layer
     raise ValueError('No such layer: ' + name)
 
+  @property
+  def stateful(self):
+    return any([(hasattr(layer, 'stateful') and layer.stateful)
+                for layer in self.layers])
+
   @property
   def updates(self):
     """Retrieve the network's updates.
@@ -586,6 +591,8 @@ class GraphNetwork(base.Layer):
     Returns:
         A list of update ops.
     """
+    if not self.trainable and not self.stateful:
+      return []
     updates = []
     for layer in self.layers:
       if hasattr(layer, 'updates'):
@@ -614,6 +621,11 @@ class GraphNetwork(base.Layer):
         A list of loss tensors.
     """
     losses = []
+    if context.in_eager_mode():
+      for layer in self.layers:
+        losses += layer.losses
+      return losses
+
     # Retrieve losses for all internal layers.
     for layer in self.layers:
       if hasattr(layer, 'losses'):
@@ -846,7 +858,6 @@ class GraphNetwork(base.Layer):
       for node in nodes:
         # This is always a single layer, never a list.
         layer = node.outbound_layer
-
         reference_input_tensors = node.input_tensors
         reference_output_tensors = node.output_tensors
 
@@ -894,12 +905,13 @@ class GraphNetwork(base.Layer):
               else:
                 output_masks = [None for _ in range(len(output_tensors))]
 
-            # Apply activity regularizer if any:
-            if layer.activity_regularizer is not None:
-              regularization_losses = [
-                  layer.activity_regularizer(x) for x in computed_tensors
-              ]
-              layer.add_loss(regularization_losses, computed_tensors)
+            if context.in_graph_mode():
+              if layer.activity_regularizer is not None:
+                regularization_losses = [
+                    layer.activity_regularizer(x) for x in computed_tensors
+                ]
+                # Apply activity regularizer if any:
+                layer.add_loss(regularization_losses, computed_tensors)
 
           if context.in_graph_mode():
             # Update model updates and losses:
diff --git a/tensorflow/python/layers/pooling_test.py b/tensorflow/python/layers/pooling_test.py
index e4d4ed4a2a2203a71e1c3db879f282543d3b8de7..7533674e5a0cf60f91551cd6333c8d802612e03d 100644
--- a/tensorflow/python/layers/pooling_test.py
+++ b/tensorflow/python/layers/pooling_test.py
@@ -96,33 +96,41 @@ class PoolingTest(test.TestCase):
 
   def testCreateMaxPooling1D(self):
     width = 7
-    images = random_ops.random_uniform((5, width, 4))
+    channels = 3
+    images = random_ops.random_uniform((5, width, channels))
     layer = pooling_layers.MaxPooling1D(2, strides=2)
     output = layer.apply(images)
-    self.assertListEqual(output.get_shape().as_list(), [5, 3, 4])
+    self.assertListEqual(output.get_shape().as_list(),
+                         [5, width // 2, channels])
 
   def testCreateAveragePooling1D(self):
     width = 7
-    images = random_ops.random_uniform((5, width, 4))
+    channels = 3
+    images = random_ops.random_uniform((5, width, channels))
     layer = pooling_layers.AveragePooling1D(2, strides=2)
     output = layer.apply(images)
-    self.assertListEqual(output.get_shape().as_list(), [5, 3, 4])
+    self.assertListEqual(output.get_shape().as_list(),
+                         [5, width // 2, channels])
 
   def testCreateMaxPooling1DChannelsFirst(self):
     width = 7
-    images = random_ops.random_uniform((5, 4, width))
+    channels = 3
+    images = random_ops.random_uniform((5, channels, width))
     layer = pooling_layers.MaxPooling1D(
         2, strides=2, data_format='channels_first')
     output = layer.apply(images)
-    self.assertListEqual(output.get_shape().as_list(), [5, 4, 3])
+    self.assertListEqual(output.get_shape().as_list(),
+                         [5, channels, width // 2])
 
   def testCreateAveragePooling1DChannelsFirst(self):
     width = 7
-    images = random_ops.random_uniform((5, 4, width))
+    channels = 3
+    images = random_ops.random_uniform((5, channels, width))
     layer = pooling_layers.AveragePooling1D(
         2, strides=2, data_format='channels_first')
     output = layer.apply(images)
-    self.assertListEqual(output.get_shape().as_list(), [5, 4, 3])
+    self.assertListEqual(output.get_shape().as_list(),
+                         [5, channels, width // 2])
 
   def testCreateMaxPooling3D(self):
     depth, height, width = 6, 7, 9
diff --git a/tensorflow/python/layers/utils.py b/tensorflow/python/layers/utils.py
index e8be347799acf2e92e79ca76f44f25d573489940..7407d9a7b30268271810a23a7146c92fd28f8d4e 100644
--- a/tensorflow/python/layers/utils.py
+++ b/tensorflow/python/layers/utils.py
@@ -81,7 +81,7 @@ def normalize_tuple(value, n, name):
     for single_value in value_tuple:
       try:
         int(single_value)
-      except ValueError:
+      except (ValueError, TypeError):
         raise ValueError('The `' + name + '` argument must be a tuple of ' +
                          str(n) + ' integers. Received: ' + str(value) + ' '
                          'including element ' + str(single_value) + ' of type' +
diff --git a/tensorflow/python/lib/core/py_func.cc b/tensorflow/python/lib/core/py_func.cc
index d3bfa0ee337d1f606e5e994406969685a2986ab4..e0422ef80add42307268be2743e668eb8c8acb68 100644
--- a/tensorflow/python/lib/core/py_func.cc
+++ b/tensorflow/python/lib/core/py_func.cc
@@ -19,6 +19,7 @@ limitations under the License.
 
 #include "numpy/arrayobject.h"
 #include "tensorflow/c/eager/c_api.h"
+#include "tensorflow/c/eager/c_api_internal.h"
 #include "tensorflow/c/tf_status_helper.h"
 #include "tensorflow/core/framework/allocation_description.pb.h"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -53,6 +54,12 @@ struct PyCall {
   // with this "token".
   string token;
 
+  // The device on which Tensors are stored; only used for EagerPyFunc.
+  Device* device;
+
+  // True if and only if the op has been placed on a GPU.
+  bool gpu;
+
   // True if the call is associated with an EagerPyFunc.
   bool eager;
 
@@ -71,7 +78,12 @@ Status MakeArgTuple(const PyCall* call, PyObject** tuple) {
     PyObject* arg = nullptr;
     const Tensor& t = call->ins[i];
     if (call->eager) {
-      arg = EagerTensorFromHandle(TFE_NewTensorHandle(t));
+      if (call->gpu) {
+        arg = EagerTensorFromHandle(new TFE_TensorHandle(t, call->device));
+      } else {
+        // TFE_TensorHandle assumes that CPU is identified by `nullptr`.
+        arg = EagerTensorFromHandle(new TFE_TensorHandle(t, nullptr));
+      }
       if (arg == nullptr) {
         return errors::Internal("Unable to procure EagerTensor from Tensor.");
       }
@@ -84,7 +96,8 @@ Status MakeArgTuple(const PyCall* call, PyObject** tuple) {
     }
     PyList_SetItem(lst, i, arg);
   }
-  *tuple = Py_BuildValue("(sN)", call->token.c_str(), lst);
+  *tuple = Py_BuildValue("(sON)", call->token.c_str(),
+                         call->gpu ? Py_True : Py_False, lst);
   CHECK(*tuple);
   return Status::OK();
 }
@@ -150,15 +163,9 @@ bool IsSingleNone(PyObject* obj) {
 }
 
 // Retrieves a Tensor from `eager_tensor` and stores it in `output_tensor`.
-Status ExtractTensorFromEagerTensor(const PyObject* eager_tensor,
-                                    Tensor* output_tensor,
-                                    TF_Status* tf_status) {
-  // TODO(akshayka): Lift the restriction requiring output tensors to
-  // lie in host memory; EagerPyFunc should be able to dispatch ops on GPU
-  // tensors, so we should eventually implement a GPU kernel for EagerPyFunc.
-  *output_tensor = *TFE_TensorHandleUnderlyingTensorInHostMemory(
-      EagerTensor_Handle(eager_tensor), tf_status);
-  return StatusFromTF_Status(tf_status);
+void ExtractTensorFromEagerTensor(const PyObject* eager_tensor,
+                                  Tensor* output_tensor) {
+  *output_tensor = EagerTensor_Handle(eager_tensor)->t;
 }
 
 // Calls the registered py function through the trampoline.
@@ -201,15 +208,23 @@ Status DoCallPyFunc(PyCall* call, bool* out_log_on_error) {
   }
 
   // Process the return values and convert them to TF Tensors.
-  Status s;
+  Status s = Status::OK();
   if (PyList_Check(result)) {
+    // `result` is a Python list; if this operation is an `EagerPyFunc`, then
+    // every item in the list must be an `EagerTensor`; otherwise, every element
+    // must be a NumPy array.
     call->out.clear();
     for (int i = 0; i < PyList_Size(result); ++i) {
       Tensor t;
       if (call->eager) {
-        auto tf_status = tensorflow::make_safe(TF_NewStatus());
-        s = ExtractTensorFromEagerTensor(PyList_GetItem(result, i), &t,
-                                         tf_status.get());
+        const PyObject* item = PyList_GetItem(result, i);
+        if (EagerTensor_CheckExact(item)) {
+          ExtractTensorFromEagerTensor(item, &t);
+        } else {
+          s = errors::FailedPrecondition(
+              "Expected EagerTensor, found PyObject of type: ",
+              Py_TYPE(item)->tp_name);
+        }
       } else {
         s = ConvertNdarrayToTensor(PyList_GetItem(result, i), &t);
       }
@@ -220,16 +235,15 @@ Status DoCallPyFunc(PyCall* call, bool* out_log_on_error) {
       call->out.push_back(t);
     }
   } else if (EagerTensor_CheckExact(result) || result == Py_None) {
+    // result is an `EagerTensor` or `None`.
     DCHECK(call->eager);
     Tensor t;
     if (result != Py_None) {
-      auto tf_status = tensorflow::make_safe(TF_NewStatus());
-      s = ExtractTensorFromEagerTensor(result, &t, tf_status.get());
-      if (s.ok()) {
-        call->out.push_back(t);
-      }
+      ExtractTensorFromEagerTensor(result, &t);
+      call->out.push_back(t);
     }
   } else if (PyArray_Check(result)) {
+    // `result` is a NumPy array.
     DCHECK(!call->eager);
     if (!IsSingleNone(result)) {
       Tensor t;
@@ -239,7 +253,7 @@ Status DoCallPyFunc(PyCall* call, bool* out_log_on_error) {
       }
     }
   } else {
-    s = errors::Internal("Unexpected pyobject is returned: ",
+    s = errors::Internal("Unexpected PyObject was returned: ",
                          Py_TYPE(result)->tp_name);
   }
   Py_DECREF(result);
@@ -429,12 +443,24 @@ class PyFuncOp : public OpKernel {
   explicit PyFuncOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("token", &token_));
     eager_ = type_string() == "EagerPyFunc";
+    gpu_ = ctx->device_type().type_string() == DEVICE_GPU;
   }
 
   void Compute(OpKernelContext* ctx) override {
     PyCall call;
     call.token = token_;
+    call.gpu = gpu_;
     call.eager = eager_;
+    if (call.eager) {
+      // Eager's C API uses `Device`, whereas `OpKernelContext` stores a
+      // `DeviceBase`; attempt to downcast.
+      call.device = dynamic_cast<Device*>(ctx->device());
+      if (call.device == nullptr) {
+        ctx->CtxFailureWithWarning(
+            errors::Internal("Unrecognized device class"));
+      }
+    }
+
     for (int i = 0; i < ctx->num_inputs(); ++i) {
       call.ins.push_back(ctx->input(i));
     }
@@ -476,6 +502,9 @@ class PyFuncOp : public OpKernel {
  private:
   string token_;
 
+  // True if and only if this op has been placed on a GPU.
+  bool gpu_;
+
   // True if and only if this op should execute the python function eagerly,
   // i.e., if and only if the eager attribute is set.
   bool eager_;
@@ -486,5 +515,6 @@ class PyFuncOp : public OpKernel {
 REGISTER_KERNEL_BUILDER(Name("PyFunc").Device(DEVICE_CPU), PyFuncOp);
 REGISTER_KERNEL_BUILDER(Name("PyFuncStateless").Device(DEVICE_CPU), PyFuncOp);
 REGISTER_KERNEL_BUILDER(Name("EagerPyFunc").Device(DEVICE_CPU), PyFuncOp);
+REGISTER_KERNEL_BUILDER(Name("EagerPyFunc").Device(DEVICE_GPU), PyFuncOp);
 
 }  // end namespace tensorflow
diff --git a/tensorflow/python/lib/io/file_io.py b/tensorflow/python/lib/io/file_io.py
index 4e3071d8513a28b02b70b290c4987bec92b3c32e..59f5075f177ef5335115cb4f24182d28a9b547c8 100644
--- a/tensorflow/python/lib/io/file_io.py
+++ b/tensorflow/python/lib/io/file_io.py
@@ -31,6 +31,7 @@ from tensorflow.python.framework import c_api_util
 from tensorflow.python.framework import errors
 from tensorflow.python.util import compat
 from tensorflow.python.util import deprecation
+from tensorflow.python.util.tf_export import tf_export
 
 
 class FileIO(object):
@@ -235,6 +236,7 @@ class FileIO(object):
     self._writable_file = None
 
 
+@tf_export("gfile.Exists")
 def file_exists(filename):
   """Determines whether a path exists or not.
 
@@ -256,6 +258,7 @@ def file_exists(filename):
   return True
 
 
+@tf_export("gfile.Remove")
 def delete_file(filename):
   """Deletes the file located at 'filename'.
 
@@ -306,6 +309,7 @@ def write_string_to_file(filename, file_content):
     f.write(file_content)
 
 
+@tf_export("gfile.Glob")
 def get_matching_files(filename):
   """Returns a list of files that match the given pattern(s).
 
@@ -336,6 +340,7 @@ def get_matching_files(filename):
       ]
 
 
+@tf_export("gfile.MkDir")
 def create_dir(dirname):
   """Creates a directory with the name 'dirname'.
 
@@ -353,6 +358,7 @@ def create_dir(dirname):
     pywrap_tensorflow.CreateDir(compat.as_bytes(dirname), status)
 
 
+@tf_export("gfile.MakeDirs")
 def recursive_create_dir(dirname):
   """Creates a directory and all parent/intermediate directories.
 
@@ -368,6 +374,7 @@ def recursive_create_dir(dirname):
     pywrap_tensorflow.RecursivelyCreateDir(compat.as_bytes(dirname), status)
 
 
+@tf_export("gfile.Copy")
 def copy(oldpath, newpath, overwrite=False):
   """Copies data from oldpath to newpath.
 
@@ -385,6 +392,7 @@ def copy(oldpath, newpath, overwrite=False):
         compat.as_bytes(oldpath), compat.as_bytes(newpath), overwrite, status)
 
 
+@tf_export("gfile.Rename")
 def rename(oldname, newname, overwrite=False):
   """Rename or move a file / directory.
 
@@ -426,6 +434,7 @@ def atomic_write_string_to_file(filename, contents, overwrite=True):
     raise
 
 
+@tf_export("gfile.DeleteRecursively")
 def delete_recursively(dirname):
   """Deletes everything under dirname recursively.
 
@@ -439,6 +448,7 @@ def delete_recursively(dirname):
     pywrap_tensorflow.DeleteRecursively(compat.as_bytes(dirname), status)
 
 
+@tf_export("gfile.IsDirectory")
 def is_directory(dirname):
   """Returns whether the path is a directory or not.
 
@@ -452,6 +462,7 @@ def is_directory(dirname):
   return pywrap_tensorflow.IsDirectory(compat.as_bytes(dirname), status)
 
 
+@tf_export("gfile.ListDirectory")
 def list_directory(dirname):
   """Returns a list of entries contained within a directory.
 
@@ -479,6 +490,7 @@ def list_directory(dirname):
     ]
 
 
+@tf_export("gfile.Walk")
 def walk(top, in_order=True):
   """Recursive directory tree generator for directories.
 
@@ -522,6 +534,7 @@ def walk(top, in_order=True):
     yield here
 
 
+@tf_export("gfile.Stat")
 def stat(filename):
   """Returns file statistics for a given path.
 
diff --git a/tensorflow/python/lib/io/tf_record.py b/tensorflow/python/lib/io/tf_record.py
index df190100689bd864de78f5a2cf52b1ade081a789..48ea107a146c2714f7b59f53abbcd8b60dbf2fd4 100644
--- a/tensorflow/python/lib/io/tf_record.py
+++ b/tensorflow/python/lib/io/tf_record.py
@@ -22,8 +22,10 @@ from __future__ import print_function
 from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.framework import errors
 from tensorflow.python.util import compat
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("python_io.TFRecordCompressionType")
 class TFRecordCompressionType(object):
   """The type of compression for the record."""
   NONE = 0
@@ -33,6 +35,7 @@ class TFRecordCompressionType(object):
 
 # NOTE(vrv): This will eventually be converted into a proto.  to match
 # the interface used by the C++ RecordWriter.
+@tf_export("python_io.TFRecordOptions")
 class TFRecordOptions(object):
   """Options used for manipulating TFRecord files."""
   compression_type_map = {
@@ -51,6 +54,7 @@ class TFRecordOptions(object):
     return cls.compression_type_map[options.compression_type]
 
 
+@tf_export("python_io.tf_record_iterator")
 def tf_record_iterator(path, options=None):
   """An iterator that read the records from a TFRecords file.
 
@@ -81,6 +85,7 @@ def tf_record_iterator(path, options=None):
   reader.Close()
 
 
+@tf_export("python_io.TFRecordWriter")
 class TFRecordWriter(object):
   """A class to write records to a TFRecords file.
 
diff --git a/tensorflow/python/ops/array_grad.py b/tensorflow/python/ops/array_grad.py
index 55cae0bcbfca8a9cacfe525fe3b69c7fb232acd3..c9292184e6a9910db6b41022ab01312ce44e0a96 100644
--- a/tensorflow/python/ops/array_grad.py
+++ b/tensorflow/python/ops/array_grad.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Gradients for operators defined in array_ops.py."""
 
 from __future__ import absolute_import
@@ -131,8 +130,8 @@ def _ConcatGradHelper(op, grad, start_value_index, end_value_index, dim_index):
         # extract the size of each input along the concat dimension
         sizes = array_ops.squeeze(
             array_ops.slice(
-                array_ops.stack(
-                    sizes, axis=1), [non_neg_concat_dim, 0], [1, -1]))
+                array_ops.stack(sizes, axis=1), [non_neg_concat_dim, 0],
+                [1, -1]))
         out_grads = array_ops.split(grad, sizes, non_neg_concat_dim)
       else:
         offset = gen_array_ops._concat_offset(non_neg_concat_dim, sizes)
@@ -167,8 +166,7 @@ def _ConcatGradHelper(op, grad, start_value_index, end_value_index, dim_index):
         new_values = array_ops.slice(
             grad.values, begin,
             array_ops.concat([[-1], array_ops.slice(size, [1], [-1])], 0))
-        out_grads.append(
-            ops.IndexedSlices(new_values, grad.indices, size))
+        out_grads.append(ops.IndexedSlices(new_values, grad.indices, size))
         # Lint complains begin = begin + ...
         begin = math_ops.add(begin, size * mask)
     else:
@@ -178,30 +176,33 @@ def _ConcatGradHelper(op, grad, start_value_index, end_value_index, dim_index):
       for size in sizes:
         size_concat_dim = array_ops.gather(size, non_neg_concat_dim)
         if size_concat_dim.dtype != grad.indices.dtype:
-          size_concat_dim = math_ops.cast(size_concat_dim,
-                                          dtype=grad.indices.dtype)
+          size_concat_dim = math_ops.cast(
+              size_concat_dim, dtype=grad.indices.dtype)
         end = start + size_concat_dim
         # Compute the 1-D Tensor of indices relevant for this input.
         indices_to_select = array_ops.squeeze(
-            array_ops.where(math_ops.logical_and(grad.indices >= start,
-                                                 grad.indices < end)),
+            array_ops.where(
+                math_ops.logical_and(grad.indices >= start,
+                                     grad.indices < end)),
             squeeze_dims=[1])
         new_indices = array_ops.gather(grad.indices, indices_to_select) - start
         new_values = array_ops.gather(grad.values, indices_to_select)
-        out_grads.append(
-            ops.IndexedSlices(new_values, new_indices, size))
+        out_grads.append(ops.IndexedSlices(new_values, new_indices, size))
         start = end
   else:
     raise TypeError("Expected Tensor or IndexedSlices, got %s" % type(grad))
 
-  return (out_grads + [None] if end_value_index <= dim_index
-          else [None] + out_grads)
+  return (out_grads + [None]
+          if end_value_index <= dim_index else [None] + out_grads)
 
 
 @ops.RegisterGradient("Concat")
 def _ConcatGrad(op, grad):
   return _ConcatGradHelper(
-      op, grad, start_value_index=1, end_value_index=len(op.inputs),
+      op,
+      grad,
+      start_value_index=1,
+      end_value_index=len(op.inputs),
       dim_index=0)
 
 
@@ -287,9 +288,13 @@ def _SplitGrad(op, *grads):
 @ops.RegisterGradient("SplitV")
 def _SplitVGrad(op, *grads):
   returnval = array_ops.concat(list(grads), op.inputs[2])
-  returnval = [returnval] + [None,] * (len(op.inputs) - 1)
+  returnval = [returnval] + [
+      None,
+  ] * (
+      len(op.inputs) - 1)
   return returnval
 
+
 ops.NotDifferentiable("Const")
 
 
@@ -334,9 +339,9 @@ def _MatrixSetDiagGrad(op, grad):
       matrix_shape = array_ops.slice(grad_shape, [grad_rank - 2], [2])
       min_dim = math_ops.reduce_min(matrix_shape)
       diag_shape = array_ops.concat([batch_shape, [min_dim]], 0)
-  grad_input = array_ops.matrix_set_diag(
-      grad, array_ops.zeros(
-          diag_shape, dtype=grad.dtype))
+  grad_input = array_ops.matrix_set_diag(grad,
+                                         array_ops.zeros(
+                                             diag_shape, dtype=grad.dtype))
   grad_diag = array_ops.matrix_diag_part(grad)
   return (grad_input, grad_diag)
 
@@ -444,8 +449,8 @@ def _GatherV2Grad(op, grad):
   values_transpose = array_ops.transpose(values, transpose_dims)
   num_segments = params_shape[axis]
 
-  params_grad = math_ops.unsorted_segment_sum(
-      values_transpose, indices, num_segments)
+  params_grad = math_ops.unsorted_segment_sum(values_transpose, indices,
+                                              num_segments)
 
   # Inverts the above transpose by moving dimension 0 back to its original
   # position.
@@ -536,13 +541,10 @@ def _ConjugateTransposeGrad(op, grad):
 
 ops.NotDifferentiable("Shape")
 
-
 ops.NotDifferentiable("ShapeN")
 
-
 ops.NotDifferentiable("Rank")
 
-
 ops.NotDifferentiable("Size")
 
 
@@ -590,6 +592,7 @@ def _PadGrad(op, grad):
   else:
     return x_grad, None
 
+
 ops.RegisterGradient("Pad")(_PadGrad)
 ops.RegisterGradient("PadV2")(_PadGrad)
 
@@ -625,30 +628,34 @@ def _ReverseV2Grad(op, grad):
 def _SpaceToBatchGrad(op, grad):
   # Its gradient is the opposite op: BatchToSpace.
   block_size = op.get_attr("block_size")
-  return [array_ops.batch_to_space(grad, op.inputs[1], block_size=block_size),
-          None]
+  return [
+      array_ops.batch_to_space(grad, op.inputs[1], block_size=block_size), None
+  ]
 
 
 @ops.RegisterGradient("SpaceToBatchND")
 def _SpaceToBatchNDGrad(op, grad):
   # Its gradient is the opposite op: BatchToSpaceND.
-  return [array_ops.batch_to_space_nd(grad, op.inputs[1], op.inputs[2]),
-          None, None]
+  return [
+      array_ops.batch_to_space_nd(grad, op.inputs[1], op.inputs[2]), None, None
+  ]
 
 
 @ops.RegisterGradient("BatchToSpace")
 def _BatchToSpaceGrad(op, grad):
   # Its gradient is the opposite op: SpaceToBatch.
   block_size = op.get_attr("block_size")
-  return [array_ops.space_to_batch(grad, op.inputs[1], block_size=block_size),
-          None]
+  return [
+      array_ops.space_to_batch(grad, op.inputs[1], block_size=block_size), None
+  ]
 
 
 @ops.RegisterGradient("BatchToSpaceND")
 def _BatchToSpaceNDGrad(op, grad):
   # Its gradient is the opposite op: SpaceToBatchND.
-  return [array_ops.space_to_batch_nd(grad, op.inputs[1], op.inputs[2]),
-          None, None]
+  return [
+      array_ops.space_to_batch_nd(grad, op.inputs[1], op.inputs[2]), None, None
+  ]
 
 
 @ops.RegisterGradient("SpaceToDepth")
@@ -712,30 +719,28 @@ def _QuantizeAndDequantizeV3Grad(_, grad):
 def _ExtractImagePatchesGrad(op, grad):
 
   batch_size, rows_in, cols_in, channels = [
-    dim.value for dim in op.inputs[0].get_shape()
+      dim.value for dim in op.inputs[0].get_shape()
   ]
   input_bhwc = array_ops.shape(op.inputs[0])
   batch_size = input_bhwc[0]
   channels = input_bhwc[3]
 
-  _, rows_out, cols_out, _ = [
-    dim.value for dim in op.outputs[0].get_shape()
-  ]
-  _, ksize_r, ksize_c, _ = op.get_attr('ksizes')
-  _, stride_r, stride_h, _ = op.get_attr('strides')
-  _, rate_r, rate_c, _ = op.get_attr('rates')
-  padding = op.get_attr('padding')
+  _, rows_out, cols_out, _ = [dim.value for dim in op.outputs[0].get_shape()]
+  _, ksize_r, ksize_c, _ = op.get_attr("ksizes")
+  _, stride_r, stride_h, _ = op.get_attr("strides")
+  _, rate_r, rate_c, _ = op.get_attr("rates")
+  padding = op.get_attr("padding")
 
   ksize_r_eff = ksize_r + (ksize_r - 1) * (rate_r - 1)
   ksize_c_eff = ksize_c + (ksize_c - 1) * (rate_c - 1)
 
-  if padding == b'SAME':
+  if padding == b"SAME":
     rows_out = int(ceil(rows_in / stride_r))
     cols_out = int(ceil(cols_in / stride_h))
     pad_rows = ((rows_out - 1) * stride_r + ksize_r_eff - rows_in) // 2
     pad_cols = ((cols_out - 1) * stride_h + ksize_c_eff - cols_in) // 2
 
-  elif padding == b'VALID':
+  elif padding == b"VALID":
     rows_out = int(ceil((rows_in - ksize_r_eff + 1) / stride_r))
     cols_out = int(ceil((cols_in - ksize_c_eff + 1) / stride_h))
     pad_rows = (rows_out - 1) * stride_r + ksize_r_eff - rows_in
@@ -744,10 +749,9 @@ def _ExtractImagePatchesGrad(op, grad):
   pad_rows, pad_cols = max(0, pad_rows), max(0, pad_cols)
 
   grad_expanded = array_ops.transpose(
-    array_ops.reshape(grad, (batch_size, rows_out,
-                             cols_out, ksize_r, ksize_c, channels)),
-    (1, 2, 3, 4, 0, 5)
-  )
+      array_ops.reshape(
+          grad, (batch_size, rows_out, cols_out, ksize_r, ksize_c, channels)),
+      (1, 2, 3, 4, 0, 5))
   grad_flat = array_ops.reshape(grad_expanded, (-1, batch_size * channels))
 
   row_steps = range(0, rows_out * stride_r, stride_r)
@@ -759,29 +763,21 @@ def _ExtractImagePatchesGrad(op, grad):
       r_low, c_low = row_steps[i] - pad_rows, col_steps[j] - pad_cols
       r_high, c_high = r_low + ksize_r_eff, c_low + ksize_c_eff
 
-      idx.extend([(r * (cols_in) + c,
-                   i * (cols_out * ksize_r * ksize_c) +
-                   j * (ksize_r * ksize_c) +
-                   ri * (ksize_c) + ci)
+      idx.extend([(r * (cols_in) + c, i * (cols_out * ksize_r * ksize_c) + j *
+                   (ksize_r * ksize_c) + ri * (ksize_c) + ci)
                   for (ri, r) in enumerate(range(r_low, r_high, rate_r))
                   for (ci, c) in enumerate(range(c_low, c_high, rate_c))
-                  if 0 <= r and r < rows_in and 0 <= c and c < cols_in
-      ])
+                  if 0 <= r and r < rows_in and 0 <= c and c < cols_in])
 
-  sp_shape = (rows_in * cols_in,
-              rows_out * cols_out * ksize_r * ksize_c)
+  sp_shape = (rows_in * cols_in, rows_out * cols_out * ksize_r * ksize_c)
 
   sp_mat = sparse_tensor.SparseTensor(
-    array_ops.constant(idx, dtype=ops.dtypes.int64),
-    array_ops.ones((len(idx),), dtype=ops.dtypes.float32),
-    sp_shape
-  )
+      array_ops.constant(idx, dtype=ops.dtypes.int64),
+      array_ops.ones((len(idx),), dtype=ops.dtypes.float32), sp_shape)
 
   jac = sparse_ops.sparse_tensor_dense_matmul(sp_mat, grad_flat)
 
-  grad_out = array_ops.reshape(
-    jac, (rows_in, cols_in, batch_size, channels)
-  )
+  grad_out = array_ops.reshape(jac, (rows_in, cols_in, batch_size, channels))
   grad_out = array_ops.transpose(grad_out, (2, 0, 1, 3))
 
   return [grad_out]
diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py
index 24a0c186198c7389af9add64ec6466b1f3d2afbd..9541b097a94466861a83cb48ed3111563490cfba 100644
--- a/tensorflow/python/ops/array_ops.py
+++ b/tensorflow/python/ops/array_ops.py
@@ -34,6 +34,7 @@ See the @{$python/array_ops} guide.
 @@reshape
 @@squeeze
 @@expand_dims
+@@unravel_index
 @@meshgrid
 @@slice
 @@strided_slice
diff --git a/tensorflow/python/ops/control_flow_ops.py b/tensorflow/python/ops/control_flow_ops.py
index d379eccc20dcd63255ee8c2dbe3fbd3e6a9077af..49191c647d59691a59aa5d7dd9cc9dac285b9fea 100644
--- a/tensorflow/python/ops/control_flow_ops.py
+++ b/tensorflow/python/ops/control_flow_ops.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Control Flow Operations.
 
 See the @{$python/control_flow_ops} guide.
@@ -84,7 +83,6 @@ from tensorflow.python.util import nest
 from tensorflow.python.util import tf_should_use
 from tensorflow.python.util.tf_export import tf_export
 
-
 # We override the 'tuple' for a control flow op, so we keep python's
 # existing 'tuple' for later use in this module.
 _basetuple = tuple
@@ -156,9 +154,10 @@ def Assert(condition, data, summarize=None, name=None):
       xs = ops.convert_n_to_tensor(data)
       data_str = [_summarize_eager(x, summarize) for x in xs]
       raise errors.InvalidArgumentError(
-          node_def=None, op=None,
-          message="Expected '%s' to be true. Summarized data: %s" % (
-              condition, "\n".join(data_str)))
+          node_def=None,
+          op=None,
+          message="Expected '%s' to be true. Summarized data: %s" %
+          (condition, "\n".join(data_str)))
     return
 
   with ops.name_scope(name, "Assert", [condition, data]) as name:
@@ -167,15 +166,15 @@ def Assert(condition, data, summarize=None, name=None):
       # As a simple heuristic, we assume that string and int32 are
       # on host to avoid the need to use cond. If it is not case,
       # we will pay the price copying the tensor to host memory.
-      return gen_logging_ops._assert(
-          condition, data, summarize, name="Assert")
+      return gen_logging_ops._assert(condition, data, summarize, name="Assert")
     else:
       condition = ops.convert_to_tensor(condition, name="Condition")
+
       def true_assert():
         return gen_logging_ops._assert(
             condition, data, summarize, name="Assert")
-      guarded_assert = cond(
-          condition, no_op, true_assert, name="AssertGuard")
+
+      guarded_assert = cond(condition, no_op, true_assert, name="AssertGuard")
       if context.in_eager_mode():
         return
       return guarded_assert.op
@@ -215,7 +214,7 @@ def _Identity(data, name=None):
 def _NextIteration(data, name=None):
   data = ops.internal_convert_to_tensor_or_indexed_slices(data, as_ref=True)
   if isinstance(data, ops.Tensor):
-    if data.dtype._is_ref_dtype:   # pylint: disable=protected-access
+    if data.dtype._is_ref_dtype:  # pylint: disable=protected-access
       return ref_next_iteration(data, name=name)
     else:
       return next_iteration(data, name=name)
@@ -234,8 +233,13 @@ def _NextIteration(data, name=None):
       return sparse_tensor.SparseTensor(indices, values, dense_shape)
 
 
-def _Enter(data, frame_name, is_constant=False, parallel_iterations=10,
-           use_ref=True, use_input_shape=True, name=None):
+def _Enter(data,
+           frame_name,
+           is_constant=False,
+           parallel_iterations=10,
+           use_ref=True,
+           use_input_shape=True,
+           name=None):
   """Creates or finds a child frame, and makes `data` available to it.
 
   The unique `frame_name` is used by the `Executor` to identify frames. If
@@ -257,35 +261,51 @@ def _Enter(data, frame_name, is_constant=False, parallel_iterations=10,
   data = ops.internal_convert_to_tensor_or_indexed_slices(data, as_ref=True)
   if isinstance(data, ops.Tensor):
     if data.dtype._is_ref_dtype and use_ref:  # pylint: disable=protected-access
-      result = ref_enter(data, frame_name, is_constant, parallel_iterations,
-                         name=name)
+      result = ref_enter(
+          data, frame_name, is_constant, parallel_iterations, name=name)
     else:
-      result = enter(data, frame_name, is_constant, parallel_iterations,
-                     name=name)
+      result = enter(
+          data, frame_name, is_constant, parallel_iterations, name=name)
     if use_input_shape:
       result.set_shape(data.get_shape())
     return result
   else:
     if not isinstance(data, (ops.IndexedSlices, sparse_tensor.SparseTensor)):
       raise TypeError("Type %s not supported" % type(data))
-    values = _Enter(data.values, frame_name, is_constant,
-                    parallel_iterations=parallel_iterations,
-                    use_input_shape=use_input_shape, name=name)
-    indices = enter(data.indices, frame_name, is_constant,
-                    parallel_iterations, name="indices")
+    values = _Enter(
+        data.values,
+        frame_name,
+        is_constant,
+        parallel_iterations=parallel_iterations,
+        use_input_shape=use_input_shape,
+        name=name)
+    indices = enter(
+        data.indices,
+        frame_name,
+        is_constant,
+        parallel_iterations,
+        name="indices")
     if use_input_shape:
       indices.set_shape(data.indices.get_shape())
     if isinstance(data, ops.IndexedSlices):
       dense_shape = data.dense_shape
       if dense_shape is not None:
-        dense_shape = enter(dense_shape, frame_name, is_constant,
-                            parallel_iterations, name="dense_shape")
+        dense_shape = enter(
+            dense_shape,
+            frame_name,
+            is_constant,
+            parallel_iterations,
+            name="dense_shape")
         if use_input_shape:
           dense_shape.set_shape(data.dense_shape.get_shape())
       return ops.IndexedSlices(values, indices, dense_shape)
     else:
-      dense_shape = enter(data.dense_shape, frame_name, is_constant,
-                          parallel_iterations, name="dense_shape")
+      dense_shape = enter(
+          data.dense_shape,
+          frame_name,
+          is_constant,
+          parallel_iterations,
+          name="dense_shape")
       if use_input_shape:
         dense_shape.set_shape(data.dense_shape.get_shape())
       return sparse_tensor.SparseTensor(indices, values, dense_shape)
@@ -444,8 +464,10 @@ def merge(inputs, name=None):
   if any([inp is None for inp in inputs]):
     raise ValueError("At least one of the merge inputs is None: %s" % inputs)
   with ops.name_scope(name, "Merge", inputs) as name:
-    inputs = [ops.internal_convert_to_tensor_or_indexed_slices(inp, as_ref=True)
-              for inp in inputs]
+    inputs = [
+        ops.internal_convert_to_tensor_or_indexed_slices(inp, as_ref=True)
+        for inp in inputs
+    ]
     if all([isinstance(v, ops.Tensor) for v in inputs]):
       if all([v.dtype._is_ref_dtype for v in inputs]):  # pylint: disable=protected-access
         return gen_control_flow_ops._ref_merge(inputs, name)
@@ -475,6 +497,8 @@ def merge(inputs, name=None):
       else:
         dense_shape = None
       return ops.IndexedSlices(values, indices, dense_shape), chosen_index
+
+
 # pylint: enable=protected-access
 
 
@@ -488,7 +512,9 @@ def _convert_tensorarray_to_flow(tensor_or_tensor_array):
 def _make_tensor_array(ta, t_or_flow):
   # pylint: disable=protected-access
   new_ta = tensor_array_ops.TensorArray(
-      dtype=ta.dtype, handle=ta.handle, flow=t_or_flow,
+      dtype=ta.dtype,
+      handle=ta.handle,
+      flow=t_or_flow,
       infer_shape=ta._infer_shape,
       colocate_with_first_write_call=ta._colocate_with_first_write_call)
   new_ta._colocate_with = ta._colocate_with
@@ -500,13 +526,13 @@ def _make_tensor_array(ta, t_or_flow):
 def _convert_flows_to_tensorarrays(tensors_or_tensorarrays, tensors_or_flows):
   if len(tensors_or_tensorarrays) != len(tensors_or_flows):
     raise ValueError(
-        "Lengths of original Tensor list and new list do not match: %d vs. %d"
-        % (len(tensors_or_tensorarrays), len(tensors_or_flows)))
+        "Lengths of original Tensor list and new list do not match: %d vs. %d" %
+        (len(tensors_or_tensorarrays), len(tensors_or_flows)))
   return [
       _make_tensor_array(ta, t_or_flow)
-      if isinstance(ta, tensor_array_ops.TensorArray)
-      else t_or_flow
-      for (ta, t_or_flow) in zip(tensors_or_tensorarrays, tensors_or_flows)]
+      if isinstance(ta, tensor_array_ops.TensorArray) else t_or_flow
+      for (ta, t_or_flow) in zip(tensors_or_tensorarrays, tensors_or_flows)
+  ]
 
 
 def _ShapeLessThanOrEqual(shape1, shape2):
@@ -545,8 +571,8 @@ def _SetShapeInvariants(input_vars, enter_vars, shapes):
         raise ValueError(
             "The shape invariant specified for %s is not compatible with "
             "the initial shape of the loop variable. It enters the loop "
-            "with shape %s, but the specified shape invariant is %s."
-            % (inp.name, inp.get_shape(), shape))
+            "with shape %s, but the specified shape invariant is %s." %
+            (inp.name, inp.get_shape(), shape))
       var.set_shape(shape)
     else:
       if not isinstance(var, (ops.IndexedSlices, sparse_tensor.SparseTensor)):
@@ -557,8 +583,8 @@ def _SetShapeInvariants(input_vars, enter_vars, shapes):
               "The shape invariant specified for %s is not compatible with "
               "the initial shape of the values tensor of this IndexedSlices. "
               "It enters the loop with shape %s, but the specified shape "
-              "invariant is %s."
-              % (inp.values.name, inp.values.get_shape(), shape))
+              "invariant is %s." % (inp.values.name, inp.values.get_shape(),
+                                    shape))
         var.values.set_shape(shape)
         var.indices.set_shape(tensor_shape.TensorShape([shape[0]]))
         if var.dense_shape is not None:
@@ -569,8 +595,8 @@ def _SetShapeInvariants(input_vars, enter_vars, shapes):
               "The shape invariant specified for %s is not compatible with "
               "the initial shape of the shape tensor of this SparseTensor. "
               "It enters the loop with shape %s, but the specified shape "
-              "invariant is %s."
-              % (inp.dense_shape.name, inp.dense_shape.get_shape(), shape))
+              "invariant is %s." % (inp.dense_shape.name,
+                                    inp.dense_shape.get_shape(), shape))
         var.values.set_shape(tensor_shape.TensorShape([None]))
         var.indices.set_shape(tensor_shape.TensorShape([None, shape.ndims]))
         var.dense_shape.set_shape(shape)
@@ -599,8 +625,8 @@ def _EnforceShapeInvariant(merge_var, next_var):
           "The shape for %s is not an invariant for the loop. It enters "
           "the loop with shape %s, but has shape %s after one iteration. "
           "Provide shape invariants using either the `shape_invariants` "
-          "argument of tf.while_loop or set_shape() on the loop variables."
-          % (merge_var.name, m_shape, n_shape))
+          "argument of tf.while_loop or set_shape() on the loop variables." %
+          (merge_var.name, m_shape, n_shape))
   else:
     if not isinstance(var, (ops.IndexedSlices, sparse_tensor.SparseTensor)):
       raise TypeError("Type %s not supported" % type(var))
@@ -623,9 +649,9 @@ def _EnforceShapeInvariant(merge_var, next_var):
               "the loop with shape (%s, %s, %s), but has shape (%s, %s, %s) "
               "after one iteration. Provide shape invariants using either the "
               "`shape_invariants` argument of tf.while_loop or set_shape() "
-              "on the loop variables."
-              % (merge_var.name, m_values_shape, m_indices_shape, m_shape_shape,
-                 n_values_shape, n_indices_shape, n_shape_shape))
+              "on the loop variables." %
+              (merge_var.name, m_values_shape, m_indices_shape, m_shape_shape,
+               n_values_shape, n_indices_shape, n_shape_shape))
     else:
       m_values_shape = merge_var.values.get_shape()
       m_indices_shape = merge_var.indices.get_shape()
@@ -637,12 +663,12 @@ def _EnforceShapeInvariant(merge_var, next_var):
           not _ShapeLessThanOrEqual(n_indices_shape, m_indices_shape) or
           not _ShapeLessThanOrEqual(n_shape_shape, m_shape_shape)):
         raise ValueError(
-          "The shape for %s is not an invariant for the loop. It enters "
-          "the loop with shape (%s, %s, %s), but has shape (%s, %s, %s) "
-          "after one iteration. Provide shape invariants using either "
-          "the `shape_invariants` argument of tf.while_loop or set_shape() "
-          "on the loop variables."
-          % (merge_var.name, m_values_shape, m_indices_shape, m_shape_shape,
+            "The shape for %s is not an invariant for the loop. It enters "
+            "the loop with shape (%s, %s, %s), but has shape (%s, %s, %s) "
+            "after one iteration. Provide shape invariants using either "
+            "the `shape_invariants` argument of tf.while_loop or set_shape() "
+            "on the loop variables." %
+            (merge_var.name, m_values_shape, m_indices_shape, m_shape_shape,
              n_values_shape, n_indices_shape, n_shape_shape))
 
 
@@ -657,7 +683,7 @@ def _AddNextAndBackEdge(m, v, enforce_shape_invariant=True):
       # the types don't match.
       # TODO(skyewm): call this for other cases below (needs testing)
       _EnforceShapeInvariant(m, v)
-    m.op._update_input(1, v)   # pylint: disable=protected-access
+    m.op._update_input(1, v)  # pylint: disable=protected-access
   elif isinstance(m, ops.IndexedSlices):
     # pylint: disable=protected-access
     v = math_ops._as_indexed_slices(v, optimize=False)
@@ -720,8 +746,7 @@ def GetMaxSizeFromNestedMaximumIterations(value, while_ctxt):
       raise ValueError(
           "Cannot create a gradient accumulator for tensor '%s' inside "
           "XLA while_loop because maximum_iterations was not passed to "
-          "the tf.while_loop call ('%s')."
-          % (value_name, while_ctxt.name))
+          "the tf.while_loop call ('%s')." % (value_name, while_ctxt.name))
 
     # pylint: disable=protected-access
     max_iter_ctxt = max_iter.op._get_control_flow_context()
@@ -742,9 +767,9 @@ def GetMaxSizeFromNestedMaximumIterations(value, while_ctxt):
             "while_loop. maximum_iterations tensor '%s' for while_loop context "
             "'%s' must be statically known (e.g. a constant value or known "
             "shape dimension), or be defined at or outside the while loop "
-            "context '%s' (currently defined in '%s')." % (
-                value_name, max_iter.name, while_ctxt.name,
-                curr_ctxt_name, max_iter_ctxt.name))
+            "context '%s' (currently defined in '%s')." %
+            (value_name, max_iter.name, while_ctxt.name, curr_ctxt_name,
+             max_iter_ctxt.name))
       max_size *= const_max_iter
 
     # Find the next outer WhileContext (or stop if we reach the
@@ -808,9 +833,11 @@ class GradLoopState(object):
       outer_forward_ctxt = forward_ctxt.outer_context
 
     # Add the forward loop counter.
-    if outer_forward_ctxt: outer_forward_ctxt.Enter()
+    if outer_forward_ctxt:
+      outer_forward_ctxt.Enter()
     cnt, forward_index = forward_ctxt.AddForwardLoopCounter(outer_grad_state)
-    if outer_forward_ctxt: outer_forward_ctxt.Exit()
+    if outer_forward_ctxt:
+      outer_forward_ctxt.Exit()
     self._forward_context = forward_ctxt
     self._forward_index = forward_index
 
@@ -835,7 +862,8 @@ class GradLoopState(object):
           real_cnt, outer_grad_state)
       outer_grad_ctxt.Exit()
     else:
-      if outer_forward_ctxt: outer_forward_ctxt.Enter()
+      if outer_forward_ctxt:
+        outer_forward_ctxt.Enter()
       self._grad_context = WhileContext(
           maximum_iterations=forward_ctxt.maximum_iterations,
           parallel_iterations=forward_ctxt.parallel_iterations,
@@ -845,7 +873,8 @@ class GradLoopState(object):
           grad_state=self)
       self._grad_index = self._grad_context.AddBackpropLoopCounter(
           cnt, outer_grad_state)
-      if outer_forward_ctxt: outer_forward_ctxt.Exit()
+      if outer_forward_ctxt:
+        outer_forward_ctxt.Exit()
 
   @property
   def outer_grad_state(self):
@@ -973,7 +1002,8 @@ class GradLoopState(object):
     # curr_ctxt is the context that tf.gradients was called in.
     curr_ctxt = ops.get_default_graph()._get_control_flow_context()  # pylint: disable=protected-access
     with ops.control_dependencies(None):
-      if curr_ctxt: curr_ctxt.Enter()
+      if curr_ctxt:
+        curr_ctxt.Enter()
       with ops.colocate_with(value):
         # We only need to pass maximum_iterations to the stack if
         # we're inside an XLA context.
@@ -984,11 +1014,10 @@ class GradLoopState(object):
               value, self.forward_context)
         # pylint: disable=protected-access
         acc = gen_data_flow_ops._stack_v2(
-            max_size=max_size,
-            elem_type=value.dtype.base_dtype,
-            name="f_acc")
+            max_size=max_size, elem_type=value.dtype.base_dtype, name="f_acc")
         # pylint: enable=protected-access
-      if curr_ctxt: curr_ctxt.Exit()
+      if curr_ctxt:
+        curr_ctxt.Exit()
 
       # Make acc available in the forward context.
       enter_acc = self.forward_context.AddValue(acc)
@@ -1009,8 +1038,7 @@ class GradLoopState(object):
       else:
         # value is in a cond context within the forward context.
         if not isinstance(value_ctxt, CondContext):
-          raise TypeError(
-              "value_ctxt is not a CondContext: %s" % value_ctxt)
+          raise TypeError("value_ctxt is not a CondContext: %s" % value_ctxt)
         if dead_branch:
           # The special case for creating a zero tensor for a dead
           # branch of a switch. See ControlFlowState.ZerosLike().
@@ -1134,8 +1162,8 @@ class GradLoopState(object):
 
       if real_value is None:
         # Add the stack pop op in the grad context.
-        real_value = cur_grad_state.AddBackpropAccumulatedValue(history_value,
-                                                                cur_value)
+        real_value = cur_grad_state.AddBackpropAccumulatedValue(
+            history_value, cur_value)
         if cur_grad_state != self:
           real_value = self._grad_context.AddValue(real_value)
       self._history_map[value.name] = real_value
@@ -1154,7 +1182,7 @@ class ControlFlowState(object):
   """Maintain the mapping from the loops to their grad states."""
 
   def __init__(self):
-    self._map = {}   # maps forward loop context to GradLoopState
+    self._map = {}  # maps forward loop context to GradLoopState
 
   def GetGradState(self, op, before):
     """Return the grad state for this op if it's in a forward loop context."""
@@ -1318,7 +1346,8 @@ class ControlFlowState(object):
     Returns:
       A zero tensor of the same shape of op.outputs[index].
     """
-    if util.IsLoopSwitch(op): return None
+    if util.IsLoopSwitch(op):
+      return None
     dead_branch = util.IsSwitch(op)
     forward_ctxt = _GetWhileContext(op)
     grad_state = self._map.get(forward_ctxt)
@@ -1361,8 +1390,8 @@ class ControlFlowState(object):
       grad_state.grad_context.Enter()
 
       # Create a zero tensor with the right shape.
-      shape = grad_state.AddBackpropAccumulatedValue(
-          history_zeros_shape, zeros_shape, dead_branch)
+      shape = grad_state.AddBackpropAccumulatedValue(history_zeros_shape,
+                                                     zeros_shape, dead_branch)
       result = array_ops.zeros(shape, val.dtype)
     return result
 
@@ -1393,12 +1422,14 @@ class ControlFlowState(object):
           else:
             # Create a zeros in the outer grad context.
             outer_grad_ctxt = grad_state.grad_context.outer_context
-            if outer_grad_ctxt: outer_grad_ctxt.Enter()
+            if outer_grad_ctxt:
+              outer_grad_ctxt.Enter()
             enter_grad_op = b_merge.op.inputs[0].op
             enter_grad = enter_grad_op.inputs[0]
             grad_shape = array_ops.shape_internal(enter_grad, optimize=False)
             grad_val = array_ops.zeros(grad_shape)
-            if outer_grad_ctxt: outer_grad_ctxt.Exit()
+            if outer_grad_ctxt:
+              outer_grad_ctxt.Exit()
             # Use the zeros for iterations > 0.
             grad_state.grad_context.Enter()
             next_grad_val = _NextIteration(grad_val)
@@ -1470,8 +1501,7 @@ class ControlFlowContext(object):
     self._outer_context = ops.get_default_graph()._get_control_flow_context()
     self._context_stack = []
     if values_def:
-      self._init_values_from_proto(values_def,
-                                   import_scope=import_scope)
+      self._init_values_from_proto(values_def, import_scope=import_scope)
     else:
       # Values that have been already seen in this context.
       self._values = set()
@@ -1532,19 +1562,16 @@ class ControlFlowContext(object):
     """
     values_def = control_flow_pb2.ValuesDef()
     values_def.values.extend(
-        [ops.strip_name_scope(v, export_scope)
-         for v in sorted(self._values)])
+        [ops.strip_name_scope(v, export_scope) for v in sorted(self._values)])
     for k, v in self._external_values.items():
       k = ops.strip_name_scope(k, export_scope)
-      values_def.external_values[k] = ops.strip_name_scope(
-          v.name, export_scope)
+      values_def.external_values[k] = ops.strip_name_scope(v.name, export_scope)
     return values_def
 
   @staticmethod
   def _from_proto(values_def, import_scope=None):
     """Returns a `ControlFlowContext` created from `values_def`."""
-    return ControlFlowContext(values_def=values_def,
-                              import_scope=import_scope)
+    return ControlFlowContext(values_def=values_def, import_scope=import_scope)
 
   def AddName(self, name):
     self._values.add(name)
@@ -1599,6 +1626,7 @@ class ControlFlowContext(object):
       op._remove_all_control_inputs()
       op._add_control_inputs(internal_control_inputs)
     return internal_control_inputs
+
   # pylint: enable=protected-access
 
   def AddInnerOp(self, op):
@@ -1626,8 +1654,13 @@ class ControlFlowContext(object):
 class CondContext(ControlFlowContext):
   """The context for the conditional construct."""
 
-  def __init__(self, pred=None, pivot=None, branch=None,
-               name="cond_text", context_def=None, import_scope=None):
+  def __init__(self,
+               pred=None,
+               pivot=None,
+               branch=None,
+               name="cond_text",
+               context_def=None,
+               import_scope=None):
     """Creates a `CondContext`.
 
     Args:
@@ -1647,9 +1680,9 @@ class CondContext(ControlFlowContext):
     else:
       # Initializes the default fields.
       ControlFlowContext.__init__(self)
-      self._pred = pred         # The boolean tensor for the cond predicate
-      self._pivot = pivot       # The predicate tensor in this branch
-      self._branch = branch     # 0 or 1 representing this branch
+      self._pred = pred  # The boolean tensor for the cond predicate
+      self._pivot = pivot  # The predicate tensor in this branch
+      self._branch = branch  # 0 or 1 representing this branch
 
       # Values considered to have been already seen in this context.
       self._values.add(pred.name)
@@ -1665,15 +1698,14 @@ class CondContext(ControlFlowContext):
     assert isinstance(context_def, control_flow_pb2.CondContextDef)
     # Create from context_def.
     g = ops.get_default_graph()
-    self._name = ops.prepend_name_scope(
-        context_def.context_name, import_scope)
-    self._pred = g.as_graph_element(ops.prepend_name_scope(
-        context_def.pred_name, import_scope))
-    self._pivot = g.as_graph_element(ops.prepend_name_scope(
-        context_def.pivot_name, import_scope))
+    self._name = ops.prepend_name_scope(context_def.context_name, import_scope)
+    self._pred = g.as_graph_element(
+        ops.prepend_name_scope(context_def.pred_name, import_scope))
+    self._pivot = g.as_graph_element(
+        ops.prepend_name_scope(context_def.pivot_name, import_scope))
     self._branch = context_def.branch
-    super(CondContext, self).__init__(values_def=context_def.values_def,
-                                      import_scope=import_scope)
+    super(CondContext, self).__init__(
+        values_def=context_def.values_def, import_scope=import_scope)
 
   @property
   def pred(self):
@@ -1711,18 +1743,16 @@ class CondContext(ControlFlowContext):
     Returns:
       A `CondContextDef` protocol buffer.
     """
-    if (export_scope is None or
-        self.name.startswith(export_scope)):
+    if (export_scope is None or self.name.startswith(export_scope)):
       context_def = control_flow_pb2.CondContextDef()
-      context_def.context_name = ops.strip_name_scope(
-          self.name, export_scope)
-      context_def.pred_name = ops.strip_name_scope(
-          self._pred.name, export_scope)
-      context_def.pivot_name = ops.strip_name_scope(
-          self._pivot.name, export_scope)
+      context_def.context_name = ops.strip_name_scope(self.name, export_scope)
+      context_def.pred_name = ops.strip_name_scope(self._pred.name,
+                                                   export_scope)
+      context_def.pivot_name = ops.strip_name_scope(self._pivot.name,
+                                                    export_scope)
       context_def.branch = self._branch
-      context_def.values_def.MergeFrom(super(CondContext, self)._to_proto(
-          export_scope))
+      context_def.values_def.MergeFrom(
+          super(CondContext, self)._to_proto(export_scope))
 
       return context_def
     else:
@@ -1731,8 +1761,7 @@ class CondContext(ControlFlowContext):
   @staticmethod
   def from_proto(context_def, import_scope=None):
     """Returns a `CondContext` object created from `context_def`."""
-    return CondContext(context_def=context_def,
-                       import_scope=import_scope)
+    return CondContext(context_def=context_def, import_scope=import_scope)
 
   def AddValue(self, val):
     """Add `val` to the current context and its outer context recursively."""
@@ -1846,8 +1875,8 @@ class CondContext(ControlFlowContext):
         if original_result is None:
           return no_op(), None
         else:
-          original_result = nest.map_structure(
-              array_ops.identity, original_result)
+          original_result = nest.map_structure(array_ops.identity,
+                                               original_result)
     if original_result is None:
       return None, None
 
@@ -1871,11 +1900,15 @@ def _UnpackIfSingleton(res):
 # pylint: disable=g-doc-args
 @tf_export("cond")
 @deprecation.deprecated_args(
-    None,
-    "fn1/fn2 are deprecated in favor of the true_fn/false_fn arguments.",
+    None, "fn1/fn2 are deprecated in favor of the true_fn/false_fn arguments.",
     "fn1", "fn2")
-def cond(pred, true_fn=None, false_fn=None, strict=False, name=None,
-         fn1=None, fn2=None):
+def cond(pred,
+         true_fn=None,
+         false_fn=None,
+         strict=False,
+         name=None,
+         fn1=None,
+         fn2=None):
   """Return `true_fn()` if the predicate `pred` is true else `false_fn()`.
 
   `true_fn` and `false_fn` both return lists of output tensors. `true_fn` and
@@ -2044,6 +2077,8 @@ def cond(pred, true_fn=None, false_fn=None, strict=False, name=None,
     if not strict:
       merges = _UnpackIfSingleton(merges)
     return merges
+
+
 # pylint: enable=g-doc-args
 # pylint: enable=redefined-outer-name
 
@@ -2139,8 +2174,7 @@ class WhileContext(ControlFlowContext):
     assert isinstance(context_def, control_flow_pb2.WhileContextDef)
     # Create from context_def.
     g = ops.get_default_graph()
-    self._name = ops.prepend_name_scope(
-        context_def.context_name, import_scope)
+    self._name = ops.prepend_name_scope(context_def.context_name, import_scope)
     if context_def.maximum_iterations_name:
       self._maximum_iterations = g.as_graph_element(
           ops.prepend_name_scope(context_def.maximum_iterations_name,
@@ -2150,25 +2184,27 @@ class WhileContext(ControlFlowContext):
     self._parallel_iterations = context_def.parallel_iterations
     self._back_prop = context_def.back_prop
     self._swap_memory = context_def.swap_memory
-    self._pivot_for_pred = g.as_graph_element(ops.prepend_name_scope(
-        context_def.pivot_for_pred_name, import_scope))
+    self._pivot_for_pred = g.as_graph_element(
+        ops.prepend_name_scope(context_def.pivot_for_pred_name, import_scope))
     # We use this node to control constants created by the body lambda.
-    self._pivot_for_body = g.as_graph_element(ops.prepend_name_scope(
-        context_def.pivot_for_body_name, import_scope))
+    self._pivot_for_body = g.as_graph_element(
+        ops.prepend_name_scope(context_def.pivot_for_body_name, import_scope))
     # The boolean tensor for loop termination condition. Used in code
     # generation for gradient computation.
     self._pivot = g.as_graph_element(
         ops.prepend_name_scope(context_def.pivot_name, import_scope))
     # The list of exit tensors for loop variables.
-    self._loop_exits = [g.as_graph_element(
-        ops.prepend_name_scope(exit_name, import_scope))
-                        for exit_name in context_def.loop_exit_names]
+    self._loop_exits = [
+        g.as_graph_element(ops.prepend_name_scope(exit_name, import_scope))
+        for exit_name in context_def.loop_exit_names
+    ]
     # The list of enter tensors for loop variables.
-    self._loop_enters = [g.as_graph_element(
-        ops.prepend_name_scope(enter_name, import_scope))
-                         for enter_name in context_def.loop_enter_names]
-    super(WhileContext, self).__init__(values_def=context_def.values_def,
-                                       import_scope=import_scope)
+    self._loop_enters = [
+        g.as_graph_element(ops.prepend_name_scope(enter_name, import_scope))
+        for enter_name in context_def.loop_enter_names
+    ]
+    super(WhileContext, self).__init__(
+        values_def=context_def.values_def, import_scope=import_scope)
 
   @property
   def maximum_iterations(self):
@@ -2219,11 +2255,9 @@ class WhileContext(ControlFlowContext):
     Returns:
       A `WhileContextDef` protocol buffer.
     """
-    if (export_scope is None or
-        self.name.startswith(export_scope)):
+    if (export_scope is None or self.name.startswith(export_scope)):
       context_def = control_flow_pb2.WhileContextDef()
-      context_def.context_name = ops.strip_name_scope(
-          self.name, export_scope)
+      context_def.context_name = ops.strip_name_scope(self.name, export_scope)
       context_def.parallel_iterations = self._parallel_iterations
       if self._maximum_iterations is not None:
         context_def.maximum_iterations_name = ops.strip_name_scope(
@@ -2234,17 +2268,16 @@ class WhileContext(ControlFlowContext):
           self._pivot_for_pred.name, export_scope)
       context_def.pivot_for_body_name = ops.strip_name_scope(
           self._pivot_for_body.name, export_scope)
-      context_def.pivot_name = ops.strip_name_scope(
-          self._pivot.name, export_scope)
-      context_def.loop_exit_names.extend(
-          [ops.strip_name_scope(l.name, export_scope)
-           for l in self._loop_exits])
-      context_def.loop_enter_names.extend(
-          [ops.strip_name_scope(l.name, export_scope)
-           for l in self._loop_enters])
+      context_def.pivot_name = ops.strip_name_scope(self._pivot.name,
+                                                    export_scope)
+      context_def.loop_exit_names.extend([
+          ops.strip_name_scope(l.name, export_scope) for l in self._loop_exits
+      ])
+      context_def.loop_enter_names.extend([
+          ops.strip_name_scope(l.name, export_scope) for l in self._loop_enters
+      ])
       context_def.values_def.MergeFrom(
-          super(WhileContext, self)._to_proto(
-              export_scope=export_scope))
+          super(WhileContext, self)._to_proto(export_scope=export_scope))
 
       return context_def
     else:
@@ -2261,8 +2294,7 @@ class WhileContext(ControlFlowContext):
     Returns:
       A `WhileContext` Python object.
     """
-    return WhileContext(context_def=context_def,
-                        import_scope=import_scope)
+    return WhileContext(context_def=context_def, import_scope=import_scope)
 
   def GetWhileContext(self):
     return self
@@ -2299,8 +2331,11 @@ class WhileContext(ControlFlowContext):
         result = self._outer_context.AddValue(val)
       # Create an Enter to make `result` known to this loop context.
       with ops.control_dependencies(None):
-        enter = _Enter(result, self._name, is_constant=True,
-                       parallel_iterations=self._parallel_iterations)
+        enter = _Enter(
+            result,
+            self._name,
+            is_constant=True,
+            parallel_iterations=self._parallel_iterations)
         enter.graph.prevent_feeding(enter)
         if self._outer_context:
           self._outer_context.AddInnerOp(enter.op)
@@ -2378,6 +2413,7 @@ class WhileContext(ControlFlowContext):
 
   def _MaybeAddControlDependency(self, op):
     """Add a control input to the op if it only depends on loop invariants."""
+
     def _IsOpFree(op):
       """Determines if `op` needs a control dependency."""
       if op.control_inputs:
@@ -2390,6 +2426,7 @@ class WhileContext(ControlFlowContext):
         if not util.IsLoopConstantEnter(x.op):
           return False
       return True
+
     if _IsOpFree(op):
       # pylint: disable=protected-access
       op._add_control_input(self.GetControlPivot().op)
@@ -2423,9 +2460,12 @@ class WhileContext(ControlFlowContext):
 
     self.Enter()
     self.AddName(n.name)
-    enter_n = _Enter(n, self._name, is_constant=False,
-                     parallel_iterations=self._parallel_iterations,
-                     name="f_count")
+    enter_n = _Enter(
+        n,
+        self._name,
+        is_constant=False,
+        parallel_iterations=self._parallel_iterations,
+        name="f_count")
     self.loop_enters.append(enter_n)
 
     merge_n = merge([enter_n, enter_n])[0]
@@ -2465,9 +2505,12 @@ class WhileContext(ControlFlowContext):
 
     self.Enter()
     self.AddName(count.name)
-    enter_count = _Enter(count, self._name, is_constant=False,
-                         parallel_iterations=self._parallel_iterations,
-                         name="b_count")
+    enter_count = _Enter(
+        count,
+        self._name,
+        is_constant=False,
+        parallel_iterations=self._parallel_iterations,
+        name="b_count")
     self.loop_enters.append(enter_count)
 
     merge_count = merge([enter_count, enter_count])[0]
@@ -2525,9 +2568,11 @@ class WhileContext(ControlFlowContext):
     # without running any iterations.
     shape = grad.get_shape()
     if shape.is_fully_defined():
-      if self.outer_context: self.outer_context.Enter()
+      if self.outer_context:
+        self.outer_context.Enter()
       acc = constant_op.constant(0, grad.dtype, shape=shape, name="b_acc")
-      if self.outer_context: self.outer_context.Exit()
+      if self.outer_context:
+        self.outer_context.Exit()
     else:
       value = op.inputs[0]
       if (isinstance(self.outer_context, WhileContext) and
@@ -2546,16 +2591,21 @@ class WhileContext(ControlFlowContext):
         acc = array_ops.zeros(real_shape, grad.dtype)
         self.outer_context.Exit()
       else:
-        if self.outer_context: self.outer_context.Enter()
+        if self.outer_context:
+          self.outer_context.Enter()
         zeros_shape = array_ops.shape_internal(value, optimize=False)
         acc = array_ops.zeros(zeros_shape, grad.dtype)
-        if self.outer_context: self.outer_context.Exit()
+        if self.outer_context:
+          self.outer_context.Exit()
 
     self.Enter()
     self.AddName(acc.name)
-    enter_acc = _Enter(acc, self._name, is_constant=False,
-                       parallel_iterations=self._parallel_iterations,
-                       name="b_acc")
+    enter_acc = _Enter(
+        acc,
+        self._name,
+        is_constant=False,
+        parallel_iterations=self._parallel_iterations,
+        name="b_acc")
     self.loop_enters.append(enter_acc)
 
     merge_acc = merge([enter_acc, enter_acc], name="b_acc")[0]
@@ -2588,14 +2638,17 @@ class WhileContext(ControlFlowContext):
     dense_shape = grad.dense_shape
 
     self.Exit()
-    if self.outer_context: self.outer_context.Enter()
+    if self.outer_context:
+      self.outer_context.Enter()
     if values.get_shape().is_fully_defined():
       values_shape = tensor_shape.TensorShape(
           [tensor_shape.Dimension(1)] + values.get_shape().dims[1:])
-      if self.outer_context: self.outer_context.Enter()
-      values_acc = constant_op.constant(0, values.dtype, shape=values_shape,
-                                        name="b_acc")
-      if self.outer_context: self.outer_context.Exit()
+      if self.outer_context:
+        self.outer_context.Enter()
+      values_acc = constant_op.constant(
+          0, values.dtype, shape=values_shape, name="b_acc")
+      if self.outer_context:
+        self.outer_context.Exit()
     else:
       values_shape = _resource_safe_shape(op.inputs[0])[1:]
       values_shape = array_ops.concat([[1], values_shape], 0)
@@ -2604,16 +2657,19 @@ class WhileContext(ControlFlowContext):
     shape_acc = None
     if dense_shape is not None:
       if dense_shape.get_shape().is_fully_defined():
-        if self.outer_context: self.outer_context.Enter()
-        shape_acc = constant_op.constant(0, dense_shape.dtype,
-                                         shape=dense_shape.get_shape())
-        if self.outer_context: self.outer_context.Exit()
+        if self.outer_context:
+          self.outer_context.Enter()
+        shape_acc = constant_op.constant(
+            0, dense_shape.dtype, shape=dense_shape.get_shape())
+        if self.outer_context:
+          self.outer_context.Exit()
       else:
         shape_acc = array_ops.zeros_like(
             array_ops.shape_internal(op.inputs[0], optimize=False),
             optimize=False)
 
-    if self.outer_context: self.outer_context.Exit()
+    if self.outer_context:
+      self.outer_context.Exit()
 
     self.Enter()
     self.AddName(values_acc.name)
@@ -2626,9 +2682,15 @@ class WhileContext(ControlFlowContext):
     # Set use_input_shape=False since the accumulator tensors will grow in
     # size. If use_input_shape=True, the _update_input call below will result in
     # incompatible shapes.
-    enter_acc = [_Enter(x, self._name, is_constant=False,
-                        parallel_iterations=self._parallel_iterations,
-                        use_input_shape=False, name="b_acc") for x in init_acc]
+    enter_acc = [
+        _Enter(
+            x,
+            self._name,
+            is_constant=False,
+            parallel_iterations=self._parallel_iterations,
+            use_input_shape=False,
+            name="b_acc") for x in init_acc
+    ]
     # Manually set appropriate partial shapes.
     enter_acc[0].set_shape([None])
     if values_acc.shape.dims is not None:
@@ -2645,8 +2707,7 @@ class WhileContext(ControlFlowContext):
     ]
     if shape_acc is not None:
       # For the shape we just keep the maximum
-      acc_indexed_slices.append(
-          math_ops.maximum(dense_shape, switch_acc[2][1]))
+      acc_indexed_slices.append(math_ops.maximum(dense_shape, switch_acc[2][1]))
 
     next_acc = [_NextIteration(x) for x in acc_indexed_slices]
     for xm, xn in zip(merge_acc, next_acc):
@@ -2657,7 +2718,8 @@ class WhileContext(ControlFlowContext):
 
     self.ExitResult(exit_acc)
     return ops.IndexedSlices(
-        indices=exit_acc[0], values=exit_acc[1],
+        indices=exit_acc[0],
+        values=exit_acc[1],
         dense_shape=exit_acc[2] if shape_acc is not None else None)
 
   def _InitializeValues(self, values):
@@ -2690,10 +2752,14 @@ class WhileContext(ControlFlowContext):
     if self._outer_context:
       real_vars = [self._outer_context.AddValue(x) for x in loop_vars]
     with ops.control_dependencies(None):
-      enter_vars = [_Enter(x, self._name, is_constant=False,
-                           parallel_iterations=self._parallel_iterations,
-                           use_input_shape=(shape_invariants is None))
-                    for x in real_vars]
+      enter_vars = [
+          _Enter(
+              x,
+              self._name,
+              is_constant=False,
+              parallel_iterations=self._parallel_iterations,
+              use_input_shape=(shape_invariants is None)) for x in real_vars
+      ]
       for x in enter_vars:
         x.graph.prevent_feeding(x)
         if self._outer_context:
@@ -2754,11 +2820,13 @@ class WhileContext(ControlFlowContext):
       summary_ref = ops.get_collection_ref(ops.GraphKeys._SUMMARY_COLLECTION)  # pylint: disable=protected-access
       summary_ref[:] = pre_summaries
       with ops.control_dependencies(new_summaries):
+
         def map_fn(x):
           # TODO(apassos) figure out how to trigger with tensor arrays as well
           if isinstance(x, tensor_array_ops.TensorArray):
             return x
           return array_ops.identity(x)
+
         body_result = nest.map_structure(map_fn, body_result)
 
     # Compare the structure types of input and output of body.
@@ -2815,8 +2883,7 @@ class WhileContext(ControlFlowContext):
     packed_exit_vars = nest.pack_sequence_as(
         structure=original_body_result,
         flat_sequence=exit_vars_with_tensor_arrays)
-    return (packed_exit_vars[0] if len(exit_vars) == 1
-            else packed_exit_vars)
+    return (packed_exit_vars[0] if len(exit_vars) == 1 else packed_exit_vars)
 
   def _FixControlInputsAndContext(self, enters):
     graph = ops.get_default_graph()
@@ -2834,8 +2901,9 @@ class WhileContext(ControlFlowContext):
       for x in xs:
         inp_op = x.op.inputs[0].op
         control_inputs = graph._control_dependencies_for_inputs([inp_op])
-        outer_control_inputs = [op for op in control_inputs
-                                if self._IsInOuterContext(op)]
+        outer_control_inputs = [
+            op for op in control_inputs if self._IsInOuterContext(op)
+        ]
         x.op._set_control_flow_context(self)
         x.op._add_control_inputs(outer_control_inputs)
         graph._record_op_seen_by_control_dependencies(x.op)
@@ -2847,9 +2915,15 @@ class WhileContext(ControlFlowContext):
 
 # pylint: disable=redefined-outer-name
 @tf_export("while_loop")
-def while_loop(cond, body, loop_vars, shape_invariants=None,
-               parallel_iterations=10, back_prop=True, swap_memory=False,
-               name=None, maximum_iterations=None):
+def while_loop(cond,
+               body,
+               loop_vars,
+               shape_invariants=None,
+               parallel_iterations=10,
+               back_prop=True,
+               swap_memory=False,
+               name=None,
+               maximum_iterations=None):
   """Repeat `body` while the condition `cond` is true.
 
   `cond` is a callable returning a boolean scalar tensor. `body` is a callable
@@ -3024,6 +3098,8 @@ def while_loop(cond, body, loop_vars, shape_invariants=None,
       return result[1]
     else:
       return result
+
+
 # pylint: enable=redefined-outer-name
 
 
@@ -3051,8 +3127,9 @@ def _AsTensorList(x, p):
     if isinstance(v, ops.Tensor):
       l.append(array_ops.identity(v))
     else:
-      l.append(ops.IndexedSlices(array_ops.identity(v.values),
-                                 array_ops.identity(v.indices)))
+      l.append(
+          ops.IndexedSlices(
+              array_ops.identity(v.values), array_ops.identity(v.indices)))
   return l
 
 
@@ -3062,8 +3139,7 @@ def _CheckResults(a, b):
   for x, y in zip(a, b):
     assert x.dtype == y.dtype, (
         "Values returned by a() [%s] and b() [%s] must have "
-        "the same type: %s, %s." %
-        (x.name, y.name, x.dtype.name, y.dtype.name))
+        "the same type: %s, %s." % (x.name, y.name, x.dtype.name, y.dtype.name))
 
 
 def with_dependencies(dependencies, output_tensor, name=None):
@@ -3099,9 +3175,9 @@ def with_dependencies(dependencies, output_tensor, name=None):
         if isinstance(output_tensor, ops.Tensor):
           return _Identity(output_tensor, name=name)
         else:
-          return ops.IndexedSlices(_Identity(output_tensor.values, name=name),
-                                   output_tensor.indices,
-                                   output_tensor.dense_shape)
+          return ops.IndexedSlices(
+              _Identity(output_tensor.values, name=name), output_tensor.indices,
+              output_tensor.dense_shape)
 
 
 def _GroupControlDeps(dev, deps, name=None):
@@ -3173,6 +3249,7 @@ def group(*inputs, **kwargs):
     def device_key(dev):
       """A sort key that allows None to be compared to strings."""
       return "" if dev is None else dev
+
     for dev in sorted(six.iterkeys(ops_on_device), key=device_key):
       deps.append(_GroupControlDeps(dev, ops_on_device[dev]))
 
@@ -3463,12 +3540,14 @@ class XLAControlFlowContext(ControlFlowContext):
     return x
 
 
-ops.register_proto_function(ops.GraphKeys.COND_CONTEXT,
-                            proto_type=control_flow_pb2.CondContextDef,
-                            to_proto=CondContext.to_proto,
-                            from_proto=CondContext.from_proto)
+ops.register_proto_function(
+    ops.GraphKeys.COND_CONTEXT,
+    proto_type=control_flow_pb2.CondContextDef,
+    to_proto=CondContext.to_proto,
+    from_proto=CondContext.from_proto)
 
-ops.register_proto_function(ops.GraphKeys.WHILE_CONTEXT,
-                            proto_type=control_flow_pb2.WhileContextDef,
-                            to_proto=WhileContext.to_proto,
-                            from_proto=WhileContext.from_proto)
+ops.register_proto_function(
+    ops.GraphKeys.WHILE_CONTEXT,
+    proto_type=control_flow_pb2.WhileContextDef,
+    to_proto=WhileContext.to_proto,
+    from_proto=WhileContext.from_proto)
diff --git a/tensorflow/python/ops/data_flow_ops.py b/tensorflow/python/ops/data_flow_ops.py
index 34f0bf7b78a75533cb89ed549afad90f3c066b94..95e45bff066d4b2653e5de67684a6277006345f2 100644
--- a/tensorflow/python/ops/data_flow_ops.py
+++ b/tensorflow/python/ops/data_flow_ops.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #==============================================================================
-
 """Data Flow Operations."""
 # pylint: disable=g-bad-name
 from __future__ import absolute_import
@@ -40,6 +39,7 @@ from tensorflow.python.ops import math_ops
 # pylint: disable=wildcard-import
 from tensorflow.python.ops.gen_data_flow_ops import *
 from tensorflow.python.util.tf_export import tf_export
+
 # pylint: enable=wildcard-import
 
 
@@ -54,17 +54,19 @@ def _as_type_list(dtypes):
     return list(dtypes)
 
 
-def _as_shape_list(shapes, dtypes, unknown_dim_allowed=False,
+def _as_shape_list(shapes,
+                   dtypes,
+                   unknown_dim_allowed=False,
                    unknown_rank_allowed=False):
   """Convert shapes to a list of tuples of int (or None)."""
   del dtypes
   if unknown_dim_allowed:
-    if (not isinstance(shapes, collections.Sequence)
-        or not shapes
-        or any(shape is None or isinstance(shape, int) for shape in shapes)):
+    if (not isinstance(shapes, collections.Sequence) or not shapes or
+        any(shape is None or isinstance(shape, int) for shape in shapes)):
       raise ValueError(
           "When providing partial shapes, a list of shapes must be provided.")
-  if shapes is None: return None
+  if shapes is None:
+    return None
   if isinstance(shapes, tensor_shape.TensorShape):
     shapes = [shapes]
   if not isinstance(shapes, (tuple, list)):
@@ -103,7 +105,8 @@ def _shape_common(s1, s2):
     return tensor_shape.unknown_shape()
   d = [
       d1 if d1 is not None and d1 == d2 else None
-      for (d1, d2) in zip(s1.as_list(), s2.as_list())]
+      for (d1, d2) in zip(s1.as_list(), s2.as_list())
+  ]
   return tensor_shape.TensorShape(d)
 
 
@@ -195,8 +198,7 @@ class QueueBase(object):
       TypeError: When `queues` is not a list of `QueueBase` objects,
         or when the data types of `queues` are not all the same.
     """
-    if ((not queues) or
-        (not isinstance(queues, list)) or
+    if ((not queues) or (not isinstance(queues, list)) or
         (not all(isinstance(x, QueueBase) for x in queues))):
       raise TypeError("A list of queues expected")
 
@@ -210,12 +212,16 @@ class QueueBase(object):
 
     queue_shapes = [q.shapes for q in queues]
     reduced_shapes = [
-        six.moves.reduce(_shape_common, s) for s in zip(*queue_shapes)]
+        six.moves.reduce(_shape_common, s) for s in zip(*queue_shapes)
+    ]
 
     queue_refs = array_ops.stack([x.queue_ref for x in queues])
     selected_queue = array_ops.gather(queue_refs, index)
-    return QueueBase(dtypes=dtypes, shapes=reduced_shapes, names=names,
-                     queue_ref=selected_queue)
+    return QueueBase(
+        dtypes=dtypes,
+        shapes=reduced_shapes,
+        names=names,
+        queue_ref=selected_queue)
 
   @property
   def queue_ref(self):
@@ -282,8 +288,8 @@ class QueueBase(object):
 
     tensors = []
     for i, (val, dtype) in enumerate(zip(vals, self._dtypes)):
-      tensors.append(ops.convert_to_tensor(val, dtype=dtype,
-                                           name="component_%d" % i))
+      tensors.append(
+          ops.convert_to_tensor(val, dtype=dtype, name="component_%d" % i))
 
     return tensors
 
@@ -555,11 +561,13 @@ class QueueBase(object):
       name = "%s_Close" % self._name
     if self._queue_ref.dtype == _dtypes.resource:
       return gen_data_flow_ops._queue_close_v2(
-          self._queue_ref, cancel_pending_enqueues=cancel_pending_enqueues,
+          self._queue_ref,
+          cancel_pending_enqueues=cancel_pending_enqueues,
           name=name)
     else:
       return gen_data_flow_ops._queue_close(
-          self._queue_ref, cancel_pending_enqueues=cancel_pending_enqueues,
+          self._queue_ref,
+          cancel_pending_enqueues=cancel_pending_enqueues,
           name=name)
 
   def is_closed(self, name=None):
@@ -577,9 +585,9 @@ class QueueBase(object):
     if name is None:
       name = "%s_Is_Closed" % self._name
     if self._queue_ref.dtype == _dtypes.resource:
-      return gen_data_flow_ops.queue_is_closed_v2(self._queue_ref,name=name)
+      return gen_data_flow_ops.queue_is_closed_v2(self._queue_ref, name=name)
     else:
-      return gen_data_flow_ops.queue_is_closed_(self._queue_ref,name=name)
+      return gen_data_flow_ops.queue_is_closed_(self._queue_ref, name=name)
 
   def size(self, name=None):
     """Compute the number of elements in this queue.
@@ -611,8 +619,14 @@ class RandomShuffleQueue(QueueBase):
   @end_compatibility
   """
 
-  def __init__(self, capacity, min_after_dequeue, dtypes, shapes=None,
-               names=None, seed=None, shared_name=None,
+  def __init__(self,
+               capacity,
+               min_after_dequeue,
+               dtypes,
+               shapes=None,
+               names=None,
+               seed=None,
+               shared_name=None,
                name="random_shuffle_queue"):
     """Create a queue that dequeues elements in a random order.
 
@@ -670,9 +684,14 @@ class RandomShuffleQueue(QueueBase):
       string = (str(seed1) + shared_name).encode("utf-8")
       seed2 = int(hashlib.md5(string).hexdigest()[:8], 16) & 0x7FFFFFFF
     queue_ref = gen_data_flow_ops._random_shuffle_queue_v2(
-        component_types=dtypes, shapes=shapes, capacity=capacity,
-        min_after_dequeue=min_after_dequeue, seed=seed1, seed2=seed2,
-        shared_name=shared_name, name=name)
+        component_types=dtypes,
+        shapes=shapes,
+        capacity=capacity,
+        min_after_dequeue=min_after_dequeue,
+        seed=seed1,
+        seed2=seed2,
+        shared_name=shared_name,
+        name=name)
 
     super(RandomShuffleQueue, self).__init__(dtypes, shapes, names, queue_ref)
 
@@ -690,8 +709,13 @@ class FIFOQueue(QueueBase):
   @end_compatibility
   """
 
-  def __init__(self, capacity, dtypes, shapes=None, names=None,
-               shared_name=None, name="fifo_queue"):
+  def __init__(self,
+               capacity,
+               dtypes,
+               shapes=None,
+               names=None,
+               shared_name=None,
+               name="fifo_queue"):
     """Creates a queue that dequeues elements in a first-in first-out order.
 
     A `FIFOQueue` has bounded capacity; supports multiple concurrent
@@ -725,8 +749,11 @@ class FIFOQueue(QueueBase):
     shapes = _as_shape_list(shapes, dtypes)
     names = _as_name_list(names, dtypes)
     queue_ref = gen_data_flow_ops._fifo_queue_v2(
-        component_types=dtypes, shapes=shapes, capacity=capacity,
-        shared_name=shared_name, name=name)
+        component_types=dtypes,
+        shapes=shapes,
+        capacity=capacity,
+        shared_name=shared_name,
+        name=name)
 
     super(FIFOQueue, self).__init__(dtypes, shapes, names, queue_ref)
 
@@ -747,7 +774,12 @@ class PaddingFIFOQueue(QueueBase):
   @end_compatibility
   """
 
-  def __init__(self, capacity, dtypes, shapes, names=None, shared_name=None,
+  def __init__(self,
+               capacity,
+               dtypes,
+               shapes,
+               names=None,
+               shared_name=None,
                name="padding_fifo_queue"):
     """Creates a queue that dequeues elements in a first-in first-out order.
 
@@ -792,12 +824,15 @@ class PaddingFIFOQueue(QueueBase):
     names = _as_name_list(names, dtypes)
     if len(dtypes) != len(shapes):
       raise ValueError("Shapes must be provided for all components, "
-                       "but received %d dtypes and %d shapes."
-                       % (len(dtypes), len(shapes)))
+                       "but received %d dtypes and %d shapes." % (len(dtypes),
+                                                                  len(shapes)))
 
     queue_ref = gen_data_flow_ops._padding_fifo_queue_v2(
-        component_types=dtypes, shapes=shapes, capacity=capacity,
-        shared_name=shared_name, name=name)
+        component_types=dtypes,
+        shapes=shapes,
+        capacity=capacity,
+        shared_name=shared_name,
+        name=name)
 
     super(PaddingFIFOQueue, self).__init__(dtypes, shapes, names, queue_ref)
 
@@ -815,7 +850,12 @@ class PriorityQueue(QueueBase):
   @end_compatibility
   """
 
-  def __init__(self, capacity, types, shapes=None, names=None, shared_name=None,
+  def __init__(self,
+               capacity,
+               types,
+               shapes=None,
+               names=None,
+               shared_name=None,
                name="priority_queue"):
     """Creates a queue that dequeues elements in a first-in first-out order.
 
@@ -856,14 +896,17 @@ class PriorityQueue(QueueBase):
     shapes = _as_shape_list(shapes, types)
 
     queue_ref = gen_data_flow_ops._priority_queue_v2(
-        component_types=types, shapes=shapes, capacity=capacity,
-        shared_name=shared_name, name=name)
+        component_types=types,
+        shapes=shapes,
+        capacity=capacity,
+        shared_name=shared_name,
+        name=name)
 
     priority_dtypes = [_dtypes.int64] + types
     priority_shapes = [()] + shapes if shapes else shapes
 
-    super(PriorityQueue, self).__init__(
-        priority_dtypes, priority_shapes, names, queue_ref)
+    super(PriorityQueue, self).__init__(priority_dtypes, priority_shapes, names,
+                                        queue_ref)
 
 
 # TODO(josh11b): class BatchQueue(QueueBase):
@@ -943,8 +986,10 @@ class Barrier(object):
       self._shapes = [tensor_shape.unknown_shape() for _ in self._types]
 
     self._barrier_ref = gen_data_flow_ops._barrier(
-        component_types=self._types, shapes=self._shapes,
-        shared_name=shared_name, name=name)
+        component_types=self._types,
+        shapes=self._shapes,
+        shared_name=shared_name,
+        name=name)
     if context.in_graph_mode():
       self._name = self._barrier_ref.op.name.split("/")[-1]
     else:
@@ -1028,12 +1073,13 @@ class Barrier(object):
     """
     if name is None:
       name = "%s_BarrierTakeMany" % self._name
-    ret = gen_data_flow_ops._barrier_take_many(self._barrier_ref,
-                                               num_elements,
-                                               self._types,
-                                               allow_small_batch,
-                                               timeout,
-                                               name=name)
+    ret = gen_data_flow_ops._barrier_take_many(
+        self._barrier_ref,
+        num_elements,
+        self._types,
+        allow_small_batch,
+        timeout,
+        name=name)
 
     # NOTE(mrry): Not using a shape function because we need access to
     # the Barrier object.
@@ -1048,8 +1094,7 @@ class Barrier(object):
       op.outputs[1].set_shape(tensor_shape.vector(batch_dim))  # keys
       for output, shape in zip(op.outputs[2:], self._shapes):  # value_list
         output.set_shape(
-            tensor_shape.TensorShape([batch_dim]).concatenate(
-                shape))
+            tensor_shape.TensorShape([batch_dim]).concatenate(shape))
 
     return ret
 
@@ -1298,8 +1343,8 @@ class SparseConditionalAccumulator(ConditionalAccumulatorBase):
                name="sparse_conditional_accumulator"):
     accumulator_ref = gen_data_flow_ops.sparse_conditional_accumulator(
         dtype=dtype, shape=shape, shared_name=shared_name, name=name)
-    super(SparseConditionalAccumulator,
-          self).__init__(dtype, shape, accumulator_ref)
+    super(SparseConditionalAccumulator, self).__init__(dtype, shape,
+                                                       accumulator_ref)
 
   def apply_indexed_slices_grad(self, grad, local_step=0, name=None):
     """Attempts to apply a gradient to the accumulator.
@@ -1368,8 +1413,8 @@ class SparseConditionalAccumulator(ConditionalAccumulatorBase):
         local_step=local_step,
         gradient_indices=math_ops.to_int64(grad_indices),
         gradient_values=grad_values,
-        gradient_shape=math_ops.to_int64([] if grad_shape is None else
-                                         grad_shape),
+        gradient_shape=math_ops.to_int64([]
+                                         if grad_shape is None else grad_shape),
         has_known_shape=(grad_shape is not None),
         name=name)
 
@@ -1431,11 +1476,16 @@ class BaseStagingArea(object):
   _identifier = 0
   _lock = threading.Lock()
 
-  def __init__(self, dtypes, shapes=None, names=None, shared_name=None,
-                  capacity=0, memory_limit=0):
+  def __init__(self,
+               dtypes,
+               shapes=None,
+               names=None,
+               shared_name=None,
+               capacity=0,
+               memory_limit=0):
     if shared_name is None:
-      self._name = (ops.get_default_graph()
-                       .unique_name(self.__class__.__name__))
+      self._name = (
+          ops.get_default_graph().unique_name(self.__class__.__name__))
     elif isinstance(shared_name, six.string_types):
       self._name = shared_name
     else:
@@ -1532,8 +1582,9 @@ class BaseStagingArea(object):
                          (sorted(vals.keys()), sorted(self._names)))
       # The order of values in `self._names` indicates the order in which the
       # tensors in the dictionary `vals` must be listed.
-      vals, indices, n = zip(*[(vals[k], i, k) for i, k in enumerate(self._names)
-                                                  if k in vals])
+      vals, indices, n = zip(*[(vals[k], i, k)
+                               for i, k in enumerate(self._names)
+                               if k in vals])
     else:
       if self._names:
         raise ValueError("You must enqueue a dictionary in a staging area "
@@ -1541,7 +1592,7 @@ class BaseStagingArea(object):
 
       if indices is None:
         raise ValueError("Indices must be supplied when inserting a list "
-                        "of tensors")
+                         "of tensors")
 
       if len(indices) != len(vals):
         raise ValueError("Number of indices '%s' doesn't match "
@@ -1553,8 +1604,8 @@ class BaseStagingArea(object):
 
     # Sanity check number of values
     if not len(vals) <= len(self._dtypes):
-      raise ValueError("Unexpected number of inputs '%s' vs '%s'" % (
-                          len(vals), len(self._dtypes)))
+      raise ValueError("Unexpected number of inputs '%s' vs '%s'" %
+                       (len(vals), len(self._dtypes)))
 
     tensors = []
 
@@ -1562,14 +1613,14 @@ class BaseStagingArea(object):
       dtype, shape = self._dtypes[i], self._shapes[i]
       # Check dtype
       if not val.dtype == dtype:
-        raise ValueError("Datatypes do not match. '%s' != '%s'" %(
-                        str(val.dtype), str(dtype)))
+        raise ValueError("Datatypes do not match. '%s' != '%s'" %
+                         (str(val.dtype), str(dtype)))
 
       # Check shape
       val.get_shape().assert_is_compatible_with(shape)
 
-      tensors.append(ops.convert_to_tensor(val, dtype=dtype,
-                                          name="component_%d" % i))
+      tensors.append(
+          ops.convert_to_tensor(val, dtype=dtype, name="component_%d" % i))
 
     return tensors, indices
 
@@ -1632,6 +1683,7 @@ class BaseStagingArea(object):
     else:
       return [vals]
 
+
 class StagingArea(BaseStagingArea):
   """Class for staging inputs. No ordering guarantees.
 
@@ -1666,8 +1718,13 @@ class StagingArea(BaseStagingArea):
 
   """
 
-  def __init__(self, dtypes, shapes=None, names=None, shared_name=None,
-                  capacity=0, memory_limit=0):
+  def __init__(self,
+               dtypes,
+               shapes=None,
+               names=None,
+               shared_name=None,
+               capacity=0,
+               memory_limit=0):
     """Constructs a staging area object.
 
     The two optional lists, `shapes` and `names`, must be of the same length
@@ -1702,9 +1759,8 @@ class StagingArea(BaseStagingArea):
       ValueError: If one of the arguments is invalid.
     """
 
-    super(StagingArea, self).__init__(dtypes, shapes,
-                                          names, shared_name,
-                                          capacity, memory_limit)
+    super(StagingArea, self).__init__(dtypes, shapes, names, shared_name,
+                                      capacity, memory_limit)
 
   def put(self, values, name=None):
     """Create an op that places a value into the staging area.
@@ -1726,14 +1782,18 @@ class StagingArea(BaseStagingArea):
                         self._scope_vals(values)) as scope:
 
       # Hard-code indices for this staging area
-      indices = (list(six.moves.range(len(values)))
-                  if isinstance(values, (list, tuple)) else None)
+      indices = (
+          list(six.moves.range(len(values)))
+          if isinstance(values, (list, tuple)) else None)
       vals, _ = self._check_put_dtypes(values, indices)
 
       with ops.colocate_with(self._coloc_op):
-        op = gen_data_flow_ops.stage(values=vals, shared_name=self._name,
-                                     name=scope, capacity=self._capacity,
-                                     memory_limit=self._memory_limit)
+        op = gen_data_flow_ops.stage(
+            values=vals,
+            shared_name=self._name,
+            name=scope,
+            capacity=self._capacity,
+            memory_limit=self._memory_limit)
 
       return op
 
@@ -1741,7 +1801,7 @@ class StagingArea(BaseStagingArea):
     with ops.colocate_with(self._coloc_op):
       ret = get_fn()
 
-    indices = list(six.moves.range(len(self._dtypes))) # Hard coded
+    indices = list(six.moves.range(len(self._dtypes)))  # Hard coded
     return self._get_return_value(ret, indices)
 
   def get(self, name=None):
@@ -1769,10 +1829,12 @@ class StagingArea(BaseStagingArea):
     if name is None:
       name = "%s_get" % self._name
 
+    # pylint: disable=bad-continuation
     fn = lambda: gen_data_flow_ops.unstage(dtypes=self._dtypes,
                     shared_name=self._name, name=name,
                     capacity=self._capacity,
                     memory_limit=self._memory_limit)
+    # pylint: enable=bad-continuation
 
     return self.__internal_get(fn, name)
 
@@ -1797,10 +1859,12 @@ class StagingArea(BaseStagingArea):
     if name is None:
       name = "%s_peek" % self._name
 
+    # pylint: disable=bad-continuation
     fn = lambda: gen_data_flow_ops.stage_peek(index,
                     dtypes=self._dtypes, shared_name=self._name,
                     name=name, capacity=self._capacity,
                     memory_limit=self._memory_limit)
+    # pylint: enable=bad-continuation
 
     return self.__internal_get(fn, name)
 
@@ -1816,9 +1880,12 @@ class StagingArea(BaseStagingArea):
     if name is None:
       name = "%s_size" % self._name
 
-    return gen_data_flow_ops.stage_size(name=name, shared_name=self._name,
-                        dtypes=self._dtypes, capacity=self._capacity,
-                        memory_limit=self._memory_limit)
+    return gen_data_flow_ops.stage_size(
+        name=name,
+        shared_name=self._name,
+        dtypes=self._dtypes,
+        capacity=self._capacity,
+        memory_limit=self._memory_limit)
 
   def clear(self, name=None):
     """Clears the staging area.
@@ -1832,14 +1899,16 @@ class StagingArea(BaseStagingArea):
     if name is None:
       name = "%s_clear" % self._name
 
-    return gen_data_flow_ops.stage_clear(name=name, shared_name=self._name,
-                        dtypes=self._dtypes, capacity=self._capacity,
-                        memory_limit=self._memory_limit)
+    return gen_data_flow_ops.stage_clear(
+        name=name,
+        shared_name=self._name,
+        dtypes=self._dtypes,
+        capacity=self._capacity,
+        memory_limit=self._memory_limit)
+
 
 class MapStagingArea(BaseStagingArea):
-  """
-  A `MapStagingArea` is a TensorFlow data structure that stores tensors across
-  multiple steps, and exposes operations that can put and get tensors.
+  """A `MapStagingArea` is a TensorFlow data structure that stores tensors across multiple steps, and exposes operations that can put and get tensors.
 
   Each `MapStagingArea` element is a (key, value) pair.
   Only int64 keys are supported, other types should be
@@ -1852,7 +1921,8 @@ class MapStagingArea(BaseStagingArea):
   It supports multiple concurrent producers and consumers; and
   provides exactly-once delivery.
 
-  Each value tuple of a `MapStagingArea` is a fixed-length tuple of tensors whose
+  Each value tuple of a `MapStagingArea` is a fixed-length tuple of tensors
+  whose
   dtypes are described by `dtypes`, and whose shapes are optionally described
   by the `shapes` argument.
 
@@ -1896,10 +1966,16 @@ class MapStagingArea(BaseStagingArea):
   associated with it are removed.
   """
 
-  def __init__(self, dtypes, shapes=None, names=None, shared_name=None,
-                      ordered=False, capacity=0, memory_limit=0):
-    """
-    Args:
+  def __init__(self,
+               dtypes,
+               shapes=None,
+               names=None,
+               shared_name=None,
+               ordered=False,
+               capacity=0,
+               memory_limit=0):
+    """Args:
+
       dtypes:  A list of types.  The length of dtypes must equal the number
         of tensors in each element.
       capacity: (Optional.) Maximum number of elements.
@@ -1925,9 +2001,8 @@ class MapStagingArea(BaseStagingArea):
 
     """
 
-    super(MapStagingArea, self).__init__(dtypes, shapes,
-                                      names, shared_name,
-                                      capacity, memory_limit)
+    super(MapStagingArea, self).__init__(dtypes, shapes, names, shared_name,
+                                         capacity, memory_limit)
 
     # Defer to different methods depending if the map is ordered
     self._ordered = ordered
@@ -1950,8 +2025,7 @@ class MapStagingArea(BaseStagingArea):
       self._clear_fn = gen_data_flow_ops.map_clear
 
   def put(self, key, vals, indices=None, name=None):
-    """
-    Create an op that stores the (key, vals) pair in the staging area.
+    """Create an op that stores the (key, vals) pair in the staging area.
 
     Incomplete puts are possible, preferably using a dictionary for vals
     as the appropriate dtypes and shapes can be inferred from the value names
@@ -1973,7 +2047,8 @@ class MapStagingArea(BaseStagingArea):
         The created op
 
     Raises:
-        ValueError: If the number or type of inputs don't match the staging area.
+        ValueError: If the number or type of inputs don't match the staging
+        area.
     """
 
     with ops.name_scope(name, "%s_put" % self._name,
@@ -1982,10 +2057,15 @@ class MapStagingArea(BaseStagingArea):
       vals, indices = self._check_put_dtypes(vals, indices)
 
       with ops.colocate_with(self._coloc_op):
-        op = self._put_fn(key, indices, vals, dtypes=self._dtypes,
-                             shared_name=self._name, name=scope,
-                             capacity=self._capacity,
-                             memory_limit=self._memory_limit)
+        op = self._put_fn(
+            key,
+            indices,
+            vals,
+            dtypes=self._dtypes,
+            shared_name=self._name,
+            name=scope,
+            capacity=self._capacity,
+            memory_limit=self._memory_limit)
     return op
 
   def _get_indices_and_dtypes(self, indices=None):
@@ -2001,13 +2081,13 @@ class MapStagingArea(BaseStagingArea):
     if all(isinstance(i, str) for i in indices):
       if self._names is None:
         raise ValueError("String indices provided '%s', but this Staging Area "
-                        "was not created with names." % indices)
+                         "was not created with names." % indices)
 
       try:
         indices = [self._names.index(n) for n in indices]
       except ValueError:
         raise ValueError("Named index '%s' not in "
-                        "Staging Area names '%s'" % (n, self._names))
+                         "Staging Area names '%s'" % (n, self._names))
     elif all(isinstance(i, int) for i in indices):
       pass
     else:
@@ -2018,10 +2098,8 @@ class MapStagingArea(BaseStagingArea):
 
     return indices, dtypes
 
-
   def peek(self, key, indices=None, name=None):
-    """
-    Peeks at staging area data associated with the key.
+    """Peeks at staging area data associated with the key.
 
     If the key is not in the staging area, it will block
     until the associated (key, value) is inserted.
@@ -2044,22 +2122,22 @@ class MapStagingArea(BaseStagingArea):
     indices, dtypes = self._get_indices_and_dtypes(indices)
 
     with ops.colocate_with(self._coloc_op):
-      result = self._peek_fn(key, shared_name=self._name,
-                      indices=indices,
-                      dtypes=dtypes,
-                      name=name,
-                      capacity=self._capacity,
-                      memory_limit=self._memory_limit)
+      result = self._peek_fn(
+          key,
+          shared_name=self._name,
+          indices=indices,
+          dtypes=dtypes,
+          name=name,
+          capacity=self._capacity,
+          memory_limit=self._memory_limit)
 
     return self._get_return_value(result, indices)
 
   def get(self, key=None, indices=None, name=None):
-    """
-    If the key is provided, the associated (key, value)
-    is returned from the staging area. If the key is not
-    in the staging area, this method will block until
-    the associated (key, value) is inserted.
+    """If the key is provided, the associated (key, value) is returned from the staging area.
 
+    If the key is not in the staging area, this method will block until
+    the associated (key, value) is inserted.
     If no key is provided and the staging area is ordered,
     the (key, value) with the smallest key will be returned.
     Otherwise, a random (key, value) will be returned.
@@ -2084,12 +2162,10 @@ class MapStagingArea(BaseStagingArea):
       return self._pop(key, indices=indices, name=name)
 
   def _pop(self, key, indices=None, name=None):
-    """
-    Remove and return the associated (key, value)
-    is returned from the staging area. If the key is not
-    in the staging area, this method will block until
-    the associated (key, value) is inserted.
+    """Remove and return the associated (key, value) is returned from the staging area.
 
+    If the key is not in the staging area, this method will block until
+    the associated (key, value) is inserted.
     Args:
         key: Key associated with the required data
         indices: Partial list of tensors to retrieve (optional).
@@ -2107,21 +2183,21 @@ class MapStagingArea(BaseStagingArea):
     indices, dtypes = self._get_indices_and_dtypes(indices)
 
     with ops.colocate_with(self._coloc_op):
-      result = self._pop_fn(key, shared_name=self._name,
-                      indices=indices,
-                      dtypes=dtypes,
-                      name=name,
-                      capacity=self._capacity,
-                      memory_limit=self._memory_limit)
+      result = self._pop_fn(
+          key,
+          shared_name=self._name,
+          indices=indices,
+          dtypes=dtypes,
+          name=name,
+          capacity=self._capacity,
+          memory_limit=self._memory_limit)
 
     return key, self._get_return_value(result, indices)
 
   def _popitem(self, indices=None, name=None):
-    """
-    If the staging area is ordered,
-    the (key, value) with the smallest key will be returned.
-    Otherwise, a random (key, value) will be returned.
+    """If the staging area is ordered, the (key, value) with the smallest key will be returned.
 
+    Otherwise, a random (key, value) will be returned.
     If the staging area is empty when this operation executes,
     it will block until there is an element to dequeue.
 
@@ -2142,12 +2218,13 @@ class MapStagingArea(BaseStagingArea):
     indices, dtypes = self._get_indices_and_dtypes(indices)
 
     with ops.colocate_with(self._coloc_op):
-      key, result = self._popitem_fn(shared_name=self._name,
-                              indices=indices,
-                              dtypes=dtypes,
-                              name=name,
-                              capacity=self._capacity,
-                              memory_limit=self._memory_limit)
+      key, result = self._popitem_fn(
+          shared_name=self._name,
+          indices=indices,
+          dtypes=dtypes,
+          name=name,
+          capacity=self._capacity,
+          memory_limit=self._memory_limit)
 
     # Separate keys and results out from
     # underlying namedtuple
@@ -2157,8 +2234,7 @@ class MapStagingArea(BaseStagingArea):
     return key, result
 
   def size(self, name=None):
-    """
-    Returns the number of elements in the staging area.
+    """Returns the number of elements in the staging area.
 
     Args:
         name: A name for the operation (optional)
@@ -2169,14 +2245,15 @@ class MapStagingArea(BaseStagingArea):
     if name is None:
       name = "%s_size" % self._name
 
-    return self._size_fn(shared_name=self._name,
-                        name=name, dtypes=self._dtypes,
-                        capacity=self._capacity,
-                        memory_limit=self._memory_limit)
+    return self._size_fn(
+        shared_name=self._name,
+        name=name,
+        dtypes=self._dtypes,
+        capacity=self._capacity,
+        memory_limit=self._memory_limit)
 
   def incomplete_size(self, name=None):
-    """
-    Returns the number of incomplete elements in the staging area.
+    """Returns the number of incomplete elements in the staging area.
 
     Args:
         name: A name for the operation (optional)
@@ -2187,16 +2264,15 @@ class MapStagingArea(BaseStagingArea):
     if name is None:
       name = "%s_incomplete_size" % self._name
 
-    return self._incomplete_size_fn(shared_name=self._name,
-                        name=name, dtypes=self._dtypes,
-                        capacity=self._capacity,
-                        memory_limit=self._memory_limit)
-
-
+    return self._incomplete_size_fn(
+        shared_name=self._name,
+        name=name,
+        dtypes=self._dtypes,
+        capacity=self._capacity,
+        memory_limit=self._memory_limit)
 
   def clear(self, name=None):
-    """
-    Clears the staging area.
+    """Clears the staging area.
 
     Args:
         name: A name for the operation (optional)
@@ -2207,10 +2283,12 @@ class MapStagingArea(BaseStagingArea):
     if name is None:
       name = "%s_clear" % self._name
 
-    return self._clear_fn(shared_name=self._name,
-                        name=name, dtypes=self._dtypes,
-                        capacity=self._capacity,
-                        memory_limit=self._memory_limit)
+    return self._clear_fn(
+        shared_name=self._name,
+        name=name,
+        dtypes=self._dtypes,
+        capacity=self._capacity,
+        memory_limit=self._memory_limit)
 
 
 class RecordInput(object):
diff --git a/tensorflow/python/ops/functional_ops.py b/tensorflow/python/ops/functional_ops.py
index 7dbccf1caf1486bb247a1bef0ac37c36adbcc53e..ac03d30fcd2e65f032937d9259bc8fff18626619 100644
--- a/tensorflow/python/ops/functional_ops.py
+++ b/tensorflow/python/ops/functional_ops.py
@@ -458,7 +458,7 @@ def scan(fn, elems, initializer=None, parallel_iterations=10, back_prop=True,
 
   For example, if `elems` is `(t1, [t2, t3])` and `initializer` is
   `[i1, i2]` then an appropriate signature for `fn` in `python2` is:
-  `fn = lambda (acc_p1, acc_p2), (t1 [t2, t3]):` and `fn` must return a list,
+  `fn = lambda (acc_p1, acc_p2), (t1, [t2, t3]):` and `fn` must return a list,
   `[acc_n1, acc_n2]`.  An alternative correct signature for `fn`, and the
    one that works in `python3`, is:
   `fn = lambda a, t:`, where `a` and `t` correspond to the input tuples.
diff --git a/tensorflow/python/ops/gradients_impl.py b/tensorflow/python/ops/gradients_impl.py
index 5d4b9ecd8bee31c5092b04535e97b036eec9f1be..230b6c5946ab7e0e3529994f7199026865cecde2 100644
--- a/tensorflow/python/ops/gradients_impl.py
+++ b/tensorflow/python/ops/gradients_impl.py
@@ -44,6 +44,7 @@ from tensorflow.python.ops import image_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import linalg_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import linalg_ops  # pylint: disable=unused-import
 from tensorflow.python.ops import logging_ops  # pylint: disable=unused-import
+from tensorflow.python.ops import manip_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import math_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
@@ -52,7 +53,6 @@ from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util.tf_export import tf_export
 
-
 # Warn the user if we convert a sparse representation to dense with at
 # least this number of elements.
 _LARGE_SPARSE_NUM_ELEMENTS = 100000000
@@ -235,9 +235,10 @@ def _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops):
           raise TypeError(
               "Gradients of complex tensors must set grad_ys (y.dtype = %r)" %
               y.dtype)
-        new_grad_ys.append(array_ops.fill(
-            array_ops.shape(y), constant_op.constant(
-                1, dtype=y.dtype, name="grad_ys_%d" % i)))
+        new_grad_ys.append(
+            array_ops.fill(
+                array_ops.shape(y),
+                constant_op.constant(1, dtype=y.dtype, name="grad_ys_%d" % i)))
         continue
       if y.dtype.is_floating or y.dtype.is_integer:
         if not grad_y.dtype.is_floating and not grad_y.dtype.is_integer:
@@ -492,11 +493,12 @@ def gradients(ys,
       name, "gradients",
       list(ys) + list(xs) + list(stop_gradients) + list(grad_ys)) as grad_scope:
     ys = ops.convert_n_to_tensor_or_indexed_slices(ys, name="y")
-    xs = [x.handle if isinstance(x, resource_variable_ops.ResourceVariable)
-          else x
-          for x in xs]
-    xs = ops.internal_convert_n_to_tensor_or_indexed_slices(xs, name="x",
-                                                            as_ref=True)
+    xs = [
+        x.handle if isinstance(x, resource_variable_ops.ResourceVariable) else x
+        for x in xs
+    ]
+    xs = ops.internal_convert_n_to_tensor_or_indexed_slices(
+        xs, name="x", as_ref=True)
     grad_ys = _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops)
 
     # The approach we take here is as follows: Create a list of all ops in the
@@ -513,9 +515,8 @@ def gradients(ys,
     to_ops = [t.op for t in ys]
     from_ops = [t.op for t in xs]
     stop_gradient_ops = [t.op for t in stop_gradients]
-    pending_count, loop_state = _PendingCount(ops.get_default_graph(), to_ops,
-                                              from_ops,
-                                              colocate_gradients_with_ops)
+    pending_count, loop_state = _PendingCount(
+        ops.get_default_graph(), to_ops, from_ops, colocate_gradients_with_ops)
 
     # Iterate over the collected ops.
     #
@@ -588,9 +589,8 @@ def gradients(ys,
           # output, it means that the cost does not depend on output[i],
           # therefore dC/doutput[i] is 0.
           for i, out_grad in enumerate(out_grads):
-            if (not isinstance(out_grad, ops.Tensor) and
-                not out_grad) and ((not grad_fn and is_func_call) or
-                                   _IsTrainable(op.outputs[i])):
+            if (not isinstance(out_grad, ops.Tensor) and not out_grad) and (
+                (not grad_fn and is_func_call) or _IsTrainable(op.outputs[i])):
               # Only trainable outputs or outputs for a function call that
               # will use SymbolicGradient get a zero gradient. Gradient
               # functions should ignore the gradient for other outputs.
@@ -607,17 +607,17 @@ def gradients(ys,
               if grad_fn:
                 # If grad_fn was found, do not use SymbolicGradient even for
                 # functions.
-                in_grads = _MaybeCompile(
-                    grad_scope, op, func_call, lambda: grad_fn(op, *out_grads))
+                in_grads = _MaybeCompile(grad_scope, op, func_call,
+                                         lambda: grad_fn(op, *out_grads))
               else:
                 # For function call ops, we add a 'SymbolicGradient'
                 # node to the graph to compute gradients.
-                in_grads = _MaybeCompile(
-                    grad_scope, op, func_call, lambda: _SymGrad(op, out_grads))
+                in_grads = _MaybeCompile(grad_scope, op, func_call,
+                                         lambda: _SymGrad(op, out_grads))
               in_grads = _AsList(in_grads)
               _VerifyGeneratedGradients(in_grads, op)
-              if gate_gradients and len(
-                  [x for x in in_grads if x is not None]) > 1:
+              if gate_gradients and len([x for x in in_grads
+                                         if x is not None]) > 1:
                 with ops.device(None):
                   with ops.colocate_with(None, ignore_existing=True):
                     in_grads = control_flow_ops.tuple(in_grads)
@@ -637,8 +637,8 @@ def gradients(ys,
                     "Incompatible shapes between op input and calculated "
                     "input gradient.  Forward operation: %s.  Input index: %d. "
                     "Original input shape: %s.  "
-                    "Calculated input gradient shape: %s"
-                    % (op.name, i, t_in.shape, in_grad.shape))
+                    "Calculated input gradient shape: %s" %
+                    (op.name, i, t_in.shape, in_grad.shape))
             _SetGrad(grads, t_in, in_grad)
         if loop_state:
           loop_state.ExitGradWhileContext(op, before=False)
@@ -670,8 +670,8 @@ def _UpdatePendingAndEnqueueReady(grads, op, queue, pending_count, loop_state):
     pending_count[x.op._id] -= 1
     ready = (pending_count[x.op._id] == 0)
     if loop_state and not ready:
-      ready = (pending_count[x.op._id] > 0 and
-               control_flow_util.IsLoopSwitch(x.op))
+      ready = (
+          pending_count[x.op._id] > 0 and control_flow_util.IsLoopSwitch(x.op))
     # pylint: enable=protected-access
     if ready:
       if control_flow_util.IsLoopExit(x.op):
@@ -725,8 +725,8 @@ def _GetGrad(grads, t):
   if not op_grads:
     return None
   t_grad = op_grads[t.value_index]
-  assert not isinstance(t_grad, list), (
-      "gradients list should have been aggregated by now.")
+  assert not isinstance(
+      t_grad, list), ("gradients list should have been aggregated by now.")
   return t_grad
 
 
@@ -745,9 +745,8 @@ def _HandleNestedIndexedSlices(grad):
   else:
     assert isinstance(grad.values, ops.IndexedSlices)
     g = _HandleNestedIndexedSlices(grad.values)
-    return ops.IndexedSlices(g.values,
-                             array_ops.gather(grad.indices, g.indices),
-                             g.dense_shape)
+    return ops.IndexedSlices(g.values, array_ops.gather(
+        grad.indices, g.indices), g.dense_shape)
 
 
 def _AccumulatorShape(inputs):
@@ -849,8 +848,8 @@ def _AggregatedGrads(grads, op, loop_state, aggregation_method=None):
       AggregationMethod.ADD_N, AggregationMethod.EXPERIMENTAL_TREE,
       AggregationMethod.EXPERIMENTAL_ACCUMULATE_N
   ]:
-    raise ValueError("Invalid aggregation_method specified %s." %
-                     aggregation_method)
+    raise ValueError(
+        "Invalid aggregation_method specified %s." % aggregation_method)
   out_grads = _GetGrads(grads, op)
   for i, out_grad in enumerate(out_grads):
     if loop_state:
@@ -859,7 +858,8 @@ def _AggregatedGrads(grads, op, loop_state, aggregation_method=None):
         continue
     # Grads have to be Tensors or IndexedSlices
     if (isinstance(out_grad, collections.Sequence) and not all([
-        isinstance(g, (ops.Tensor, ops.IndexedSlices)) for g in out_grad
+        isinstance(g, (ops.Tensor, ops.IndexedSlices))
+        for g in out_grad
         if g is not None
     ])):
       raise TypeError("gradients have to be either all Tensors "
@@ -903,8 +903,8 @@ def _AggregatedGrads(grads, op, loop_state, aggregation_method=None):
         else:
           used = "add_n"
           out_grads[i] = _MultiDeviceAddN(out_grad)
-        logging.vlog(2, "  _AggregatedGrads %d x %s using %s",
-                     len(out_grad), tensor_shape, used)
+        logging.vlog(2, "  _AggregatedGrads %d x %s using %s", len(out_grad),
+                     tensor_shape, used)
       else:
         out_grad = math_ops._as_indexed_slices_list(
             [g for g in out_grad if g is not None])
@@ -967,7 +967,8 @@ def _hessian_vector_product(ys, xs, v):
   assert len(grads) == length
   elemwise_products = [
       math_ops.multiply(grad_elem, array_ops.stop_gradient(v_elem))
-      for grad_elem, v_elem in zip(grads, v) if grad_elem is not None
+      for grad_elem, v_elem in zip(grads, v)
+      if grad_elem is not None
   ]
 
   # Second backprop
@@ -975,8 +976,12 @@ def _hessian_vector_product(ys, xs, v):
 
 
 @tf_export("hessians")
-def hessians(ys, xs, name="hessians", colocate_gradients_with_ops=False,
-            gate_gradients=False, aggregation_method=None):
+def hessians(ys,
+             xs,
+             name="hessians",
+             colocate_gradients_with_ops=False,
+             gate_gradients=False,
+             aggregation_method=None):
   """Constructs the Hessian of sum of `ys` with respect to `x` in `xs`.
 
   `hessians()` adds ops to the graph to output the Hessian matrix of `ys`
@@ -1004,9 +1009,9 @@ def hessians(ys, xs, name="hessians", colocate_gradients_with_ops=False,
   """
   xs = _AsList(xs)
   kwargs = {
-      'colocate_gradients_with_ops': colocate_gradients_with_ops,
-      'gate_gradients': gate_gradients,
-      'aggregation_method': aggregation_method
+      "colocate_gradients_with_ops": colocate_gradients_with_ops,
+      "gate_gradients": gate_gradients,
+      "aggregation_method": aggregation_method
   }
   # Compute first-order derivatives and iterate for each x in xs.
   hessians = []
@@ -1031,8 +1036,7 @@ def hessians(ys, xs, name="hessians", colocate_gradients_with_ops=False,
     )
 
     _shape = array_ops.shape(x)
-    _reshaped_hessian = array_ops.reshape(
-        hessian.stack(), array_ops.concat((_shape, _shape), 0)
-    )
+    _reshaped_hessian = array_ops.reshape(hessian.stack(),
+                                          array_ops.concat((_shape, _shape), 0))
     hessians.append(_reshaped_hessian)
   return hessians
diff --git a/tensorflow/python/ops/image_ops.py b/tensorflow/python/ops/image_ops.py
index 3b0b5a978c9f79dca9b87d3a7b6478b63e1fcb8d..de12c5f63f4357e0982dd2e16999caf2de0b30f8 100644
--- a/tensorflow/python/ops/image_ops.py
+++ b/tensorflow/python/ops/image_ops.py
@@ -49,6 +49,10 @@ See the @{$python/image} guide.
 @@grayscale_to_rgb
 @@hsv_to_rgb
 @@rgb_to_hsv
+@@rgb_to_yiq
+@@yiq_to_rgb
+@@rgb_to_yuv
+@@yuv_to_rgb
 @@convert_image_dtype
 @@adjust_brightness
 @@random_brightness
diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py
index 9bd452155cc574ed942f47e50dbaa029f2e5e231..22636fdbb3f5c512f25ad0c1d7eb4e18056da211 100644
--- a/tensorflow/python/ops/image_ops_impl.py
+++ b/tensorflow/python/ops/image_ops_impl.py
@@ -770,8 +770,9 @@ def resize_images(images,
     size: A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
           new size for the images.
     method: ResizeMethod.  Defaults to `ResizeMethod.BILINEAR`.
-    align_corners: bool. If true, exactly align all 4 corners of the input and
-                   output. Defaults to `false`.
+    align_corners: bool.  If True, the centers of the 4 corner pixels of the
+        input and output tensors are aligned, preserving the values at the
+        corner pixels. Defaults to `False`.
 
   Raises:
     ValueError: if the shape of `images` is incompatible with the
@@ -1668,3 +1669,110 @@ def non_max_suppression(boxes,
     return gen_image_ops._non_max_suppression_v2(boxes, scores, max_output_size,
                                                  iou_threshold)
     # pylint: enable=protected-access
+
+
+_rgb_to_yiq_kernel = [[0.299, 0.59590059, 0.2115],
+                      [0.587, -0.27455667, -0.52273617],
+                      [0.114, -0.32134392, 0.31119955]]
+
+
+def rgb_to_yiq(images):
+  """Converts one or more images from RGB to YIQ.
+
+  Outputs a tensor of the same shape as the `images` tensor, containing the YIQ
+  value of the pixels.
+  The output is only well defined if the value in images are in [0,1].
+
+  Args:
+    images: 2-D or higher rank. Image data to convert. Last dimension must be
+    size 3.
+
+  Returns:
+    images: tensor with the same shape as `images`.
+  """
+  images = ops.convert_to_tensor(images, name='images')
+  kernel = ops.convert_to_tensor(
+      _rgb_to_yiq_kernel, dtype=images.dtype, name='kernel')
+  ndims = images.get_shape().ndims
+  return math_ops.tensordot(images, kernel, axes=[[ndims-1], [0]])
+
+
+_yiq_to_rgb_kernel = [[1, 1, 1],
+                      [0.95598634, -0.27201283, -1.10674021],
+                      [0.6208248, -0.64720424, 1.70423049]]
+
+
+def yiq_to_rgb(images):
+  """Converts one or more images from YIQ to RGB.
+
+  Outputs a tensor of the same shape as the `images` tensor, containing the RGB
+  value of the pixels.
+  The output is only well defined if the Y value in images are in [0,1],
+  I value are in [-0.5957,0.5957] and Q value are in [-0.5226,0.5226].
+
+  Args:
+    images: 2-D or higher rank. Image data to convert. Last dimension must be
+    size 3.
+
+  Returns:
+    images: tensor with the same shape as `images`.
+  """
+  images = ops.convert_to_tensor(images, name='images')
+  kernel = ops.convert_to_tensor(
+      _yiq_to_rgb_kernel, dtype=images.dtype, name='kernel')
+  ndims = images.get_shape().ndims
+  return math_ops.tensordot(images, kernel, axes=[[ndims-1], [0]])
+
+
+_rgb_to_yuv_kernel = [[0.299, -0.14714119, 0.61497538],
+                      [0.587, -0.28886916, -0.51496512],
+                      [0.114, 0.43601035, -0.10001026]]
+
+
+def rgb_to_yuv(images):
+  """Converts one or more images from RGB to YUV.
+
+  Outputs a tensor of the same shape as the `images` tensor, containing the YUV
+  value of the pixels.
+  The output is only well defined if the value in images are in [0,1].
+
+  Args:
+    images: 2-D or higher rank. Image data to convert. Last dimension must be
+    size 3.
+
+  Returns:
+    images: tensor with the same shape as `images`.
+  """
+  images = ops.convert_to_tensor(images, name='images')
+  kernel = ops.convert_to_tensor(
+      _rgb_to_yuv_kernel, dtype=images.dtype, name='kernel')
+  ndims = images.get_shape().ndims
+  return math_ops.tensordot(images, kernel, axes=[[ndims-1], [0]])
+
+
+_yuv_to_rgb_kernel = [[1, 1, 1],
+                      [0, -0.394642334, 2.03206185],
+                      [1.13988303, -0.58062185, 0]]
+
+
+def yuv_to_rgb(images):
+  """Converts one or more images from YUV to RGB.
+
+  Outputs a tensor of the same shape as the `images` tensor, containing the RGB
+  value of the pixels.
+  The output is only well defined if the Y value in images are in [0,1],
+  U and V value are in [-0.5,0.5].
+
+  Args:
+    images: 2-D or higher rank. Image data to convert. Last dimension must be
+    size 3.
+
+  Returns:
+    images: tensor with the same shape as `images`.
+  """
+  images = ops.convert_to_tensor(images, name='images')
+  kernel = ops.convert_to_tensor(
+      _yuv_to_rgb_kernel, dtype=images.dtype, name='kernel')
+  ndims = images.get_shape().ndims
+  return math_ops.tensordot(images, kernel, axes=[[ndims-1], [0]])
+
diff --git a/tensorflow/python/ops/image_ops_test.py b/tensorflow/python/ops/image_ops_test.py
index 0c5ed2150d83e35398a618c0b61de93303b9cfc8..82b77ee8e3792596ec4c50ac24da1a1c38cc634b 100644
--- a/tensorflow/python/ops/image_ops_test.py
+++ b/tensorflow/python/ops/image_ops_test.py
@@ -85,6 +85,64 @@ class RGBToHSVTest(test_util.TensorFlowTestCase):
       self.assertAllClose(rgb_tf, rgb_np)
 
 
+class RGBToYIQTest(test_util.TensorFlowTestCase):
+
+  def testBatch(self):
+    # Build an arbitrary RGB image
+    np.random.seed(7)
+    batch_size = 5
+    shape = (batch_size, 2, 7, 3)
+
+    for nptype in [np.float32, np.float64]:
+      inp = np.random.rand(*shape).astype(nptype)
+
+      # Convert to YIQ and back, as a batch and individually
+      with self.test_session(use_gpu=True) as sess:
+        batch0 = constant_op.constant(inp)
+        batch1 = image_ops.rgb_to_yiq(batch0)
+        batch2 = image_ops.yiq_to_rgb(batch1)
+        split0 = array_ops.unstack(batch0)
+        split1 = list(map(image_ops.rgb_to_yiq, split0))
+        split2 = list(map(image_ops.yiq_to_rgb, split1))
+        join1 = array_ops.stack(split1)
+        join2 = array_ops.stack(split2)
+        batch1, batch2, join1, join2 = sess.run([batch1, batch2, join1, join2])
+
+      # Verify that processing batch elements together is the same as separate
+      self.assertAllClose(batch1, join1, rtol=1e-4, atol=1e-4)
+      self.assertAllClose(batch2, join2, rtol=1e-4, atol=1e-4)
+      self.assertAllClose(batch2, inp, rtol=1e-4, atol=1e-4)
+
+
+class RGBToYUVTest(test_util.TensorFlowTestCase):
+
+  def testBatch(self):
+    # Build an arbitrary RGB image
+    np.random.seed(7)
+    batch_size = 5
+    shape = (batch_size, 2, 7, 3)
+
+    for nptype in [np.float32, np.float64]:
+      inp = np.random.rand(*shape).astype(nptype)
+
+      # Convert to YUV and back, as a batch and individually
+      with self.test_session(use_gpu=True) as sess:
+        batch0 = constant_op.constant(inp)
+        batch1 = image_ops.rgb_to_yuv(batch0)
+        batch2 = image_ops.yuv_to_rgb(batch1)
+        split0 = array_ops.unstack(batch0)
+        split1 = list(map(image_ops.rgb_to_yuv, split0))
+        split2 = list(map(image_ops.yuv_to_rgb, split1))
+        join1 = array_ops.stack(split1)
+        join2 = array_ops.stack(split2)
+        batch1, batch2, join1, join2 = sess.run([batch1, batch2, join1, join2])
+
+      # Verify that processing batch elements together is the same as separate
+      self.assertAllClose(batch1, join1, rtol=1e-4, atol=1e-4)
+      self.assertAllClose(batch2, join2, rtol=1e-4, atol=1e-4)
+      self.assertAllClose(batch2, inp, rtol=1e-4, atol=1e-4)
+
+
 class GrayscaleToRGBTest(test_util.TensorFlowTestCase):
 
   def _RGBToGrayscale(self, images):
@@ -194,11 +252,11 @@ class AdjustGamma(test_util.TensorFlowTestCase):
     with self.test_session():
       x_data = np.random.uniform(0, 255, (8, 8))
       x_np = np.array(x_data, dtype=np.float32)
-    
+
       x = constant_op.constant(x_np, shape=x_np.shape)
 
-      err_msg = 'Gamma should be a non-negative real number.'
-      
+      err_msg = "Gamma should be a non-negative real number."
+
       try:
         image_ops.adjust_gamma(x, gamma=-1)
       except Exception as e:
@@ -212,13 +270,13 @@ class AdjustGamma(test_util.TensorFlowTestCase):
     with self.test_session():
       x_data = np.random.uniform(0, 255, (8, 8))
       x_np = np.array(x_data, dtype=np.float32)
-    
+
       x = constant_op.constant(x_np, shape=x_np.shape)
       y = constant_op.constant(-1.0, dtype=dtypes.float32)
-      
+
       image = image_ops.adjust_gamma(x, gamma=y)
-      
-      err_msg = 'Gamma should be a non-negative real number.'
+
+      err_msg = "Gamma should be a non-negative real number."
       try:
         image.eval()
       except Exception as e:
@@ -226,7 +284,7 @@ class AdjustGamma(test_util.TensorFlowTestCase):
           raise
       else:
         raise AssertionError("Exception not raised: %s" % err_msg)
-      
+
   def test_adjust_gamma_zero(self):
     """White image should be returned for gamma equal to zero"""
     with self.test_session():
@@ -253,13 +311,13 @@ class AdjustGamma(test_util.TensorFlowTestCase):
       y_tf = np.trunc(y.eval())
 
       y_np = np.array(
-          [[0, 31, 45, 55, 63, 71, 78, 84],
-           [90, 95, 100, 105, 110, 115, 119, 123],
-           [127, 131, 135, 139, 142, 146, 149, 153],
-           [156, 159, 162, 165, 168, 171, 174, 177],
-           [180, 183, 186, 188, 191, 194, 196, 199],
-           [201, 204, 206, 209, 211, 214, 216, 218],
-           [221, 223, 225, 228, 230, 232, 234, 236],
+          [[0, 31, 45, 55, 63, 71, 78, 84], [
+              90, 95, 100, 105, 110, 115, 119, 123
+          ], [127, 131, 135, 139, 142, 146, 149, 153], [
+              156, 159, 162, 165, 168, 171, 174, 177
+          ], [180, 183, 186, 188, 191, 194, 196, 199], [
+              201, 204, 206, 209, 211, 214, 216, 218
+          ], [221, 223, 225, 228, 230, 232, 234, 236],
            [238, 241, 243, 245, 247, 249, 251, 253]],
           dtype=np.float32)
 
@@ -274,14 +332,12 @@ class AdjustGamma(test_util.TensorFlowTestCase):
       y_tf = np.trunc(y.eval())
 
       y_np = np.array(
-          [[0, 0, 0, 0, 1, 1, 2, 3],
-           [4, 5, 6, 7, 9, 10, 12, 14],
-           [16, 18, 20, 22, 25, 27, 30, 33],
-           [36, 39, 42, 45, 49, 52, 56, 60],
-           [64, 68, 72, 76, 81, 85, 90, 95],
-           [100, 105, 110, 116, 121, 127, 132, 138],
-           [144, 150, 156, 163, 169, 176, 182, 189],
-           [196, 203, 211, 218, 225, 233, 241, 249]],
+          [[0, 0, 0, 0, 1, 1, 2, 3], [4, 5, 6, 7, 9, 10, 12, 14], [
+              16, 18, 20, 22, 25, 27, 30, 33
+          ], [36, 39, 42, 45, 49, 52, 56, 60], [64, 68, 72, 76, 81, 85, 90, 95],
+           [100, 105, 110, 116, 121, 127, 132, 138], [
+               144, 150, 156, 163, 169, 176, 182, 189
+           ], [196, 203, 211, 218, 225, 233, 241, 249]],
           dtype=np.float32)
 
       self.assertAllClose(y_tf, y_np, 1e-6)
@@ -425,8 +481,7 @@ class FlipImageBenchmark(test.Benchmark):
     with session.Session("", graph=ops.Graph(), config=config) as sess:
       with ops.device(device):
         inputs = variables.Variable(
-            random_ops.random_uniform(
-                image_shape, dtype=dtypes.float32) * 255,
+            random_ops.random_uniform(image_shape, dtype=dtypes.float32) * 255,
             trainable=False,
             dtype=dtypes.float32)
         run_op = image_ops.flip_left_right(inputs)
@@ -456,8 +511,7 @@ class FlipImageBenchmark(test.Benchmark):
     with session.Session("", graph=ops.Graph(), config=config) as sess:
       with ops.device(device):
         inputs = variables.Variable(
-            random_ops.random_uniform(
-                image_shape, dtype=dtypes.float32) * 255,
+            random_ops.random_uniform(image_shape, dtype=dtypes.float32) * 255,
             trainable=False,
             dtype=dtypes.float32)
         run_op = image_ops.random_flip_left_right(inputs)
@@ -508,8 +562,7 @@ class AdjustHueBenchmark(test.Benchmark):
     with session.Session("", graph=ops.Graph(), config=config) as sess:
       with ops.device(device):
         inputs = variables.Variable(
-            random_ops.random_uniform(
-                image_shape, dtype=dtypes.float32) * 255,
+            random_ops.random_uniform(image_shape, dtype=dtypes.float32) * 255,
             trainable=False,
             dtype=dtypes.float32)
         delta = constant_op.constant(0.1, dtype=dtypes.float32)
@@ -553,8 +606,7 @@ class AdjustSaturationBenchmark(test.Benchmark):
     with session.Session("", graph=ops.Graph(), config=config) as sess:
       with ops.device(device):
         inputs = variables.Variable(
-            random_ops.random_uniform(
-                image_shape, dtype=dtypes.float32) * 255,
+            random_ops.random_uniform(image_shape, dtype=dtypes.float32) * 255,
             trainable=False,
             dtype=dtypes.float32)
         delta = constant_op.constant(0.1, dtype=dtypes.float32)
@@ -609,10 +661,11 @@ class ResizeBilinearBenchmark(test.Benchmark):
       results = self.run_op_benchmark(
           sess,
           benchmark_op,
-          name=("resize_bilinear_%s_%s_%s" %
-                (image_size[0], image_size[1], num_channels)))
-      print("%s   : %.2f ms/img" % (results["name"], 1000 * results["wall_time"]
-                                    / (batch_size * num_ops)))
+          name=("resize_bilinear_%s_%s_%s" % (image_size[0], image_size[1],
+                                              num_channels)))
+      print("%s   : %.2f ms/img" %
+            (results["name"],
+             1000 * results["wall_time"] / (batch_size * num_ops)))
 
   def benchmarkSimilar3Channel(self):
     self._benchmarkResize((183, 229), 3)
@@ -659,8 +712,9 @@ class ResizeBicubicBenchmark(test.Benchmark):
           min_iters=20,
           name=("resize_bicubic_%s_%s_%s" % (image_size[0], image_size[1],
                                              num_channels)))
-      print("%s   : %.2f ms/img" % (results["name"], 1000 * results["wall_time"]
-                                    / (batch_size * num_ops)))
+      print("%s   : %.2f ms/img" %
+            (results["name"],
+             1000 * results["wall_time"] / (batch_size * num_ops)))
 
   def benchmarkSimilar3Channel(self):
     self._benchmarkResize((183, 229), 3)
@@ -696,8 +750,8 @@ class ResizeAreaBenchmark(test.Benchmark):
     batch_size = 1
     num_ops = 1000
     img = variables.Variable(
-        random_ops.random_normal([batch_size, image_size[0],
-                                  image_size[1], num_channels]),
+        random_ops.random_normal(
+            [batch_size, image_size[0], image_size[1], num_channels]),
         name="img")
 
     deps = []
@@ -710,12 +764,13 @@ class ResizeAreaBenchmark(test.Benchmark):
     with session.Session() as sess:
       sess.run(variables.global_variables_initializer())
       results = self.run_op_benchmark(
-          sess, benchmark_op,
-          name=("resize_area_%s_%s_%s" %
-                (image_size[0], image_size[1], num_channels)))
-      print("%s   : %.2f ms/img" % (
-          results["name"],
-          1000*results["wall_time"] / (batch_size * num_ops)))
+          sess,
+          benchmark_op,
+          name=("resize_area_%s_%s_%s" % (image_size[0], image_size[1],
+                                          num_channels)))
+      print("%s   : %.2f ms/img" %
+            (results["name"],
+             1000 * results["wall_time"] / (batch_size * num_ops)))
 
   def benchmarkSimilar3Channel(self):
     self._benchmarkResize((183, 229), 3)
@@ -789,8 +844,7 @@ class AdjustSaturationTest(test_util.TensorFlowTestCase):
     flt_image = image_ops.convert_image_dtype(image, dtypes.float32)
     saturation_adjusted_image = gen_image_ops.adjust_saturation(
         flt_image, saturation_factor)
-    return image_ops.convert_image_dtype(saturation_adjusted_image,
-                                         orig_dtype)
+    return image_ops.convert_image_dtype(saturation_adjusted_image, orig_dtype)
 
   def testHalfSaturationFused(self):
     x_shape = [2, 2, 3]
@@ -895,7 +949,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.flip_left_right(x_tf)
-      self.assertTrue(y.op.name.startswith('flip_left_right'))
+      self.assertTrue(y.op.name.startswith("flip_left_right"))
       y_tf = y.eval()
       self.assertAllEqual(y_tf, y_np)
 
@@ -906,7 +960,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.random_flip_left_right(x_tf)
-      self.assertTrue(y.op.name.startswith('random_flip_left_right'))
+      self.assertTrue(y.op.name.startswith("random_flip_left_right"))
 
       count_flipped = 0
       count_unflipped = 0
@@ -937,7 +991,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.flip_up_down(x_tf)
-      self.assertTrue(y.op.name.startswith('flip_up_down'))
+      self.assertTrue(y.op.name.startswith("flip_up_down"))
       y_tf = y.eval()
       self.assertAllEqual(y_tf, y_np)
 
@@ -948,7 +1002,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.random_flip_up_down(x_tf)
-      self.assertTrue(y.op.name.startswith('random_flip_up_down'))
+      self.assertTrue(y.op.name.startswith("random_flip_up_down"))
       count_flipped = 0
       count_unflipped = 0
       for _ in range(50):
@@ -978,7 +1032,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.transpose_image(x_tf)
-      self.assertTrue(y.op.name.startswith('transpose_image'))
+      self.assertTrue(y.op.name.startswith("transpose_image"))
       y_tf = y.eval()
       self.assertAllEqual(y_tf, y_np)
 
@@ -1203,7 +1257,7 @@ class PerImageWhiteningTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x = constant_op.constant(x_np, shape=x_shape)
       y = image_ops.per_image_standardization(x)
-      self.assertTrue(y.op.name.startswith('per_image_standardization'))
+      self.assertTrue(y.op.name.startswith("per_image_standardization"))
       y_tf = y.eval()
       self.assertAllClose(y_tf, y_np, atol=1e-4)
 
@@ -1375,9 +1429,10 @@ class CropToBoundingBoxTest(test_util.TensorFlowTestCase):
 
     # Each line is a test configuration:
     #   (offset_height, offset_width, target_height, target_width), err_msg
-    test_config = (([-1, 0, 3, 3], "offset_height must be >= 0"),
-                   ([0, -1, 3, 3], "offset_width must be >= 0"),
-                   ([0, 0, 0, 3], "target_height must be > 0"),
+    test_config = (([-1, 0, 3, 3], "offset_height must be >= 0"), ([
+        0, -1, 3, 3
+    ], "offset_width must be >= 0"), ([0, 0, 0, 3],
+                                      "target_height must be > 0"),
                    ([0, 0, 3, 0], "target_width must be > 0"),
                    ([2, 0, 3, 3], "height must be >= target + offset"),
                    ([0, 2, 3, 3], "width must be >= target + offset"))
@@ -1388,7 +1443,7 @@ class CropToBoundingBoxTest(test_util.TensorFlowTestCase):
   def testNameScope(self):
     image = array_ops.placeholder(dtypes.float32, shape=[55, 66, 3])
     y = image_ops.crop_to_bounding_box(image, 0, 0, 55, 66)
-    self.assertTrue(y.name.startswith('crop_to_bounding_box'))
+    self.assertTrue(y.name.startswith("crop_to_bounding_box"))
 
 
 class CentralCropTest(test_util.TensorFlowTestCase):
@@ -1413,9 +1468,10 @@ class CentralCropTest(test_util.TensorFlowTestCase):
 
   def testCropping(self):
     x_shape = [4, 8, 1]
-    x_np = np.array([[1, 2, 3, 4, 5, 6, 7, 8], [1, 2, 3, 4, 5, 6, 7, 8],
-                     [1, 2, 3, 4, 5, 6, 7, 8], [1, 2, 3, 4, 5, 6, 7, 8]],
-                    dtype=np.int32).reshape(x_shape)
+    x_np = np.array(
+        [[1, 2, 3, 4, 5, 6, 7, 8], [1, 2, 3, 4, 5, 6, 7, 8],
+         [1, 2, 3, 4, 5, 6, 7, 8], [1, 2, 3, 4, 5, 6, 7, 8]],
+        dtype=np.int32).reshape(x_shape)
     y_np = np.array([[3, 4, 5, 6], [3, 4, 5, 6]]).reshape([2, 4, 1])
     with self.test_session(use_gpu=True):
       x = constant_op.constant(x_np, shape=x_shape)
@@ -1432,7 +1488,7 @@ class CentralCropTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x = array_ops.placeholder(shape=x_shape, dtype=dtypes.int32)
       y = image_ops.central_crop(x, 0.33)
-      y_tf = y.eval(feed_dict={x:x_np})
+      y_tf = y.eval(feed_dict={x: x_np})
       self.assertAllEqual(y_tf, y_np)
       self.assertAllEqual(y_tf.shape, y_np.shape)
 
@@ -1471,7 +1527,7 @@ class CentralCropTest(test_util.TensorFlowTestCase):
     x_np = np.ones(x_shape, dtype=np.float32)
     with self.test_session(use_gpu=True):
       y = image_ops.central_crop(x_np, 1.0)
-      self.assertTrue(y.op.name.startswith('central_crop'))
+      self.assertTrue(y.op.name.startswith("central_crop"))
 
 
 class PadToBoundingBoxTest(test_util.TensorFlowTestCase):
@@ -1544,15 +1600,10 @@ class PadToBoundingBoxTest(test_util.TensorFlowTestCase):
     self.assertEqual(y.get_shape().as_list(), post_shape)
 
   def testInt64(self):
-    x = [1, 2, 3,
-         4, 5, 6,
-         7, 8, 9]
+    x = [1, 2, 3, 4, 5, 6, 7, 8, 9]
     x_shape = [3, 3, 1]
 
-    y = [0, 0, 0,
-         1, 2, 3,
-         4, 5, 6,
-         7, 8, 9]
+    y = [0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
     y_shape = [4, 3, 1]
     x = np.array(x).reshape(x_shape)
     y = np.array(y).reshape(y_shape)
@@ -1569,38 +1620,26 @@ class PadToBoundingBoxTest(test_util.TensorFlowTestCase):
     self._assertReturns(x, x_shape, offset_height, offset_width, x, x_shape)
 
   def testPadding(self):
-    x = [1, 2, 3,
-         4, 5, 6,
-         7, 8, 9]
+    x = [1, 2, 3, 4, 5, 6, 7, 8, 9]
     x_shape = [3, 3, 1]
 
     offset_height, offset_width = [1, 0]
-    y = [0, 0, 0,
-         1, 2, 3,
-         4, 5, 6,
-         7, 8, 9]
+    y = [0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
     y_shape = [4, 3, 1]
     self._assertReturns(x, x_shape, offset_height, offset_width, y, y_shape)
 
     offset_height, offset_width = [0, 1]
-    y = [0, 1, 2, 3,
-         0, 4, 5, 6,
-         0, 7, 8, 9]
+    y = [0, 1, 2, 3, 0, 4, 5, 6, 0, 7, 8, 9]
     y_shape = [3, 4, 1]
     self._assertReturns(x, x_shape, offset_height, offset_width, y, y_shape)
 
     offset_height, offset_width = [0, 0]
-    y = [1, 2, 3,
-         4, 5, 6,
-         7, 8, 9,
-         0, 0, 0]
+    y = [1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 0, 0]
     y_shape = [4, 3, 1]
     self._assertReturns(x, x_shape, offset_height, offset_width, y, y_shape)
 
     offset_height, offset_width = [0, 0]
-    y = [1, 2, 3, 0,
-         4, 5, 6, 0,
-         7, 8, 9, 0]
+    y = [1, 2, 3, 0, 4, 5, 6, 0, 7, 8, 9, 0]
     y_shape = [3, 4, 1]
     self._assertReturns(x, x_shape, offset_height, offset_width, y, y_shape)
 
@@ -1632,9 +1671,7 @@ class PadToBoundingBoxTest(test_util.TensorFlowTestCase):
     # Input image has 0-length dimension(s).
     # Each line is a test configuration:
     #   x_shape, target_height, target_width
-    test_config = (([0, 2, 2], 2, 2),
-                   ([2, 0, 2], 2, 2),
-                   ([2, 2, 0], 2, 2))
+    test_config = (([0, 2, 2], 2, 2), ([2, 0, 2], 2, 2), ([2, 2, 0], 2, 2))
     offset_height, offset_width = [0, 0]
     x = []
 
@@ -1679,7 +1716,7 @@ class PadToBoundingBoxTest(test_util.TensorFlowTestCase):
   def testNameScope(self):
     image = array_ops.placeholder(dtypes.float32, shape=[55, 66, 3])
     y = image_ops.pad_to_bounding_box(image, 0, 0, 55, 66)
-    self.assertTrue(y.op.name.startswith('pad_to_bounding_box'))
+    self.assertTrue(y.op.name.startswith("pad_to_bounding_box"))
 
 
 class SelectDistortedCropBoxTest(test_util.TensorFlowTestCase):
@@ -1692,8 +1729,8 @@ class SelectDistortedCropBoxTest(test_util.TensorFlowTestCase):
                               (bounding_box[2] - bounding_box[0]))
 
     image_size_np = np.array(image.shape, dtype=np.int32)
-    bounding_box_np = (np.array(
-        bounding_box, dtype=np.float32).reshape([1, 1, 4]))
+    bounding_box_np = (
+        np.array(bounding_box, dtype=np.float32).reshape([1, 1, 4]))
 
     aspect_ratios = []
     area_ratios = []
@@ -1738,7 +1775,9 @@ class SelectDistortedCropBoxTest(test_util.TensorFlowTestCase):
       y = array_ops.strided_slice(image_tf, begin, begin + size)
 
       for _ in xrange(num_iter):
-        y_tf = y.eval(feed_dict={min_object_covered_placeholder: min_object_covered})
+        y_tf = y.eval(feed_dict={
+            min_object_covered_placeholder: min_object_covered
+        })
         crop_height = y_tf.shape[0]
         crop_width = y_tf.shape[1]
         aspect_ratio = float(crop_width) / float(crop_height)
@@ -1832,7 +1871,8 @@ class SelectDistortedCropBoxTest(test_util.TensorFlowTestCase):
       bounding_box = constant_op.constant(
           [0.0, 0.0, 1.0, 1.0],
           shape=[4],
-          dtype=dtypes.float32,)
+          dtype=dtypes.float32,
+      )
       begin, end, bbox_for_drawing = image_ops.sample_distorted_bounding_box(
           image_size=image_size,
           bounding_boxes=bounding_box,
@@ -1879,13 +1919,15 @@ class SelectDistortedCropBoxTest(test_util.TensorFlowTestCase):
 
 class ResizeImagesTest(test_util.TensorFlowTestCase):
 
-  OPTIONS = [image_ops.ResizeMethod.BILINEAR,
-             image_ops.ResizeMethod.NEAREST_NEIGHBOR,
-             image_ops.ResizeMethod.BICUBIC,
-             image_ops.ResizeMethod.AREA]
+  OPTIONS = [
+      image_ops.ResizeMethod.BILINEAR, image_ops.ResizeMethod.NEAREST_NEIGHBOR,
+      image_ops.ResizeMethod.BICUBIC, image_ops.ResizeMethod.AREA
+  ]
 
-  TYPES = [np.uint8, np.int8, np.uint16, np.int16, np.int32, np.int64,
-           np.float16, np.float32, np.float64]
+  TYPES = [
+      np.uint8, np.int8, np.uint16, np.int16, np.int32, np.int64, np.float16,
+      np.float32, np.float64
+  ]
 
   def _assertShapeInference(self, pre_shape, size, post_shape):
     # Try single image resize
@@ -1913,12 +1955,10 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
     single_shape = [6, 4, 1]
     # This test is also conducted with int8, so 127 is the maximum
     # value that can be used.
-    data = [127, 127, 64, 64,
-            127, 127, 64, 64,
-            64, 64, 127, 127,
-            64, 64, 127, 127,
-            50, 50, 100, 100,
-            50, 50, 100, 100]
+    data = [
+        127, 127, 64, 64, 127, 127, 64, 64, 64, 64, 127, 127, 64, 64, 127, 127,
+        50, 50, 100, 100, 50, 50, 100, 100
+    ]
     target_height = 6
     target_width = 4
 
@@ -1949,12 +1989,10 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
     single_shape = [6, 4, 1]
     # This test is also conducted with int8, so 127 is the maximum
     # value that can be used.
-    data = [127, 127, 64, 64,
-            127, 127, 64, 64,
-            64, 64, 127, 127,
-            64, 64, 127, 127,
-            50, 50, 100, 100,
-            50, 50, 100, 100]
+    data = [
+        127, 127, 64, 64, 127, 127, 64, 64, 64, 64, 127, 127, 64, 64, 127, 127,
+        50, 50, 100, 100, 50, 50, 100, 100
+    ]
     new_size = array_ops.placeholder(dtypes.int32, shape=(2))
 
     img_np = np.array(data, dtype=np.uint8).reshape(img_shape)
@@ -2008,8 +2046,10 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
                                   image_ops.ResizeMethod.BILINEAR)
 
   def testReturnDtype(self):
-    target_shapes = [[6, 4], [3, 2], [array_ops.placeholder(dtypes.int32),
-                                      array_ops.placeholder(dtypes.int32)]]
+    target_shapes = [[6, 4], [3, 2], [
+        array_ops.placeholder(dtypes.int32),
+        array_ops.placeholder(dtypes.int32)
+    ]]
     for nptype in self.TYPES:
       image = array_ops.placeholder(nptype, shape=[1, 6, 4, 1])
       for opt in self.OPTIONS:
@@ -2026,12 +2066,10 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
     img_shape = [1, 6, 4, 1]
     # This test is also conducted with int8, so 127 is the maximum
     # value that can be used.
-    data = [127, 127, 64, 64,
-            127, 127, 64, 64,
-            64, 64, 127, 127,
-            64, 64, 127, 127,
-            50, 50, 100, 100,
-            50, 50, 100, 100]
+    data = [
+        127, 127, 64, 64, 127, 127, 64, 64, 64, 64, 127, 127, 64, 64, 127, 127,
+        50, 50, 100, 100, 50, 50, 100, 100
+    ]
     # Test size where width is specified as a tensor which is a sum
     # of two tensors.
     width_1 = constant_op.constant(1)
@@ -2053,15 +2091,11 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
   def testResizeDown(self):
     # This test is also conducted with int8, so 127 is the maximum
     # value that can be used.
-    data = [127, 127, 64, 64,
-            127, 127, 64, 64,
-            64, 64, 127, 127,
-            64, 64, 127, 127,
-            50, 50, 100, 100,
-            50, 50, 100, 100]
-    expected_data = [127, 64,
-                     64, 127,
-                     50, 100]
+    data = [
+        127, 127, 64, 64, 127, 127, 64, 64, 64, 64, 127, 127, 64, 64, 127, 127,
+        50, 50, 100, 100, 50, 50, 100, 100
+    ]
+    expected_data = [127, 64, 64, 127, 50, 100]
     target_height = 3
     target_width = 2
 
@@ -2087,39 +2121,31 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
 
   def testResizeUpAlignCornersFalse(self):
     img_shape = [1, 3, 2, 1]
-    data = [64, 32,
-            32, 64,
-            50, 100]
+    data = [64, 32, 32, 64, 50, 100]
     target_height = 6
     target_width = 4
     expected_data = {}
     expected_data[image_ops.ResizeMethod.BILINEAR] = [
-        64.0, 48.0, 32.0, 32.0,
-        48.0, 48.0, 48.0, 48.0,
-        32.0, 48.0, 64.0, 64.0,
-        41.0, 61.5, 82.0, 82.0,
-        50.0, 75.0, 100.0, 100.0,
-        50.0, 75.0, 100.0, 100.0]
+        64.0, 48.0, 32.0, 32.0, 48.0, 48.0, 48.0, 48.0, 32.0, 48.0, 64.0, 64.0,
+        41.0, 61.5, 82.0, 82.0, 50.0, 75.0, 100.0, 100.0, 50.0, 75.0, 100.0,
+        100.0
+    ]
     expected_data[image_ops.ResizeMethod.NEAREST_NEIGHBOR] = [
-        64.0, 64.0, 32.0, 32.0,
-        64.0, 64.0, 32.0, 32.0,
-        32.0, 32.0, 64.0, 64.0,
-        32.0, 32.0, 64.0, 64.0,
-        50.0, 50.0, 100.0, 100.0,
-        50.0, 50.0, 100.0, 100.0]
+        64.0, 64.0, 32.0, 32.0, 64.0, 64.0, 32.0, 32.0, 32.0, 32.0, 64.0, 64.0,
+        32.0, 32.0, 64.0, 64.0, 50.0, 50.0, 100.0, 100.0, 50.0, 50.0, 100.0,
+        100.0
+    ]
     expected_data[image_ops.ResizeMethod.AREA] = [
-        64.0, 64.0, 32.0, 32.0,
-        64.0, 64.0, 32.0, 32.0,
-        32.0, 32.0, 64.0, 64.0,
-        32.0, 32.0, 64.0, 64.0,
-        50.0, 50.0, 100.0, 100.0,
-        50.0, 50.0, 100.0, 100.0]
+        64.0, 64.0, 32.0, 32.0, 64.0, 64.0, 32.0, 32.0, 32.0, 32.0, 64.0, 64.0,
+        32.0, 32.0, 64.0, 64.0, 50.0, 50.0, 100.0, 100.0, 50.0, 50.0, 100.0,
+        100.0
+    ]
 
     for nptype in self.TYPES:
       for opt in [
           image_ops.ResizeMethod.BILINEAR,
-          image_ops.ResizeMethod.NEAREST_NEIGHBOR,
-          image_ops.ResizeMethod.AREA]:
+          image_ops.ResizeMethod.NEAREST_NEIGHBOR, image_ops.ResizeMethod.AREA
+      ]:
         with self.test_session(use_gpu=True):
           img_np = np.array(data, dtype=nptype).reshape(img_shape)
           image = constant_op.constant(img_np, shape=img_shape)
@@ -2132,41 +2158,29 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
 
   def testResizeUpAlignCornersTrue(self):
     img_shape = [1, 3, 2, 1]
-    data = [6, 3,
-            3, 6,
-            6, 9]
+    data = [6, 3, 3, 6, 6, 9]
     target_height = 5
     target_width = 4
     expected_data = {}
     expected_data[image_ops.ResizeMethod.BILINEAR] = [
-        6.0, 5.0, 4.0, 3.0,
-        4.5, 4.5, 4.5, 4.5,
-        3.0, 4.0, 5.0, 6.0,
-        4.5, 5.5, 6.5, 7.5,
-        6.0, 7.0, 8.0, 9.0
+        6.0, 5.0, 4.0, 3.0, 4.5, 4.5, 4.5, 4.5, 3.0, 4.0, 5.0, 6.0, 4.5, 5.5,
+        6.5, 7.5, 6.0, 7.0, 8.0, 9.0
     ]
     expected_data[image_ops.ResizeMethod.NEAREST_NEIGHBOR] = [
-        6.0, 6.0, 3.0, 3.0,
-        3.0, 3.0, 6.0, 6.0,
-        3.0, 3.0, 6.0, 6.0,
-        6.0, 6.0, 9.0, 9.0,
-        6.0, 6.0, 9.0, 9.0
+        6.0, 6.0, 3.0, 3.0, 3.0, 3.0, 6.0, 6.0, 3.0, 3.0, 6.0, 6.0, 6.0, 6.0,
+        9.0, 9.0, 6.0, 6.0, 9.0, 9.0
     ]
     # TODO(b/37749740): Improve alignment of ResizeMethod.AREA when
     # align_corners=True.
     expected_data[image_ops.ResizeMethod.AREA] = [
-        6.0, 6.0, 6.0, 3.0,
-        6.0, 6.0, 6.0, 3.0,
-        3.0, 3.0, 3.0, 6.0,
-        3.0, 3.0, 3.0, 6.0,
-        6.0, 6.0, 6.0, 9.0
+        6.0, 6.0, 6.0, 3.0, 6.0, 6.0, 6.0, 3.0, 3.0, 3.0, 3.0, 6.0, 3.0, 3.0,
+        3.0, 6.0, 6.0, 6.0, 6.0, 9.0
     ]
 
     for nptype in self.TYPES:
       for opt in [
           image_ops.ResizeMethod.BILINEAR,
-          image_ops.ResizeMethod.NEAREST_NEIGHBOR,
-          image_ops.ResizeMethod.AREA
+          image_ops.ResizeMethod.NEAREST_NEIGHBOR, image_ops.ResizeMethod.AREA
       ]:
         with self.test_session(use_gpu=True):
           img_np = np.array(data, dtype=nptype).reshape(img_shape)
@@ -2180,23 +2194,21 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
 
   def testResizeUpBicubic(self):
     img_shape = [1, 6, 6, 1]
-    data = [128, 128, 64, 64, 128, 128, 64, 64,
-            64, 64, 128, 128, 64, 64, 128, 128,
-            50, 50, 100, 100, 50, 50, 100, 100,
-            50, 50, 100, 100, 50, 50, 100, 100,
-            50, 50, 100, 100]
+    data = [
+        128, 128, 64, 64, 128, 128, 64, 64, 64, 64, 128, 128, 64, 64, 128, 128,
+        50, 50, 100, 100, 50, 50, 100, 100, 50, 50, 100, 100, 50, 50, 100, 100,
+        50, 50, 100, 100
+    ]
     img_np = np.array(data, dtype=np.uint8).reshape(img_shape)
 
     target_height = 8
     target_width = 8
-    expected_data = [128, 135, 96, 55, 64, 114, 134, 128,
-                     78, 81, 68, 52, 57, 118, 144, 136,
-                     55, 49, 79, 109, 103, 89, 83, 84,
-                     74, 70, 95, 122, 115, 69, 49, 55,
-                     100, 105, 75, 43, 50, 89, 105, 100,
-                     57, 54, 74, 96, 91, 65, 55, 58,
-                     70, 69, 75, 81, 80, 72, 69, 70,
-                     105, 112, 75, 36, 45, 92, 111, 105]
+    expected_data = [
+        128, 135, 96, 55, 64, 114, 134, 128, 78, 81, 68, 52, 57, 118, 144, 136,
+        55, 49, 79, 109, 103, 89, 83, 84, 74, 70, 95, 122, 115, 69, 49, 55, 100,
+        105, 75, 43, 50, 89, 105, 100, 57, 54, 74, 96, 91, 65, 55, 58, 70, 69,
+        75, 81, 80, 72, 69, 70, 105, 112, 75, 36, 45, 92, 111, 105
+    ]
 
     with self.test_session(use_gpu=True):
       image = constant_op.constant(img_np, shape=img_shape)
@@ -2209,20 +2221,17 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
 
   def testResizeDownArea(self):
     img_shape = [1, 6, 6, 1]
-    data = [128, 64, 32, 16, 8, 4,
-            4, 8, 16, 32, 64, 128,
-            128, 64, 32, 16, 8, 4,
-            5, 10, 15, 20, 25, 30,
-            30, 25, 20, 15, 10, 5,
-            5, 10, 15, 20, 25, 30]
+    data = [
+        128, 64, 32, 16, 8, 4, 4, 8, 16, 32, 64, 128, 128, 64, 32, 16, 8, 4, 5,
+        10, 15, 20, 25, 30, 30, 25, 20, 15, 10, 5, 5, 10, 15, 20, 25, 30
+    ]
     img_np = np.array(data, dtype=np.uint8).reshape(img_shape)
 
     target_height = 4
     target_width = 4
-    expected_data = [73, 33, 23, 39,
-                     73, 33, 23, 39,
-                     14, 16, 19, 21,
-                     14, 16, 19, 21]
+    expected_data = [
+        73, 33, 23, 39, 73, 33, 23, 39, 14, 16, 19, 21, 14, 16, 19, 21
+    ]
 
     with self.test_session(use_gpu=True):
       image = constant_op.constant(img_np, shape=img_shape)
@@ -2309,7 +2318,7 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       single_image = array_ops.placeholder(dtypes.float32, shape=[50, 60, 3])
       y = image_ops.resize_images(single_image, [55, 66])
-      self.assertTrue(y.op.name.startswith('resize_images'))
+      self.assertTrue(y.op.name.startswith("resize_images"))
 
 
 class ResizeImageWithCropOrPadTest(test_util.TensorFlowTestCase):
@@ -2382,133 +2391,93 @@ class ResizeImageWithCropOrPadTest(test_util.TensorFlowTestCase):
 
   def testPad(self):
     # Pad even along col.
-    x = [1, 2, 3, 4,
-         5, 6, 7, 8]
+    x = [1, 2, 3, 4, 5, 6, 7, 8]
     x_shape = [2, 4, 1]
 
-    y = [0, 1, 2, 3, 4, 0,
-         0, 5, 6, 7, 8, 0]
+    y = [0, 1, 2, 3, 4, 0, 0, 5, 6, 7, 8, 0]
     y_shape = [2, 6, 1]
 
     self._assertReturns(x, x_shape, y, y_shape)
 
     # Pad odd along col.
-    x = [1, 2, 3, 4,
-         5, 6, 7, 8]
+    x = [1, 2, 3, 4, 5, 6, 7, 8]
     x_shape = [2, 4, 1]
 
-    y = [0, 1, 2, 3, 4, 0, 0,
-         0, 5, 6, 7, 8, 0, 0]
+    y = [0, 1, 2, 3, 4, 0, 0, 0, 5, 6, 7, 8, 0, 0]
     y_shape = [2, 7, 1]
 
     self._assertReturns(x, x_shape, y, y_shape)
 
     # Pad even along row.
-    x = [1, 2, 3, 4,
-         5, 6, 7, 8]
+    x = [1, 2, 3, 4, 5, 6, 7, 8]
     x_shape = [2, 4, 1]
 
-    y = [0, 0, 0, 0,
-         1, 2, 3, 4,
-         5, 6, 7, 8,
-         0, 0, 0, 0]
+    y = [0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0]
     y_shape = [4, 4, 1]
 
     self._assertReturns(x, x_shape, y, y_shape)
 
     # Pad odd along row.
-    x = [1, 2, 3, 4,
-         5, 6, 7, 8]
+    x = [1, 2, 3, 4, 5, 6, 7, 8]
     x_shape = [2, 4, 1]
 
-    y = [0, 0, 0, 0,
-         1, 2, 3, 4,
-         5, 6, 7, 8,
-         0, 0, 0, 0,
-         0, 0, 0, 0]
+    y = [0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 0]
     y_shape = [5, 4, 1]
 
     self._assertReturns(x, x_shape, y, y_shape)
 
   def testCrop(self):
     # Crop even along col.
-    x = [1, 2, 3, 4,
-         5, 6, 7, 8]
+    x = [1, 2, 3, 4, 5, 6, 7, 8]
     x_shape = [2, 4, 1]
 
-    y = [2, 3,
-         6, 7]
+    y = [2, 3, 6, 7]
     y_shape = [2, 2, 1]
 
     self._assertReturns(x, x_shape, y, y_shape)
 
     # Crop odd along col.
-    x = [1, 2, 3, 4,  5,  6,
-         7, 8, 9, 10, 11, 12]
+    x = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
     x_shape = [2, 6, 1]
 
-    y = [2, 3, 4,
-         8, 9, 10]
+    y = [2, 3, 4, 8, 9, 10]
     y_shape = [2, 3, 1]
 
     self._assertReturns(x, x_shape, y, y_shape)
 
     # Crop even along row.
-    x = [1, 2,
-         3, 4,
-         5, 6,
-         7, 8]
+    x = [1, 2, 3, 4, 5, 6, 7, 8]
     x_shape = [4, 2, 1]
 
-    y = [3, 4,
-         5, 6]
+    y = [3, 4, 5, 6]
     y_shape = [2, 2, 1]
 
     self._assertReturns(x, x_shape, y, y_shape)
 
     # Crop odd along row.
-    x = [1,  2,
-         3,  4,
-         5,  6,
-         7,  8,
-         9,  10,
-         11, 12,
-         13, 14,
-         15, 16]
+    x = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
     x_shape = [8, 2, 1]
 
-    y = [3,  4,
-         5,  6,
-         7,  8,
-         9,  10,
-         11, 12]
+    y = [3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
     y_shape = [5, 2, 1]
 
     self._assertReturns(x, x_shape, y, y_shape)
 
   def testCropAndPad(self):
     # Pad along row but crop along col.
-    x = [1, 2, 3, 4,
-         5, 6, 7, 8]
+    x = [1, 2, 3, 4, 5, 6, 7, 8]
     x_shape = [2, 4, 1]
 
-    y = [0, 0,
-         2, 3,
-         6, 7,
-         0, 0]
+    y = [0, 0, 2, 3, 6, 7, 0, 0]
     y_shape = [4, 2, 1]
 
     self._assertReturns(x, x_shape, y, y_shape)
 
     # Crop along row but pad along col.
-    x = [1, 2,
-         3, 4,
-         5, 6,
-         7, 8]
+    x = [1, 2, 3, 4, 5, 6, 7, 8]
     x_shape = [4, 2, 1]
 
-    y = [0, 3, 4, 0,
-         0, 5, 6, 0]
+    y = [0, 3, 4, 0, 0, 5, 6, 0]
     y_shape = [2, 4, 1]
 
     self._assertReturns(x, x_shape, y, y_shape)
@@ -2589,7 +2558,7 @@ class ResizeImageWithCropOrPadTest(test_util.TensorFlowTestCase):
   def testNameScope(self):
     image = array_ops.placeholder(dtypes.float32, shape=[50, 60, 3])
     y = image_ops.resize_image_with_crop_or_pad(image, 55, 66)
-    self.assertTrue(y.op.name.startswith('resize_image_with_crop_or_pad'))
+    self.assertTrue(y.op.name.startswith("resize_image_with_crop_or_pad"))
 
 
 def _SimpleColorRamp():
@@ -2858,8 +2827,8 @@ class GifTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True) as sess:
       gif = io_ops.read_file(filename)
       image = image_ops.decode_gif(gif)
-      with self.assertRaisesRegexp(
-          errors.InvalidArgumentError, "can't process optimized gif"):
+      with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                   "can't process optimized gif"):
         gif, image = sess.run([gif, image])
 
   def testValid(self):
@@ -2921,8 +2890,9 @@ class ConvertImageTest(test_util.TensorFlowTestCase):
       y = image_ops.convert_image_dtype(image, output_dtype)
       self.assertTrue(y.dtype == output_dtype)
       self.assertAllClose(y.eval(), y_np, atol=1e-5)
-      if output_dtype in [dtypes.float32, dtypes.float64,
-                          dtypes.int32, dtypes.int64]:
+      if output_dtype in [
+          dtypes.float32, dtypes.float64, dtypes.int32, dtypes.int64
+      ]:
         y_saturate = image_ops.convert_image_dtype(
             image, output_dtype, saturate=True)
         self.assertTrue(y_saturate.dtype == output_dtype)
@@ -2942,8 +2912,8 @@ class ConvertImageTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       self._convert([0, 255], dtypes.uint8, dtypes.int16, [0, 255 * 128])
       self._convert([0, 32767], dtypes.int16, dtypes.uint8, [0, 255])
-      self._convert([0, 2 ** 32], dtypes.int64, dtypes.int32, [0, 1])
-      self._convert([0, 1], dtypes.int32, dtypes.int64, [0, 2 ** 32])
+      self._convert([0, 2**32], dtypes.int64, dtypes.int32, [0, 1])
+      self._convert([0, 1], dtypes.int32, dtypes.int64, [0, 2**32])
 
   def testConvertBetweenFloat(self):
     # Make sure converting to between float types does nothing interesting
@@ -2964,20 +2934,14 @@ class ConvertImageTest(test_util.TensorFlowTestCase):
   def testConvertBetweenInt16AndInt8(self):
     with self.test_session(use_gpu=True):
       # uint8, uint16
-      self._convert([0, 255 * 256], dtypes.uint16, dtypes.uint8,
-                    [0, 255])
-      self._convert([0, 255], dtypes.uint8, dtypes.uint16,
-                    [0, 255 * 256])
+      self._convert([0, 255 * 256], dtypes.uint16, dtypes.uint8, [0, 255])
+      self._convert([0, 255], dtypes.uint8, dtypes.uint16, [0, 255 * 256])
       # int8, uint16
-      self._convert([0, 127 * 2 * 256], dtypes.uint16, dtypes.int8,
-                    [0, 127])
-      self._convert([0, 127], dtypes.int8, dtypes.uint16,
-                    [0, 127 * 2 * 256])
+      self._convert([0, 127 * 2 * 256], dtypes.uint16, dtypes.int8, [0, 127])
+      self._convert([0, 127], dtypes.int8, dtypes.uint16, [0, 127 * 2 * 256])
       # int16, uint16
-      self._convert([0, 255 * 256], dtypes.uint16, dtypes.int16,
-                    [0, 255 * 128])
-      self._convert([0, 255 * 128], dtypes.int16, dtypes.uint16,
-                    [0, 255 * 256])
+      self._convert([0, 255 * 256], dtypes.uint16, dtypes.int16, [0, 255 * 128])
+      self._convert([0, 255 * 128], dtypes.int16, dtypes.uint16, [0, 255 * 256])
 
 
 class TotalVariationTest(test_util.TensorFlowTestCase):
@@ -3110,20 +3074,17 @@ class TotalVariationTest(test_util.TensorFlowTestCase):
     # The following are the sum of absolute differences between the pixels.
     # sum row dif = (4-1) + (7-2) = 3 + 5 = 8
     # sum col dif = (2-1) + (7-4) = 1 + 3 = 4
-    r = [[1, 2],
-         [4, 7]]
+    r = [[1, 2], [4, 7]]
 
     # Blue color channel.
     # sum row dif = 18 + 29 = 47
     # sum col dif = 7 + 18 = 25
-    g = [[11, 18],
-         [29, 47]]
+    g = [[11, 18], [29, 47]]
 
     # Green color channel.
     # sum row dif = 120 + 193 = 313
     # sum col dif = 47 + 120 = 167
-    b = [[73, 120],
-         [193, 313]]
+    b = [[73, 120], [193, 313]]
 
     # Combine the 3 color channels into a single 3-dim array.
     # The shape is (2, 2, 3) corresponding to (height, width and color).
@@ -3152,9 +3113,7 @@ class TotalVariationTest(test_util.TensorFlowTestCase):
 
     # Combine these 3 images into a single array of shape (3, 2, 2, 3)
     # where the first dimension is for the image-number.
-    multi = np.vstack((a[np.newaxis, :],
-                       b[np.newaxis, :],
-                       c[np.newaxis, :]))
+    multi = np.vstack((a[np.newaxis, :], b[np.newaxis, :], c[np.newaxis, :]))
 
     # Check that TensorFlow correctly calculates the total variation
     # for each image individually and returns the correct array.
@@ -3210,6 +3169,46 @@ class NonMaxSuppressionTest(test_util.TensorFlowTestCase):
           boxes, scores, max_output_size, iou_threshold).eval()
       self.assertAllClose(selected_indices, [3, 0, 5])
 
+  def testInvalidShape(self):
+    # The boxes should be 2D of shape [num_boxes, 4].
+    with self.assertRaisesRegexp(
+        ValueError, 'Shape must be rank 2 but is rank 1'):
+      boxes = constant_op.constant([0.0, 0.0, 1.0, 1.0])
+      scores = constant_op.constant([0.9])
+      selected_indices = image_ops.non_max_suppression(
+          boxes, scores, 3, 0.5)
+
+    with self.assertRaisesRegexp(
+        ValueError, 'Dimension must be 4 but is 3'):
+      boxes = constant_op.constant([[0.0, 0.0, 1.0]])
+      scores = constant_op.constant([0.9])
+      selected_indices = image_ops.non_max_suppression(
+          boxes, scores, 3, 0.5)
+
+    # The scores should be 1D of shape [num_boxes].
+    with self.assertRaisesRegexp(
+        ValueError, 'Shape must be rank 1 but is rank 2'):
+      boxes = constant_op.constant([[0.0, 0.0, 1.0, 1.0]])
+      scores = constant_op.constant([[0.9]])
+      selected_indices = image_ops.non_max_suppression(
+          boxes, scores, 3, 0.5)
+
+    # The max_output_size should be a scaler (0-D).
+    with self.assertRaisesRegexp(
+        ValueError, 'Shape must be rank 0 but is rank 1'):
+      boxes = constant_op.constant([[0.0, 0.0, 1.0, 1.0]])
+      scores = constant_op.constant([0.9])
+      selected_indices = image_ops.non_max_suppression(
+          boxes, scores, [3], 0.5)
+
+    # The iou_threshold should be a scaler (0-D).
+    with self.assertRaisesRegexp(
+        ValueError, 'Shape must be rank 0 but is rank 2'):
+      boxes = constant_op.constant([[0.0, 0.0, 1.0, 1.0]])
+      scores = constant_op.constant([0.9])
+      selected_indices = image_ops.non_max_suppression(
+          boxes, scores, 3, [[0.5]])
+
 
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/python/ops/linalg_grad.py b/tensorflow/python/ops/linalg_grad.py
index 13a32c83d99363e687f7e2365a91c8e453c81c7e..3cbbf3412a2a1bd974354a5819d410b4074ab47d 100644
--- a/tensorflow/python/ops/linalg_grad.py
+++ b/tensorflow/python/ops/linalg_grad.py
@@ -277,20 +277,28 @@ def _SvdGrad(op, grad_s, grad_u, grad_v):
   # https://j-towns.github.io/papers/svd-derivative.pdf
   a = op.inputs[0]
   a_shape = a.get_shape().with_rank_at_least(2)
+  grad_s_mat = array_ops.matrix_diag(grad_s)
 
-  if op.get_attr("compute_uv"):
-    # TODO(rmlarsen): Make this work with complex types.
-    if a.dtype.is_complex:
-      raise NotImplementedError(
-          "SVD gradient is not implemented for complex types and "
-          "compute_uv=True.")
-    grad_u_shape = grad_u.get_shape().with_rank_at_least(2)
-    grad_v_shape = grad_v.get_shape().with_rank_at_least(2)
-    m = a_shape[-2].merge_with(grad_u_shape[-2])
-    n = a_shape[-1].merge_with(grad_v_shape[-2])
-    batch_shape = a_shape[:-2].merge_with(grad_u_shape[:-2]).merge_with(
-        grad_v_shape[:-2])
-    a_shape = batch_shape.concatenate([m, n])
+  if not op.get_attr("compute_uv"):
+    s, u, v = linalg_ops.svd(a, compute_uv=True)
+    grad_a = math_ops.matmul(u, math_ops.matmul(grad_s_mat, v, adjoint_b=True))
+    grad_a.set_shape(a_shape)
+    return grad_a
+
+  full_matrices = op.get_attr("full_matrices")
+
+  # TODO(rmlarsen): Make this work with complex types.
+  if a.dtype.is_complex:
+    raise NotImplementedError(
+        "SVD gradient is not implemented for complex types and "
+        "compute_uv=True.")
+  grad_u_shape = grad_u.get_shape().with_rank_at_least(2)
+  grad_v_shape = grad_v.get_shape().with_rank_at_least(2)
+  m = a_shape[-2].merge_with(grad_u_shape[-2])
+  n = a_shape[-1].merge_with(grad_v_shape[-2])
+  batch_shape = a_shape[:-2].merge_with(grad_u_shape[:-2]).merge_with(
+      grad_v_shape[:-2])
+  a_shape = batch_shape.concatenate([m, n])
 
   m = a_shape[-2].value
   n = a_shape[-1].value
@@ -300,12 +308,9 @@ def _SvdGrad(op, grad_s, grad_u, grad_v):
         "SVD gradient has not been implemented for input with unknown "
         "inner matrix shape.")
 
-  if not op.get_attr("compute_uv"):
-    s, u, v = linalg_ops.svd(a, compute_uv=True, full_matrices=True)
-  else:
-    s = op.outputs[0]
-    u = op.outputs[1]
-    v = op.outputs[2]
+  s = op.outputs[0]
+  u = op.outputs[1]
+  v = op.outputs[2]
 
   use_adjoint = False
   if m > n:
@@ -317,19 +322,7 @@ def _SvdGrad(op, grad_s, grad_u, grad_v):
     grad_u, grad_v = grad_v, grad_u
 
   with ops.control_dependencies([grad_s, grad_u, grad_v]):
-    grad_s_mat = array_ops.matrix_diag(grad_s)
-    if not op.get_attr("compute_uv"):
-      if use_adjoint:
-        grad_a = math_ops.matmul(
-            v[..., :, :m], math_ops.matmul(u, grad_s_mat), adjoint_b=True)
-      else:
-        grad_a = math_ops.matmul(u,
-                                 math_ops.matmul(
-                                     grad_s_mat, v[..., :, :m], adjoint_b=True))
-      grad_a.set_shape(a_shape)
-      return grad_a
-
-    if op.get_attr("full_matrices") and abs(m - n) > 1:
+    if full_matrices and abs(m - n) > 1:
       raise NotImplementedError(
           "svd gradient is not implemented for abs(m - n) > 1 "
           "when full_matrices is True")
@@ -371,7 +364,7 @@ def _SvdGrad(op, grad_s, grad_u, grad_v):
       gv1t_v1 = math_ops.matmul(gv1t, v1)
       term2_nous = gv1t - math_ops.matmul(gv1t_v1, v1, adjoint_b=True)
 
-      if op.get_attr("full_matrices"):
+      if full_matrices:
         v2 = v[..., :, m:n]
         grad_v2 = grad_v[..., :, m:n]
 
diff --git a/tensorflow/python/ops/losses/losses_impl.py b/tensorflow/python/ops/losses/losses_impl.py
index 72508eb4350f57bb06b3829890f92554677c98d5..8b3c61b9339734d6a596d92e93f7a69d32dddd12 100644
--- a/tensorflow/python/ops/losses/losses_impl.py
+++ b/tensorflow/python/ops/losses/losses_impl.py
@@ -28,8 +28,10 @@ from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import weights_broadcast_ops
 from tensorflow.python.ops.losses import util
 from tensorflow.python.util.deprecation import deprecated_args
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("losses.Reduction")
 class Reduction(object):
   """Types of loss reduction.
 
@@ -149,9 +151,10 @@ def _num_present(losses, weights, per_batch=False):
 def _num_elements(losses):
   """Computes the number of elements in `losses` tensor."""
   with ops.name_scope(None, "num_elements", values=[losses]) as scope:
-    return array_ops.size(losses, name=scope, out_type=losses.dtype)
+    return math_ops.cast(array_ops.size(losses, name=scope), dtype=losses.dtype)
 
 
+@tf_export("losses.compute_weighted_loss")
 def compute_weighted_loss(
     losses, weights=1.0, scope=None, loss_collection=ops.GraphKeys.LOSSES,
     reduction=Reduction.SUM_BY_NONZERO_WEIGHTS):
@@ -211,6 +214,7 @@ def compute_weighted_loss(
       return loss
 
 
+@tf_export("losses.absolute_difference")
 def absolute_difference(
     labels, predictions, weights=1.0, scope=None,
     loss_collection=ops.GraphKeys.LOSSES,
@@ -258,6 +262,7 @@ def absolute_difference(
         losses, weights, scope, loss_collection, reduction=reduction)
 
 
+@tf_export("losses.cosine_distance")
 @deprecated_args(None, "dim is deprecated, use axis instead", "dim")
 def cosine_distance(
     labels, predictions, axis=None, weights=1.0, scope=None,
@@ -311,6 +316,7 @@ def cosine_distance(
         losses, weights, scope, loss_collection, reduction=reduction)
 
 
+@tf_export("losses.hinge_loss")
 def hinge_loss(labels, logits, weights=1.0, scope=None,
                loss_collection=ops.GraphKeys.LOSSES,
                reduction=Reduction.SUM_BY_NONZERO_WEIGHTS):
@@ -352,6 +358,7 @@ def hinge_loss(labels, logits, weights=1.0, scope=None,
         losses, weights, scope, loss_collection, reduction=reduction)
 
 
+@tf_export("losses.huber_loss")
 def huber_loss(labels, predictions, weights=1.0, delta=1.0, scope=None,
                loss_collection=ops.GraphKeys.LOSSES,
                reduction=Reduction.SUM_BY_NONZERO_WEIGHTS):
@@ -420,6 +427,7 @@ def huber_loss(labels, predictions, weights=1.0, delta=1.0, scope=None,
         losses, weights, scope, loss_collection, reduction=reduction)
 
 
+@tf_export("losses.log_loss")
 def log_loss(labels, predictions, weights=1.0, epsilon=1e-7, scope=None,
              loss_collection=ops.GraphKeys.LOSSES,
              reduction=Reduction.SUM_BY_NONZERO_WEIGHTS):
@@ -471,6 +479,7 @@ def log_loss(labels, predictions, weights=1.0, epsilon=1e-7, scope=None,
 
 
 # TODO(b/37208492): Add reduction arg.
+@tf_export("losses.mean_pairwise_squared_error")
 def mean_pairwise_squared_error(
     labels, predictions, weights=1.0, scope=None,
     loss_collection=ops.GraphKeys.LOSSES):
@@ -538,12 +547,13 @@ def mean_pairwise_squared_error(
       num_present_per_batch = _num_present(diffs, weights, per_batch=True)
 
       term1 = 2.0 * _safe_div(sum_squares_diff_per_batch,
-                              num_present_per_batch)
+                              num_present_per_batch-1)
 
       sum_diff = math_ops.reduce_sum(
           diffs, reduction_indices=reduction_indices, keep_dims=True)
-      term2 = 2.0 * _safe_div(math_ops.square(sum_diff),
-                              math_ops.square(num_present_per_batch))
+      term2 = 2.0 * _safe_div(
+          math_ops.square(sum_diff),
+          math_ops.multiply(num_present_per_batch, num_present_per_batch-1))
 
       weighted_losses = math_ops.multiply(term1 - term2, weights)
       loss = math_ops.reduce_sum(weighted_losses)
@@ -557,6 +567,7 @@ def mean_pairwise_squared_error(
       return mean_loss
 
 
+@tf_export("losses.mean_squared_error")
 def mean_squared_error(
     labels, predictions, weights=1.0, scope=None,
     loss_collection=ops.GraphKeys.LOSSES,
@@ -604,6 +615,7 @@ def mean_squared_error(
         losses, weights, scope, loss_collection, reduction=reduction)
 
 
+@tf_export("losses.sigmoid_cross_entropy")
 def sigmoid_cross_entropy(
     multi_class_labels, logits, weights=1.0, label_smoothing=0, scope=None,
     loss_collection=ops.GraphKeys.LOSSES,
@@ -662,6 +674,7 @@ def sigmoid_cross_entropy(
         losses, weights, scope, loss_collection, reduction=reduction)
 
 
+@tf_export("losses.softmax_cross_entropy")
 def softmax_cross_entropy(
     onehot_labels, logits, weights=1.0, label_smoothing=0, scope=None,
     loss_collection=ops.GraphKeys.LOSSES,
@@ -771,6 +784,7 @@ def _remove_squeezable_dimensions(
   return labels, predictions, weights
 
 
+@tf_export("losses.sparse_softmax_cross_entropy")
 def sparse_softmax_cross_entropy(
     labels, logits, weights=1.0, scope=None,
     loss_collection=ops.GraphKeys.LOSSES,
diff --git a/tensorflow/python/ops/losses/util.py b/tensorflow/python/ops/losses/util.py
index 3718c481c26afdd9f007ffc22a9e6ec44a1eb10e..b835d963869704f053de6c2f8a75ae1fa72e6a5d 100644
--- a/tensorflow/python/ops/losses/util.py
+++ b/tensorflow/python/ops/losses/util.py
@@ -30,8 +30,10 @@ from __future__ import print_function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("losses.add_loss")
 def add_loss(loss, loss_collection=ops.GraphKeys.LOSSES):
   """Adds a externally defined loss to the collection of losses.
 
@@ -43,6 +45,7 @@ def add_loss(loss, loss_collection=ops.GraphKeys.LOSSES):
     ops.add_to_collection(loss_collection, loss)
 
 
+@tf_export("losses.get_losses")
 def get_losses(scope=None, loss_collection=ops.GraphKeys.LOSSES):
   """Gets the list of losses from the loss_collection.
 
@@ -56,6 +59,7 @@ def get_losses(scope=None, loss_collection=ops.GraphKeys.LOSSES):
   return ops.get_collection(loss_collection, scope)
 
 
+@tf_export("losses.get_regularization_losses")
 def get_regularization_losses(scope=None):
   """Gets the list of regularization losses.
 
@@ -68,6 +72,7 @@ def get_regularization_losses(scope=None):
   return ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES, scope)
 
 
+@tf_export("losses.get_regularization_loss")
 def get_regularization_loss(scope=None, name="total_regularization_loss"):
   """Gets the total regularization loss.
 
@@ -85,6 +90,7 @@ def get_regularization_loss(scope=None, name="total_regularization_loss"):
     return constant_op.constant(0.0)
 
 
+@tf_export("losses.get_total_loss")
 def get_total_loss(add_regularization_losses=True, name="total_loss"):
   """Returns a tensor whose value represents the total loss.
 
diff --git a/tensorflow/python/ops/manip_grad.py b/tensorflow/python/ops/manip_grad.py
new file mode 100644
index 0000000000000000000000000000000000000000..573e8c0a0d4e0b6e5e2ca555e34234d9bea760dc
--- /dev/null
+++ b/tensorflow/python/ops/manip_grad.py
@@ -0,0 +1,32 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Gradients for operators defined in manip_ops.py."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import manip_ops
+
+
+@ops.RegisterGradient("Roll")
+def _RollGrad(op, grad):
+  # The gradient is just the roll reversed
+  shift = op.inputs[1]
+  axis = op.inputs[2]
+  roll_grad = manip_ops.roll(grad, -shift, axis)
+  return roll_grad, None, None
diff --git a/tensorflow/python/ops/manip_ops.py b/tensorflow/python/ops/manip_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..c5f39784f436be361ba0d06c87239ae9b409fb9e
--- /dev/null
+++ b/tensorflow/python/ops/manip_ops.py
@@ -0,0 +1,36 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Operators for manipulating tensors.
+
+@@roll
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.ops import gen_manip_ops as _gen_manip_ops
+from tensorflow.python.util.all_util import remove_undocumented
+
+# pylint: disable=protected-access
+def roll(input, shift, axis):
+  return _gen_manip_ops.roll(input, shift, axis)
+
+roll.__doc__ = _gen_manip_ops.roll.__doc__
+# pylint: enable=protected-access
+
+_allowed_symbols = ['roll']
+
+remove_undocumented(__name__, allowed_exception_list=_allowed_symbols)
diff --git a/tensorflow/python/ops/math_grad.py b/tensorflow/python/ops/math_grad.py
index bca4c665d27f2513ed0029ae0c674f46a060567f..53308484c427e715f649a09f0dbe3f1448f18f5b 100644
--- a/tensorflow/python/ops/math_grad.py
+++ b/tensorflow/python/ops/math_grad.py
@@ -40,15 +40,16 @@ def _SumGrad(op, grad):
   """Gradient for Sum."""
   # Fast path for when reducing to a scalar and ndims is known: adds only
   # Reshape and Tile ops (and possibly a Shape).
-  if op.inputs[0].get_shape().ndims is not None:
+  input_0_shape = op.inputs[0]._shape_tuple()  # pylint: disable=protected-access
+  if input_0_shape is not None:
     axes = tensor_util.constant_value(op.inputs[1])
     if axes is not None:
-      rank = op.inputs[0].get_shape().ndims
+      rank = len(input_0_shape)
       if np.array_equal(axes, np.arange(rank)):  # Reduce all dims.
         grad = array_ops.reshape(grad, [1] * rank)
         # If shape is not fully defined (but rank is), we use Shape.
-        if op.inputs[0].get_shape().is_fully_defined():
-          input_shape = op.inputs[0].get_shape().as_list()
+        if None not in input_0_shape:
+          input_shape = input_0_shape
         else:
           input_shape = array_ops.shape(op.inputs[0])
         return [array_ops.tile(grad, input_shape), None]
@@ -96,9 +97,12 @@ def _MinGrad(op, grad):
 def _MeanGrad(op, grad):
   """Gradient for Mean."""
   sum_grad = _SumGrad(op, grad)[0]
-  input_size = op.inputs[0].get_shape().num_elements()
-  output_size = op.outputs[0].get_shape().num_elements()
-  if input_size is not None and output_size is not None:
+  input_shape = op.inputs[0]._shape_tuple()  # pylint: disable=protected-access
+  output_shape = op.outputs[0]._shape_tuple()  # pylint: disable=protected-access
+  if (input_shape is not None and output_shape is not None and
+      None not in input_shape and None not in output_shape):
+    input_size = np.prod(input_shape)
+    output_size = np.prod(output_shape)
     factor = input_size // max(output_size, 1)
     factor = constant_op.constant(factor, dtype=sum_grad.dtype)
   else:
@@ -106,7 +110,7 @@ def _MeanGrad(op, grad):
     output_shape = array_ops.shape(op.outputs[0])
     factor = _safe_shape_div(
         math_ops.reduce_prod(input_shape), math_ops.reduce_prod(output_shape))
-  return sum_grad / math_ops.cast(factor, sum_grad.dtype), None
+  return math_ops.truediv(sum_grad, math_ops.cast(factor, sum_grad.dtype)), None
 
 
 @ops.RegisterGradient("Prod")
@@ -169,8 +173,7 @@ def _SegmentMeanGrad(op, grad):
       array_ops.shape(op.inputs[1]),
       array_ops.fill(array_ops.expand_dims(input_rank - 1, 0), 1)
   ], 0)
-  ones = array_ops.fill(ones_shape,
-                        constant_op.constant(1, dtype=grad.dtype))
+  ones = array_ops.fill(ones_shape, constant_op.constant(1, dtype=grad.dtype))
   scaled_grad = math_ops.div(grad, math_ops.segment_sum(ones, op.inputs[1]))
   return array_ops.gather(scaled_grad, op.inputs[1]), None
 
@@ -226,16 +229,19 @@ def _SparseSegmentSqrtNWithNumSegmentsGrad(op, grad):
 
 
 def _SegmentMinOrMaxGrad(op, grad, is_sorted):
-  """Gradient for SegmentMin and (unsorted) SegmentMax. They share similar code."""
-  zeros = array_ops.zeros(array_ops.shape(op.inputs[0]),
-                          dtype=op.inputs[0].dtype)
+  """Gradient for SegmentMin and (unsorted) SegmentMax.
+
+  They share similar code.
+  """
+  zeros = array_ops.zeros(
+      array_ops.shape(op.inputs[0]), dtype=op.inputs[0].dtype)
 
   # Get the number of selected (minimum or maximum) elements in each segment.
   gathered_outputs = array_ops.gather(op.outputs[0], op.inputs[1])
   is_selected = math_ops.equal(op.inputs[0], gathered_outputs)
   if is_sorted:
-    num_selected = math_ops.segment_sum(math_ops.cast(is_selected, grad.dtype),
-                                        op.inputs[1])
+    num_selected = math_ops.segment_sum(
+        math_ops.cast(is_selected, grad.dtype), op.inputs[1])
   else:
     num_selected = math_ops.unsorted_segment_sum(
         math_ops.cast(is_selected, grad.dtype), op.inputs[1], op.inputs[2])
@@ -330,7 +336,7 @@ def _SquareGrad(op, grad):
   # Added control dependencies to prevent 2*x from being computed too early.
   with ops.control_dependencies([grad]):
     x = math_ops.conj(x)
-    return grad * (2.0 * x)
+    return math_ops.multiply(grad, math_ops.multiply(x, 2.0))
 
 
 @ops.RegisterGradient("Sqrt")
@@ -532,8 +538,8 @@ def _IgammaGrad(op, grad):
   # and Gamma'(a) can grow large.
   partial_x = math_ops.exp(-x + (a - 1) * math_ops.log(x) - math_ops.lgamma(a))
   # TODO(b/36815900): Mark None return values as NotImplemented
-  return (None,
-          array_ops.reshape(math_ops.reduce_sum(partial_x * grad, rx), sx))
+  return (None, array_ops.reshape(
+      math_ops.reduce_sum(partial_x * grad, rx), sx))
 
 
 @ops.RegisterGradient("Igammac")
@@ -559,15 +565,17 @@ def _BetaincGrad(op, grad):
 
   # Perform operations in log space before summing, because terms
   # can grow large.
-  log_beta = (gen_math_ops.lgamma(a) + gen_math_ops.lgamma(b)
-              - gen_math_ops.lgamma(a + b))
-  partial_x = math_ops.exp(
-      (b - 1) * math_ops.log(1 - x) + (a - 1) * math_ops.log(x) - log_beta)
+  log_beta = (
+      gen_math_ops.lgamma(a) + gen_math_ops.lgamma(b) -
+      gen_math_ops.lgamma(a + b))
+  partial_x = math_ops.exp((b - 1) * math_ops.log(1 - x) +
+                           (a - 1) * math_ops.log(x) - log_beta)
 
   # TODO(b/36815900): Mark None return values as NotImplemented
-  return (None,  # da
-          None,  # db
-          array_ops.reshape(math_ops.reduce_sum(partial_x * grad, rx), sx))
+  return (
+      None,  # da
+      None,  # db
+      array_ops.reshape(math_ops.reduce_sum(partial_x * grad, rx), sx))
 
 
 @ops.RegisterGradient("Zeta")
@@ -731,10 +739,8 @@ def _ShapesFullySpecifiedAndEqual(x, y, grad):
   y_shape = y._shape_tuple()
   grad_shape = grad._shape_tuple()
   # pylint: enable=protected-access
-  return (x_shape == y_shape and
-          x_shape == grad_shape and
-          x_shape is not None and
-          None not in x_shape)
+  return (x_shape == y_shape and x_shape == grad_shape and
+          x_shape is not None and None not in x_shape)
 
 
 @ops.RegisterGradient("Add")
@@ -756,8 +762,12 @@ def _AddGrad(op, grad):
 
 @ops.RegisterGradient("Sub")
 def _SubGrad(op, grad):
+  """Gradient for Sub."""
   x = op.inputs[0]
   y = op.inputs[1]
+  if (isinstance(grad, ops.Tensor) and
+      _ShapesFullySpecifiedAndEqual(x, y, grad)):
+    return grad, -grad
   sx = array_ops.shape(x)
   sy = array_ops.shape(y)
   # pylint: disable=protected-access
@@ -848,10 +858,10 @@ def _RealDivGrad(op, grad):
   x = math_ops.conj(x)
   y = math_ops.conj(y)
   return (array_ops.reshape(
-      math_ops.reduce_sum(math_ops.realdiv(grad, y), rx),
-      sx), array_ops.reshape(
-          math_ops.reduce_sum(grad * math_ops.realdiv(math_ops.realdiv(-x, y), y),
-                              ry), sy))
+      math_ops.reduce_sum(math_ops.realdiv(grad, y), rx), sx),
+          array_ops.reshape(
+              math_ops.reduce_sum(
+                  grad * math_ops.realdiv(math_ops.realdiv(-x, y), y), ry), sy))
 
 
 @ops.RegisterGradient("Pow")
@@ -946,8 +956,8 @@ def _SelectGrad(op, grad):
   c = op.inputs[0]
   x = op.inputs[1]
   zeros = array_ops.zeros_like(x)
-  return (None, array_ops.where(c, grad, zeros),
-          array_ops.where(c, zeros, grad))
+  return (None, array_ops.where(c, grad, zeros), array_ops.where(
+      c, zeros, grad))
 
 
 @ops.RegisterGradient("MatMul")
@@ -1009,21 +1019,20 @@ def _SparseMatMulGrad(op, grad):
   dtype_a = op.inputs[0].dtype
   dtype_b = op.inputs[1].dtype
   if not t_a and not t_b:
-    return (_SparseMatMul(
-        grad, op.inputs[1], dtype_a, transpose_b=True), _SparseMatMul(
-            op.inputs[0], grad, dtype_b, transpose_a=True))
+    return (_SparseMatMul(grad, op.inputs[1], dtype_a, transpose_b=True),
+            _SparseMatMul(op.inputs[0], grad, dtype_b, transpose_a=True))
   elif not t_a and t_b:
-    return (_SparseMatMul(grad, op.inputs[1], dtype_a), _SparseMatMul(
-        grad, op.inputs[0], dtype_b, transpose_a=True))
+    return (_SparseMatMul(grad, op.inputs[1], dtype_a),
+            _SparseMatMul(grad, op.inputs[0], dtype_b, transpose_a=True))
   elif t_a and not t_b:
-    return (_SparseMatMul(
-        op.inputs[1], grad, dtype_a, transpose_b=True),
+    return (_SparseMatMul(op.inputs[1], grad, dtype_a, transpose_b=True),
             _SparseMatMul(op.inputs[0], grad, dtype_b))
   elif t_a and t_b:
     return (_SparseMatMul(
-        op.inputs[1], grad, dtype_a, transpose_a=True,
-        transpose_b=True), _SparseMatMul(
-            grad, op.inputs[0], dtype_b, transpose_a=True, transpose_b=True))
+        op.inputs[1], grad, dtype_a, transpose_a=True, transpose_b=True),
+            _SparseMatMul(
+                grad, op.inputs[0], dtype_b, transpose_a=True,
+                transpose_b=True))
 
 
 @ops.RegisterGradient("Floor")
@@ -1127,8 +1136,8 @@ def _ComplexAbsGrad(op, grad):
   """Returns the gradient of ComplexAbs."""
   # TODO(b/27786104): The cast to complex could be removed once arithmetic
   # supports mixtures of complex64 and real values.
-  return (math_ops.complex(grad, array_ops.zeros_like(grad)) *
-          math_ops.sign(op.inputs[0]))
+  return (math_ops.complex(grad, array_ops.zeros_like(grad)) * math_ops.sign(
+      op.inputs[0]))
 
 
 @ops.RegisterGradient("Cast")
@@ -1158,8 +1167,8 @@ def _CumsumGrad(op, grad):
   exclusive = op.get_attr("exclusive")
   reverse = op.get_attr("reverse")
   return [
-      math_ops.cumsum(
-          grad, axis, exclusive=exclusive, reverse=not reverse), None
+      math_ops.cumsum(grad, axis, exclusive=exclusive, reverse=not reverse),
+      None
   ]
 
 
diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py
index b8e8207bb24ad64d9e07a4585501a10741f5c9ab..9a8ac93de9dcc12c513b5ddd07cca9d863d19b8a 100644
--- a/tensorflow/python/ops/math_ops.py
+++ b/tensorflow/python/ops/math_ops.py
@@ -1841,12 +1841,11 @@ def reduce_logsumexp(input_tensor,
         reduce_sum(
             gen_math_ops.exp(input_tensor - my_max),
             axis,
-            keepdims=True,
-            reduction_indices=reduction_indices)) + my_max
+            keepdims=keepdims,
+            reduction_indices=reduction_indices))
     if not keepdims:
-      if isinstance(axis, int):
-        axis = [axis]
-      result = array_ops.squeeze(result, axis)
+      my_max = array_ops.reshape(my_max, array_ops.shape(result))
+    result += my_max
     return _may_reduce_to_scalar(keepdims, axis, reduction_indices, result)
 
 
diff --git a/tensorflow/python/ops/matmul_benchmark.py b/tensorflow/python/ops/matmul_benchmark.py
index f95cf08de1aaa47550fa344dc9f964c4f812cd68..6e5fe74290a219d07945998be2677176ca693cd9 100644
--- a/tensorflow/python/ops/matmul_benchmark.py
+++ b/tensorflow/python/ops/matmul_benchmark.py
@@ -95,8 +95,8 @@ class MatmulBenchmark(test.Benchmark):
         num_items = n * m * k * 2
         throughput = num_items * num_iters / duration / 1e9
         print('%s %s input_info:%s %d %.4fsec, %.4fGitems/s.' %
-              (device, str(dtype), str(n) + 'x' + str(m) + 'x' + str(k) + ',ta:'
-               + str(transpose_a) + '.tb:' + str(transpose_b), num_iters,
+              (device, str(dtype), str(n) + 'x' + str(m) + 'x' + str(k) +
+               ',ta:' + str(transpose_a) + '.tb:' + str(transpose_b), num_iters,
                duration, throughput))
 
     name_template = ('matmul_{device}_{dtype}_input_info_{inputinfo}')
@@ -112,7 +112,8 @@ class MatmulBenchmark(test.Benchmark):
     return duration
 
   def run_test_gpu(self, n, m, k, transpose_a, transpose_b, dtype, num_iters):
-    self.run_graph(test.gpu_device_name(), n, m, k, transpose_a, transpose_b, num_iters, dtype)
+    self.run_graph(test.gpu_device_name(), n, m, k, transpose_a, transpose_b,
+                   num_iters, dtype)
 
   def test_round(self, num_iters):
     dtypes = [np.float32, np.float64]
@@ -124,8 +125,8 @@ class MatmulBenchmark(test.Benchmark):
         self.run_test_gpu(n, m, k, transpose_a, transpose_b, dtype, num_iters)
 
       for n, m, k, (transpose_a, transpose_b) in itertools.product(
-          [200], [1, 8, 20], [10000], [(False, False), (True, False), (False,
-                                                                       True)]):
+          [200], [1, 8, 20], [10000], [(False, False), (True, False),
+                                       (False, True)]):
         self.run_test_gpu(n, m, k, transpose_a, transpose_b, dtype, num_iters)
 
       for (n, m, k), (transpose_a, transpose_b) in itertools.product(
diff --git a/tensorflow/python/ops/matmul_benchmark_test.py b/tensorflow/python/ops/matmul_benchmark_test.py
index 5a9c0a7a4951bbbc1d201f6fbc557e9a996a3655..3df0c66ef9c50909dd8c03b75654d6cf0fd7d709 100644
--- a/tensorflow/python/ops/matmul_benchmark_test.py
+++ b/tensorflow/python/ops/matmul_benchmark_test.py
@@ -33,11 +33,11 @@ def BuildGraphTest(n, m, k, transpose_a, transpose_b, dtype):
 
   def Test(self):
     if not googletest.is_gpu_available():
-      tf_logging.info("Skipping BuildGraphTest %s", (n, m, k, transpose_a,
-                                                     transpose_b))
+      tf_logging.info("Skipping BuildGraphTest %s",
+                      (n, m, k, transpose_a, transpose_b))
       return
-    tf_logging.info("Testing BuildGraphTest %s", (n, m, k, transpose_a,
-                                                  transpose_b))
+    tf_logging.info("Testing BuildGraphTest %s",
+                    (n, m, k, transpose_a, transpose_b))
     self._VerifyBuildGraph(n, m, k, transpose_a, transpose_b, dtype)
 
   return Test
@@ -47,11 +47,11 @@ def RunGraphTest(n, m, k, transpose_a, transpose_b, dtype):
 
   def Test(self):
     if not googletest.is_gpu_available():
-      tf_logging.info("Skipping RunGraphTest %s", (n, m, k, transpose_a,
-                                                   transpose_b))
+      tf_logging.info("Skipping RunGraphTest %s",
+                      (n, m, k, transpose_a, transpose_b))
       return
-    tf_logging.info("Testing RunGraphTest %s", (n, m, k, transpose_a,
-                                                transpose_b))
+    tf_logging.info("Testing RunGraphTest %s",
+                    (n, m, k, transpose_a, transpose_b))
     self._VerifyRunGraph(n, m, k, transpose_a, transpose_b, dtype)
 
   return Test
@@ -71,40 +71,41 @@ class MatmulBenchmarkTest(googletest.TestCase):
   def _VerifyBuildGraph(self, n, m, k, transpose_a, transpose_b, dtype):
     graph = ops.Graph()
     with graph.as_default():
-      matmul_benchmark.build_graph(googletest.gpu_device_name(), n, m, k, transpose_a, transpose_b,
-                                   dtype)
+      matmul_benchmark.build_graph(googletest.gpu_device_name(), n, m, k,
+                                   transpose_a, transpose_b, dtype)
       gd = graph.as_graph_def()
-      dev=googletest.gpu_device_name()
+      dev = googletest.gpu_device_name()
       proto_expected = """
-      node { name: "random_uniform/shape" op: "Const" device: \""""+ dev +"""\" }
-      node { name: "random_uniform/min" op: "Const" device: \""""+ dev +"""\" }
-      node { name: "random_uniform/max" op: "Const" device: \""""+ dev +"""\" }
-      node { name: "random_uniform/RandomUniform" op: "RandomUniform" input: "random_uniform/shape" device: \""""+ dev +"""\" }
-      node { name: "random_uniform/sub" op: "Sub" input: "random_uniform/max" input: "random_uniform/min" device: \""""+ dev +"""\" }
-      node { name: "random_uniform/mul" op: "Mul" input: "random_uniform/RandomUniform" input: "random_uniform/sub" device: \""""+ dev +"""\" }
-      node { name: "random_uniform" op: "Add" input: "random_uniform/mul" input: "random_uniform/min" device: \""""+ dev +"""\" }
-      node { name: "Variable" op: "VariableV2" device: \""""+ dev +"""\" }
-      node { name: "Variable/Assign" op: "Assign" input: "Variable" input: "random_uniform" device: \""""+ dev +"""\" }
-      node { name: "Variable/read" op: "Identity" input: "Variable" device: \""""+ dev +"""\" }
-      node { name: "random_uniform_1/shape" op: "Const" device: \""""+ dev +"""\" }
-      node { name: "random_uniform_1/min" op: "Const" device: \""""+ dev +"""\" }
-      node { name: "random_uniform_1/max" op: "Const" device: \""""+ dev +"""\" }
-      node { name: "random_uniform_1/RandomUniform" op: "RandomUniform" input: "random_uniform_1/shape" device: \""""+ dev +"""\" }
-      node { name: "random_uniform_1/sub" op: "Sub" input: "random_uniform_1/max" input: "random_uniform_1/min" device: \""""+ dev +"""\" }
-      node { name: "random_uniform_1/mul" op: "Mul" input: "random_uniform_1/RandomUniform" input: "random_uniform_1/sub" device: \""""+ dev +"""\" }
-      node { name: "random_uniform_1" op: "Add" input: "random_uniform_1/mul" input: "random_uniform_1/min" device: \""""+ dev +"""\" }
-      node { name: "Variable_1" op: "VariableV2" device: \""""+ dev +"""\" }
-      node { name: "Variable_1/Assign" op: "Assign" input: "Variable_1" input: "random_uniform_1" device: \""""+ dev +"""\" }
-      node { name: "Variable_1/read" op: "Identity" input: "Variable_1" device: \""""+ dev +"""\" }
-      node { name: "MatMul" op: "MatMul" input: "Variable/read" input: "Variable_1/read" device: \""""+ dev +"""\" }
-      node { name: "group_deps" op: "NoOp" input: "^MatMul" device: \""""+ dev +"""\" }
+      node { name: "random_uniform/shape" op: "Const" device: \"""" + dev + """\" }
+      node { name: "random_uniform/min" op: "Const" device: \"""" + dev + """\" }
+      node { name: "random_uniform/max" op: "Const" device: \"""" + dev + """\" }
+      node { name: "random_uniform/RandomUniform" op: "RandomUniform" input: "random_uniform/shape" device: \"""" + dev + """\" }
+      node { name: "random_uniform/sub" op: "Sub" input: "random_uniform/max" input: "random_uniform/min" device: \"""" + dev + """\" }
+      node { name: "random_uniform/mul" op: "Mul" input: "random_uniform/RandomUniform" input: "random_uniform/sub" device: \"""" + dev + """\" }
+      node { name: "random_uniform" op: "Add" input: "random_uniform/mul" input: "random_uniform/min" device: \"""" + dev + """\" }
+      node { name: "Variable" op: "VariableV2" device: \"""" + dev + """\" }
+      node { name: "Variable/Assign" op: "Assign" input: "Variable" input: "random_uniform" device: \"""" + dev + """\" }
+      node { name: "Variable/read" op: "Identity" input: "Variable" device: \"""" + dev + """\" }
+      node { name: "random_uniform_1/shape" op: "Const" device: \"""" + dev + """\" }
+      node { name: "random_uniform_1/min" op: "Const" device: \"""" + dev + """\" }
+      node { name: "random_uniform_1/max" op: "Const" device: \"""" + dev + """\" }
+      node { name: "random_uniform_1/RandomUniform" op: "RandomUniform" input: "random_uniform_1/shape" device: \"""" + dev + """\" }
+      node { name: "random_uniform_1/sub" op: "Sub" input: "random_uniform_1/max" input: "random_uniform_1/min" device: \"""" + dev + """\" }
+      node { name: "random_uniform_1/mul" op: "Mul" input: "random_uniform_1/RandomUniform" input: "random_uniform_1/sub" device: \"""" + dev + """\" }
+      node { name: "random_uniform_1" op: "Add" input: "random_uniform_1/mul" input: "random_uniform_1/min" device: \"""" + dev + """\" }
+      node { name: "Variable_1" op: "VariableV2" device: \"""" + dev + """\" }
+      node { name: "Variable_1/Assign" op: "Assign" input: "Variable_1" input: "random_uniform_1" device: \"""" + dev + """\" }
+      node { name: "Variable_1/read" op: "Identity" input: "Variable_1" device: \"""" + dev + """\" }
+      node { name: "MatMul" op: "MatMul" input: "Variable/read" input: "Variable_1/read" device: \"""" + dev + """\" }
+      node { name: "group_deps" op: "NoOp" input: "^MatMul" device: \"""" + dev + """\" }
                        """
       self.assertProtoEquals(str(proto_expected), self._StripGraph(gd))
 
   def _VerifyRunGraph(self, n, m, k, transpose_a, transpose_b, dtype):
     benchmark_instance = matmul_benchmark.MatmulBenchmark()
-    duration = benchmark_instance.run_graph(googletest.gpu_device_name(), n, m, k, transpose_a,
-                                            transpose_b, 1, dtype)
+    duration = benchmark_instance.run_graph(googletest.gpu_device_name(), n, m,
+                                            k, transpose_a, transpose_b, 1,
+                                            dtype)
     self.assertTrue(duration > 1e-6)
 
 
@@ -113,8 +114,8 @@ if __name__ == "__main__":
   index = 0
   for _dtype in dtypes:
     for _n, _m, (_transpose_a, _transpose_b) in itertools.product(
-        [512, 1024], [1, 8, 16, 128], [(False, False), (True, False), (False,
-                                                                       True)]):
+        [512, 1024], [1, 8, 16, 128], [(False, False), (True, False),
+                                       (False, True)]):
       _k = _n
       setattr(MatmulBenchmarkTest, "testBuildGraph_" + str(index),
               BuildGraphTest(_n, _m, _k, _transpose_a, _transpose_b, _dtype))
diff --git a/tensorflow/python/ops/nn_batchnorm_test.py b/tensorflow/python/ops/nn_batchnorm_test.py
index fc013b565b764f0d22df29f99e78cb97498c5ced..eebfb17085a568f48769f6df7dddd3ae2f799efc 100644
--- a/tensorflow/python/ops/nn_batchnorm_test.py
+++ b/tensorflow/python/ops/nn_batchnorm_test.py
@@ -21,10 +21,8 @@ from __future__ import print_function
 import numpy as np
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
-from tensorflow.core.framework import graph_pb2
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import importer
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
@@ -40,15 +38,6 @@ from tensorflow.python.platform import test
 @test_util.with_c_api
 class BatchNormalizationTest(test.TestCase):
 
-  def SetProducerVersion(self, graph, producer_version):
-    # The C API doesn't expose altering GraphDefVersions. We can indirectly set
-    # it via import_graph_def though.
-    graph_def = graph_pb2.GraphDef()
-    graph_def.versions.producer = producer_version
-    with graph.as_default():
-      importer.import_graph_def(graph_def)
-    assert graph.graph_def_versions.producer, producer_version
-
   def _npBatchNorm(self, x, m, v, beta, gamma, epsilon,
                    scale_after_normalization, shift_after_normalization):
     y = (x - m) / np.sqrt(v + epsilon)
@@ -65,7 +54,7 @@ class BatchNormalizationTest(test.TestCase):
   def _tfBatchNormV1(self, x, m, v, beta, gamma, epsilon,
                      scale_after_normalization):
     """Original implementation."""
-    self.SetProducerVersion(ops.get_default_graph(), 8)
+    test_util.set_producer_version(ops.get_default_graph(), 8)
     return gen_nn_ops._batch_norm_with_global_normalization(
         x, m, v, beta, gamma, epsilon, scale_after_normalization)
     # pylint: enable=protected-access
@@ -233,7 +222,7 @@ class BatchNormalizationTest(test.TestCase):
         epsilon = 0.001
         for scale_after_normalization in [True, False]:
           # _batch_norm_with_global_normalization_grad is deprecated in v9
-          self.SetProducerVersion(ops.get_default_graph(), 8)
+          test_util.set_producer_version(ops.get_default_graph(), 8)
           grad = gen_nn_ops._batch_norm_with_global_normalization_grad(
               x, m, v, gamma, backprop, epsilon, scale_after_normalization)
           dx, dm, dv, db, dg = grad
diff --git a/tensorflow/python/ops/nn_fused_batchnorm_test.py b/tensorflow/python/ops/nn_fused_batchnorm_test.py
index 0593ed2cfa64eca59ca02904ca71b4fd4936af1b..a08b836025d12178ab7acfbd70fcc7a47bc99532 100644
--- a/tensorflow/python/ops/nn_fused_batchnorm_test.py
+++ b/tensorflow/python/ops/nn_fused_batchnorm_test.py
@@ -278,7 +278,8 @@ class BatchNormalizationTest(test.TestCase):
         epsilon = y.op.get_attr('epsilon')
         data_format = y.op.get_attr('data_format')
         grad_vals = sess.run([grad_x, grad_scale, grad_offset])
-        grad_internal = nn_grad._BatchNormGrad(grad_y, x, scale, pop_mean, pop_var, epsilon, data_format)
+        grad_internal = nn_grad._BatchNormGrad(grad_y, x, scale, pop_mean,
+                                               pop_var, epsilon, data_format)
         grad_internal_vals = sess.run(list(grad_internal))
         for grad_val, grad_internal_val in zip(grad_vals, grad_internal_vals):
           self.assertAllClose(grad_val, grad_internal_val, atol=err_tolerance)
diff --git a/tensorflow/python/ops/nn_grad.py b/tensorflow/python/ops/nn_grad.py
index cfff73774b5e585ed702369b9a74ff34e0a5febb..5e6cafd6aad4a80d3436d9c270ac5e2341c2a5aa 100644
--- a/tensorflow/python/ops/nn_grad.py
+++ b/tensorflow/python/ops/nn_grad.py
@@ -89,52 +89,63 @@ def _Conv2DBackpropFilterGrad(op, grad):
 @ops.RegisterGradient("Conv3D")
 def _Conv3DGrad(op, grad):
   data_format = op.get_attr("data_format")
-  return [nn_ops.conv3d_backprop_input_v2(array_ops.shape(op.inputs[0]),
-                                          op.inputs[1],
-                                          grad,
-                                          strides=op.get_attr("strides"),
-                                          padding=op.get_attr("padding"),
-                                          data_format=data_format),
-          nn_ops.conv3d_backprop_filter_v2(op.inputs[0],
-                                           array_ops.shape(op.inputs[1]),
-                                           grad,
-                                           strides=op.get_attr("strides"),
-                                           padding=op.get_attr("padding"),
-                                           data_format=data_format)]
+  return [
+      nn_ops.conv3d_backprop_input_v2(
+          array_ops.shape(op.inputs[0]),
+          op.inputs[1],
+          grad,
+          strides=op.get_attr("strides"),
+          padding=op.get_attr("padding"),
+          data_format=data_format),
+      nn_ops.conv3d_backprop_filter_v2(
+          op.inputs[0],
+          array_ops.shape(op.inputs[1]),
+          grad,
+          strides=op.get_attr("strides"),
+          padding=op.get_attr("padding"),
+          data_format=data_format)
+  ]
 
 
 @ops.RegisterGradient("Conv3DBackpropInputV2")
 def _Conv3DBackpropInputGrad(op, grad):
   data_format = op.get_attr("data_format")
-  return [None,
-          nn_ops.conv3d_backprop_filter_v2(grad,
-                                           array_ops.shape(op.inputs[1]),
-                                           op.inputs[2],
-                                           strides=op.get_attr("strides"),
-                                           padding=op.get_attr("padding"),
-                                           data_format=data_format),
-          nn_ops.conv3d(grad,
-                        op.inputs[1],
-                        strides=op.get_attr("strides"),
-                        padding=op.get_attr("padding"),
-                        data_format=data_format)]
+  return [
+      None,
+      nn_ops.conv3d_backprop_filter_v2(
+          grad,
+          array_ops.shape(op.inputs[1]),
+          op.inputs[2],
+          strides=op.get_attr("strides"),
+          padding=op.get_attr("padding"),
+          data_format=data_format),
+      nn_ops.conv3d(
+          grad,
+          op.inputs[1],
+          strides=op.get_attr("strides"),
+          padding=op.get_attr("padding"),
+          data_format=data_format)
+  ]
 
 
 @ops.RegisterGradient("Conv3DBackpropFilterV2")
 def _Conv3DBackpropFilterGrad(op, grad):
   data_format = op.get_attr("data_format")
-  return [nn_ops.conv3d_backprop_input_v2(array_ops.shape(op.inputs[0]),
-                                          grad,
-                                          op.inputs[2],
-                                          strides=op.get_attr("strides"),
-                                          padding=op.get_attr("padding"),
-                                          data_format=data_format),
-          None,
-          nn_ops.conv3d(op.inputs[0],
-                        grad,
-                        strides=op.get_attr("strides"),
-                        padding=op.get_attr("padding"),
-                        data_format=data_format)]
+  return [
+      nn_ops.conv3d_backprop_input_v2(
+          array_ops.shape(op.inputs[0]),
+          grad,
+          op.inputs[2],
+          strides=op.get_attr("strides"),
+          padding=op.get_attr("padding"),
+          data_format=data_format), None,
+      nn_ops.conv3d(
+          op.inputs[0],
+          grad,
+          strides=op.get_attr("strides"),
+          padding=op.get_attr("padding"),
+          data_format=data_format)
+  ]
 
 
 @ops.RegisterGradient("AvgPool3D")
@@ -150,12 +161,13 @@ def _AvgPool3DGrad(op, grad):
 
 @ops.RegisterGradient("AvgPool3DGrad")
 def _AvgPool3DGradGrad(op, grad):
-  return (array_ops.stop_gradient(op.inputs[0]), gen_nn_ops.avg_pool3d(
-      grad,
-      op.get_attr("ksize"),
-      op.get_attr("strides"),
-      op.get_attr("padding"),
-      data_format=op.get_attr("data_format")))
+  return (array_ops.stop_gradient(op.inputs[0]),
+          gen_nn_ops.avg_pool3d(
+              grad,
+              op.get_attr("ksize"),
+              op.get_attr("strides"),
+              op.get_attr("padding"),
+              data_format=op.get_attr("data_format")))
 
 
 @ops.RegisterGradient("MaxPool3D")
@@ -173,9 +185,9 @@ def _MaxPool3DGrad(op, grad):
 @ops.RegisterGradient("MaxPool3DGrad")
 def _MaxPool3DGradGrad(op, grad):
   return (array_ops.zeros(
-      shape=array_ops.shape(op.inputs[0]),
-      dtype=op.inputs[0].dtype), array_ops.zeros(
-          shape=array_ops.shape(op.inputs[1]), dtype=op.inputs[1].dtype),
+      shape=array_ops.shape(op.inputs[0]), dtype=op.inputs[0].dtype),
+          array_ops.zeros(
+              shape=array_ops.shape(op.inputs[1]), dtype=op.inputs[1].dtype),
           gen_nn_ops._max_pool3d_grad_grad(
               op.inputs[0],
               op.inputs[1],
@@ -189,9 +201,9 @@ def _MaxPool3DGradGrad(op, grad):
 @ops.RegisterGradient("MaxPool3DGradGrad")
 def _MaxPool3DGradGradGrad(op, grad):
   return (array_ops.zeros(
-      shape=array_ops.shape(op.inputs[0]),
-      dtype=op.inputs[0].dtype), array_ops.zeros(
-          shape=array_ops.shape(op.inputs[1]), dtype=op.inputs[1].dtype),
+      shape=array_ops.shape(op.inputs[0]), dtype=op.inputs[0].dtype),
+          array_ops.zeros(
+              shape=array_ops.shape(op.inputs[1]), dtype=op.inputs[1].dtype),
           gen_nn_ops._max_pool3d_grad(
               op.inputs[0],
               op.inputs[1],
@@ -272,8 +284,9 @@ def _BiasAddGrad(op, received_grad):
     data_format = op.get_attr("data_format")
   except ValueError:
     data_format = None
-  return (received_grad, gen_nn_ops.bias_add_grad(out_backprop=received_grad,
-                                                  data_format=data_format))
+  return (received_grad,
+          gen_nn_ops.bias_add_grad(
+              out_backprop=received_grad, data_format=data_format))
 
 
 @ops.RegisterGradient("BiasAddGrad")
@@ -346,10 +359,9 @@ def _ReluGrad(op, grad):
 def _EluGradGrad(op, grad):
   elu_x = op.inputs[1]
   return (gen_nn_ops._elu_grad(grad, op.outputs[0]),
-          array_ops.where(elu_x < 0,
-                          grad * op.inputs[0],
-                          array_ops.zeros(shape=array_ops.shape(elu_x),
-                                          dtype=elu_x.dtype)))
+          array_ops.where(elu_x < 0, grad * op.inputs[0],
+                          array_ops.zeros(
+                              shape=array_ops.shape(elu_x), dtype=elu_x.dtype)))
 
 
 @ops.RegisterGradient("SeluGrad")
@@ -357,9 +369,11 @@ def _SeluGradGrad(op, grad):
   x = op.inputs[1]
   scale_alpha = 1.7580993408473768599402175208123
   return (gen_nn_ops._elu_grad(grad, op.outputs[0]),
-          array_ops.where(
-              x < 0., gen_nn_ops._elu_grad(grad, op.outputs[0] + scale_alpha),
-              array_ops.zeros(shape=array_ops.shape(x), dtype=x.dtype)))
+          array_ops.where(x < 0.,
+                          gen_nn_ops._elu_grad(grad,
+                                               op.outputs[0] + scale_alpha),
+                          array_ops.zeros(
+                              shape=array_ops.shape(x), dtype=x.dtype)))
 
 
 @ops.RegisterGradient("Relu6")
@@ -370,8 +384,8 @@ def _Relu6Grad(op, grad):
 @ops.RegisterGradient("Relu6Grad")
 def _Relu6GradGrad(op, grad):
   x = op.inputs[1]
-  return (gen_nn_ops._relu6_grad(grad, x), array_ops.zeros(
-      shape=array_ops.shape(x), dtype=x.dtype))
+  return (gen_nn_ops._relu6_grad(grad, x),
+          array_ops.zeros(shape=array_ops.shape(x), dtype=x.dtype))
 
 
 @ops.RegisterGradient("Elu")
@@ -410,8 +424,8 @@ def _SoftsignGrad(op, grad):
 @ops.RegisterGradient("ReluGrad")
 def _ReluGradGrad(op, grad):
   x = op.inputs[1]
-  return (gen_nn_ops._relu_grad(grad, x), array_ops.zeros(
-      shape=array_ops.shape(x), dtype=x.dtype))
+  return (gen_nn_ops._relu_grad(grad, x),
+          array_ops.zeros(shape=array_ops.shape(x), dtype=x.dtype))
 
 
 def _BroadcastMul(vec, mat):
@@ -455,8 +469,8 @@ def _SoftmaxCrossEntropyWithLogitsGrad(op, grad_loss, grad_grad):
     softmax = nn_ops.softmax(logits)
 
     grad += ((grad_grad - array_ops.squeeze(
-        math_ops.matmul(grad_grad[:, None, :],
-                        softmax[:, :, None]), axis=1)) * softmax)
+        math_ops.matmul(grad_grad[:, None, :], softmax[:, :, None]), axis=1)) *
+             softmax)
 
   return grad, _BroadcastMul(grad_loss, -nn_ops.log_softmax(logits))
 
@@ -473,7 +487,8 @@ def _SparseSoftmaxCrossEntropyWithLogitsGrad(op, grad_0, _):
   # so we make sure we prevent silently incorrect results by raising
   # an error if the second derivative is requested via prevent_gradient.
   sparse_softmax_grad_without_gradient = array_ops.prevent_gradient(
-      op.outputs[1], message="Currently there is no way to take the second "
+      op.outputs[1],
+      message="Currently there is no way to take the second "
       "derivative of sparse_softmax_cross_entropy_with_logits due to the fused "
       "implementation's interaction with tf.gradients()")
   return _BroadcastMul(grad_0, sparse_softmax_grad_without_gradient), None
@@ -531,14 +546,16 @@ def _DepthwiseConv2dNativeGrad(op, grad):
 
 @ops.RegisterGradient("Dilation2D")
 def _Dilation2DGrad(op, grad):
-  return [nn_ops.dilation2d_backprop_input(op.inputs[0], op.inputs[1], grad,
-                                           op.get_attr("strides"),
-                                           op.get_attr("rates"),
-                                           op.get_attr("padding")),
-          nn_ops.dilation2d_backprop_filter(op.inputs[0], op.inputs[1], grad,
-                                            op.get_attr("strides"),
-                                            op.get_attr("rates"),
-                                            op.get_attr("padding"))]
+  return [
+      nn_ops.dilation2d_backprop_input(op.inputs[0], op.inputs[1], grad,
+                                       op.get_attr("strides"),
+                                       op.get_attr("rates"),
+                                       op.get_attr("padding")),
+      nn_ops.dilation2d_backprop_filter(op.inputs[0], op.inputs[1], grad,
+                                        op.get_attr("strides"),
+                                        op.get_attr("rates"),
+                                        op.get_attr("padding"))
+  ]
 
 
 @ops.RegisterGradient("LRN")
@@ -547,8 +564,10 @@ def _LRNGrad(op, grad):
   bias = op.get_attr("bias")
   alpha = op.get_attr("alpha")
   beta = op.get_attr("beta")
-  return [gen_nn_ops._lrn_grad(grad, op.inputs[0], op.outputs[0], depth_radius,
-                               bias, alpha, beta)]
+  return [
+      gen_nn_ops._lrn_grad(grad, op.inputs[0], op.outputs[0], depth_radius,
+                           bias, alpha, beta)
+  ]
 
 
 @ops.RegisterGradient("AvgPool")
@@ -564,54 +583,58 @@ def _AvgPoolGrad(op, grad):
 
 @ops.RegisterGradient("AvgPoolGrad")
 def _AvgPoolGradGrad(op, grad):
-  return (array_ops.stop_gradient(op.inputs[0]), gen_nn_ops._avg_pool(
-      grad,
-      op.get_attr("ksize"),
-      op.get_attr("strides"),
-      op.get_attr("padding"),
-      data_format=op.get_attr("data_format")))
+  return (array_ops.stop_gradient(op.inputs[0]),
+          gen_nn_ops._avg_pool(
+              grad,
+              op.get_attr("ksize"),
+              op.get_attr("strides"),
+              op.get_attr("padding"),
+              data_format=op.get_attr("data_format")))
 
 
 @ops.RegisterGradient("MaxPool")
 def _MaxPoolGrad(op, grad):
-  return gen_nn_ops._max_pool_grad(op.inputs[0],
-                                   op.outputs[0],
-                                   grad,
-                                   op.get_attr("ksize"),
-                                   op.get_attr("strides"),
-                                   padding=op.get_attr("padding"),
-                                   data_format=op.get_attr("data_format"))
+  return gen_nn_ops._max_pool_grad(
+      op.inputs[0],
+      op.outputs[0],
+      grad,
+      op.get_attr("ksize"),
+      op.get_attr("strides"),
+      padding=op.get_attr("padding"),
+      data_format=op.get_attr("data_format"))
 
 
 @ops.RegisterGradient("MaxPoolV2")
 def _MaxPoolGradV2(op, grad):
   ksize = op.inputs[1]
   strides = op.inputs[2]
-  return gen_nn_ops.max_pool_grad_v2(op.inputs[0],
-                                     op.outputs[0],
-                                     grad,
-                                     ksize,
-                                     strides,
-                                     padding=op.get_attr("padding"),
-                                     data_format=op.get_attr("data_format")), None, None
+  return gen_nn_ops.max_pool_grad_v2(
+      op.inputs[0],
+      op.outputs[0],
+      grad,
+      ksize,
+      strides,
+      padding=op.get_attr("padding"),
+      data_format=op.get_attr("data_format")), None, None
 
 
 @ops.RegisterGradient("MaxPoolWithArgmax")
 def _MaxPoolGradWithArgmax(op, grad, unused_argmax_grad):
-  return gen_nn_ops._max_pool_grad_with_argmax(op.inputs[0],
-                                               grad,
-                                               op.outputs[1],
-                                               op.get_attr("ksize"),
-                                               op.get_attr("strides"),
-                                               padding=op.get_attr("padding"))
+  return gen_nn_ops._max_pool_grad_with_argmax(
+      op.inputs[0],
+      grad,
+      op.outputs[1],
+      op.get_attr("ksize"),
+      op.get_attr("strides"),
+      padding=op.get_attr("padding"))
 
 
 @ops.RegisterGradient("MaxPoolGrad")
 def _MaxPoolGradGrad(op, grad):
   return (array_ops.zeros(
-      shape=array_ops.shape(op.inputs[0]),
-      dtype=op.inputs[0].dtype), array_ops.zeros(
-          shape=array_ops.shape(op.inputs[1]), dtype=op.inputs[1].dtype),
+      shape=array_ops.shape(op.inputs[0]), dtype=op.inputs[0].dtype),
+          array_ops.zeros(
+              shape=array_ops.shape(op.inputs[1]), dtype=op.inputs[1].dtype),
           gen_nn_ops._max_pool_grad_grad(
               op.inputs[0],
               op.inputs[1],
@@ -627,9 +650,9 @@ def _MaxPoolGradGradV2(op, grad):
   ksize = op.inputs[3]
   strides = op.inputs[4]
   return (array_ops.zeros(
-      shape=array_ops.shape(op.inputs[0]),
-      dtype=op.inputs[0].dtype), array_ops.zeros(
-          shape=array_ops.shape(op.inputs[1]), dtype=op.inputs[1].dtype),
+      shape=array_ops.shape(op.inputs[0]), dtype=op.inputs[0].dtype),
+          array_ops.zeros(
+              shape=array_ops.shape(op.inputs[1]), dtype=op.inputs[1].dtype),
           gen_nn_ops.max_pool_grad_grad_v2(
               op.inputs[0],
               op.inputs[1],
@@ -643,9 +666,9 @@ def _MaxPoolGradGradV2(op, grad):
 @ops.RegisterGradient("MaxPoolGradGrad")
 def _MaxPoolGradGradGrad(op, grad):
   return (array_ops.zeros(
-      shape=array_ops.shape(op.inputs[0]),
-      dtype=op.inputs[0].dtype), array_ops.zeros(
-          shape=array_ops.shape(op.inputs[1]), dtype=op.inputs[1].dtype),
+      shape=array_ops.shape(op.inputs[0]), dtype=op.inputs[0].dtype),
+          array_ops.zeros(
+              shape=array_ops.shape(op.inputs[1]), dtype=op.inputs[1].dtype),
           gen_nn_ops._max_pool_grad(
               op.inputs[0],
               op.inputs[1],
@@ -674,10 +697,9 @@ def _FractionalMaxPoolGrad(op, grad_0, unused_grad_1, unused_grad_2):
     Input backprop for FractionalMaxPool op.
   """
   # pylint: disable=protected-access
-  return gen_nn_ops._fractional_max_pool_grad(op.inputs[0], op.outputs[0],
-                                              grad_0, op.outputs[1],
-                                              op.outputs[2],
-                                              op.get_attr("overlapping"))
+  return gen_nn_ops._fractional_max_pool_grad(
+      op.inputs[0], op.outputs[0], grad_0, op.outputs[1], op.outputs[2],
+      op.get_attr("overlapping"))
 
 
 @ops.RegisterGradient("FractionalAvgPool")
@@ -761,8 +783,9 @@ def _BaseFusedBatchNormGrad(op, use_v2, *grad):
   epsilon = op.get_attr("epsilon")
   data_format = op.get_attr("data_format")
   is_training = op.get_attr("is_training")
-  grad_fun = (gen_nn_ops.fused_batch_norm_grad_v2 if use_v2
-              else gen_nn_ops.fused_batch_norm_grad)
+  grad_fun = (
+      gen_nn_ops.fused_batch_norm_grad_v2
+      if use_v2 else gen_nn_ops.fused_batch_norm_grad)
   if is_training:
     return grad_fun(
         grad_y,
@@ -786,7 +809,7 @@ def _BaseFusedBatchNormGrad(op, use_v2, *grad):
         pop_mean,
         pop_var,
         epsilon=epsilon,
-        data_format='NHWC',
+        data_format="NHWC",
         is_training=is_training)
     if data_format == b"NCHW":
       dx = array_ops.transpose(dx, [0, 3, 1, 2])
@@ -803,18 +826,28 @@ def _FusedBatchNormV2Grad(op, *grad):
   return _BaseFusedBatchNormGrad(op, True, *grad)
 
 
-def _BatchNormGrad(grad_y, x, scale, pop_mean, pop_var, epsilon, data_format, is_training=True):
+def _BatchNormGrad(grad_y,
+                   x,
+                   scale,
+                   pop_mean,
+                   pop_var,
+                   epsilon,
+                   data_format,
+                   is_training=True):
   """Returns the gradients for the 3 inputs of BatchNorm.
 
   Args:
     grad_y: A `Tensor` of 4 dimensions for gradient for y.
     x: A `Tensor` of 4 dimensions for x.
     scale: A `Tensor` of 1 dimension for scaling.
-    pop_mean: A `Tensor` of 1 dimension for the population mean. Only used when is_training=False.
-    pop_var: A `Tensor` of 1 dimension for the population variance. Only used when is_training=False.
+    pop_mean: A `Tensor` of 1 dimension for the population mean. Only used when
+      is_training=False.
+    pop_var: A `Tensor` of 1 dimension for the population variance. Only used
+      when is_training=False.
     epsilon: A small float number added to the variance of x.
     data_format: The data format for input. Either b"NHWC" or b"NCHW".
-    is_training: A bool value to indicate the operation is for training (default)
+    is_training: A bool value to indicate the operation is for training
+      (default)
         or inference.
 
   Returns:
@@ -900,7 +933,7 @@ def _FusedBatchNormGradGrad(op, *grad):
   grad_grad_scale = grad[1]
   grad_grad_offset = grad[2]
   grad_x, grad_scale, grad_offset = _BatchNormGrad(
-        grad_y, x, scale, pop_mean, pop_var, epsilon, data_format, is_training)
+      grad_y, x, scale, pop_mean, pop_var, epsilon, data_format, is_training)
   grad_initial = [grad_grad_x, grad_grad_scale, grad_grad_offset]
   grad_grad_y, grad_x, grad_scale = gradients_impl.gradients(
       [grad_x, grad_scale, grad_offset], [grad_y, x, scale], grad_initial)
@@ -954,14 +987,15 @@ def _TopKGrad(op, grad, _):
 
   # Substitute grad to appropriate locations and fill the rest with zeros,
   # finally reshaping it to the original input shape.
-  return [array_ops.reshape(
-      sparse_ops.sparse_to_dense(ind,
-                                 array_ops.reshape(
-                                     math_ops.reduce_prod(in_shape), [1]),
-                                 array_ops.reshape(grad, [-1]),
-                                 validate_indices=False),
-      in_shape), array_ops.zeros(
-          [], dtype=dtypes.int32)]
+  return [
+      array_ops.reshape(
+          sparse_ops.sparse_to_dense(
+              ind,
+              array_ops.reshape(math_ops.reduce_prod(in_shape), [1]),
+              array_ops.reshape(grad, [-1]),
+              validate_indices=False), in_shape),
+      array_ops.zeros([], dtype=dtypes.int32)
+  ]
 
 
 @ops.RegisterGradient("NthElement")
@@ -983,11 +1017,9 @@ def _NthElementGrad(op, grad):
   # dimension. If there are multiple elements then the gradient will be
   # divided between them.
   indicators = math_ops.cast(
-      math_ops.equal(array_ops.expand_dims(output, -1), input),
-      grad.dtype)
+      math_ops.equal(array_ops.expand_dims(output, -1), input), grad.dtype)
 
   grad = array_ops.expand_dims(grad, -1)
-  num_selected = array_ops.expand_dims(
-      math_ops.reduce_sum(indicators, -1), -1)
+  num_selected = array_ops.expand_dims(math_ops.reduce_sum(indicators, -1), -1)
 
   return [math_ops.div(indicators, num_selected) * grad, None]
diff --git a/tensorflow/python/ops/nn_grad_test.py b/tensorflow/python/ops/nn_grad_test.py
index f7541c0e892819beaf27ad97d7d41b8f963a4ab9..aa7539ae9f09163bf1a2cc9f7dfcc6fc06737ae8 100644
--- a/tensorflow/python/ops/nn_grad_test.py
+++ b/tensorflow/python/ops/nn_grad_test.py
@@ -30,17 +30,20 @@ from tensorflow.python.platform import test
 
 
 class Relu6OpTest(test.TestCase):
+
   def testRelu6GradGrad(self):
-    inputs = constant_op.constant([[-2, -1, 1, 3], [5, 7, 8, 9]],
-                                  dtype=dtypes.float32)
+    inputs = constant_op.constant(
+        [[-2, -1, 1, 3], [5, 7, 8, 9]], dtype=dtypes.float32)
     x_init_value = np.array([[-3.5, -1.5, 2, 4], [4.5, 7.5, 8.5, 11]])
     r = nn_ops.relu6(inputs)
     r_g = gradients_impl.gradients(r, inputs)[0]
     with self.test_session():
       error = gradient_checker.compute_gradient_error(
-        inputs, inputs.get_shape().as_list(),
-        r_g, r_g.get_shape().as_list(),
-        x_init_value=x_init_value)
+          inputs,
+          inputs.get_shape().as_list(),
+          r_g,
+          r_g.get_shape().as_list(),
+          x_init_value=x_init_value)
       self.assertLess(error, 1e-4)
 
 
diff --git a/tensorflow/python/ops/nn_impl.py b/tensorflow/python/ops/nn_impl.py
index 3268fd0e0ac312dc8a15e9fef8a14f540dcb55e1..55fcd176d62009b9c29afb763dc20daf78cdb5d9 100644
--- a/tensorflow/python/ops/nn_impl.py
+++ b/tensorflow/python/ops/nn_impl.py
@@ -969,7 +969,8 @@ def _compute_sampled_logits(weights,
                             subtract_log_q=True,
                             remove_accidental_hits=False,
                             partition_strategy="mod",
-                            name=None):
+                            name=None,
+                            seed=None):
   """Helper function for nce_loss and sampled_softmax_loss functions.
 
   Computes sampled output training logits and labels suitable for implementing
@@ -1007,6 +1008,8 @@ def _compute_sampled_logits(weights,
         if `len(weights) > 1`. Currently `"div"` and `"mod"` are supported.
         Default is `"mod"`. See `tf.nn.embedding_lookup` for more details.
     name: A name for the operation (optional).
+    seed: random seed for candidate sampling. Default to None, which doesn't set
+        the op-level random seed for candidate sampling.
   Returns:
     out_logits: `Tensor` object with shape
         `[batch_size, num_true + num_sampled]`, for passing to either
@@ -1036,7 +1039,8 @@ def _compute_sampled_logits(weights,
           num_true=num_true,
           num_sampled=num_sampled,
           unique=True,
-          range_max=num_classes)
+          range_max=num_classes,
+          seed=seed)
     # NOTE: pylint cannot tell that 'sampled_values' is a sequence
     # pylint: disable=unpacking-non-sequence
     sampled, true_expected_count, sampled_expected_count = (
@@ -1255,7 +1259,8 @@ def sampled_softmax_loss(weights,
                          sampled_values=None,
                          remove_accidental_hits=True,
                          partition_strategy="mod",
-                         name="sampled_softmax_loss"):
+                         name="sampled_softmax_loss",
+                         seed=None):
   """Computes and returns the sampled softmax training loss.
 
   This is a faster way to train a softmax classifier over a huge number of
@@ -1316,6 +1321,8 @@ def sampled_softmax_loss(weights,
         if `len(weights) > 1`. Currently `"div"` and `"mod"` are supported.
         Default is `"mod"`. See `tf.nn.embedding_lookup` for more details.
     name: A name for the operation (optional).
+    seed: random seed for candidate sampling. Default to None, which doesn't set
+        the op-level random seed for candidate sampling.
 
   Returns:
     A `batch_size` 1-D tensor of per-example sampled softmax losses.
@@ -1333,7 +1340,8 @@ def sampled_softmax_loss(weights,
       subtract_log_q=True,
       remove_accidental_hits=remove_accidental_hits,
       partition_strategy=partition_strategy,
-      name=name)
+      name=name,
+      seed=seed)
   sampled_losses = nn_ops.softmax_cross_entropy_with_logits(
       labels=labels, logits=logits)
   # sampled_losses is a [batch_size] tensor.
diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py
index 32b14f86b567ce26334c1594e9ac6f00afd5b9d1..a691e281ee7f0ce4ee3253069b095a88df5723c5 100644
--- a/tensorflow/python/ops/nn_ops.py
+++ b/tensorflow/python/ops/nn_ops.py
@@ -41,15 +41,19 @@ from tensorflow.python.ops.gen_nn_ops import *
 from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
-
 # Aliases for some automatically-generated names.
 local_response_normalization = gen_nn_ops.lrn
 
 # pylint: disable=protected-access
 
 
-def _non_atrous_convolution(input, filter, padding, data_format=None,  # pylint: disable=redefined-builtin
-                            strides=None, name=None):
+def _non_atrous_convolution(
+    input,
+    filter,
+    padding,
+    data_format=None,  # pylint: disable=redefined-builtin
+    strides=None,
+    name=None):
   """Computes sums of N-D convolutions (actually cross correlation).
 
   It is required that 1 <= N <= 3.
@@ -94,12 +98,13 @@ def _non_atrous_convolution(input, filter, padding, data_format=None,  # pylint:
     input_shape = input.get_shape()
     filter = ops.convert_to_tensor(filter, name="filter")
     filter_shape = filter.get_shape()
-    op = _NonAtrousConvolution(input_shape,
-                               filter_shape=filter_shape,
-                               padding=padding,
-                               data_format=data_format,
-                               strides=strides,
-                               name=scope)
+    op = _NonAtrousConvolution(
+        input_shape,
+        filter_shape=filter_shape,
+        padding=padding,
+        data_format=data_format,
+        strides=strides,
+        name=scope)
     return op(input, filter)
 
 
@@ -119,11 +124,14 @@ class _NonAtrousConvolution(object):
     name: see _non_atrous_convolution.
   """
 
-  def __init__(self,
-               input_shape,
-               filter_shape,  # pylint: disable=redefined-builtin
-               padding, data_format=None,
-               strides=None, name=None):
+  def __init__(
+      self,
+      input_shape,
+      filter_shape,  # pylint: disable=redefined-builtin
+      padding,
+      data_format=None,
+      strides=None,
+      name=None):
     filter_shape = filter_shape.with_rank(input_shape.ndims)
     self.padding = padding
     self.name = name
@@ -137,8 +145,8 @@ class _NonAtrousConvolution(object):
     if strides is None:
       strides = [1] * conv_dims
     elif len(strides) != conv_dims:
-      raise ValueError("len(strides)=%d, but should be %d" %
-                       (len(strides), conv_dims))
+      raise ValueError("len(strides)=%d, but should be %d" % (len(strides),
+                                                              conv_dims))
     if conv_dims == 1:
       # conv1d uses the 2-d data format names
       if data_format is None or data_format == "NWC":
@@ -177,8 +185,14 @@ class _NonAtrousConvolution(object):
   # those for gen_nn_ops.conv2d and gen_nn_ops.conv3d.
   # pylint: disable=redefined-builtin
   def _conv1d(self, input, filter, strides, padding, data_format, name):
-    return conv1d(value=input, filters=filter, stride=strides, padding=padding,
-                  data_format=data_format, name=name)
+    return conv1d(
+        value=input,
+        filters=filter,
+        stride=strides,
+        padding=padding,
+        data_format=data_format,
+        name=name)
+
   # pylint: enable=redefined-builtin
 
   def __call__(self, inp, filter):  # pylint: disable=redefined-builtin
@@ -340,13 +354,14 @@ def with_space_to_batch(
   def build_op(num_spatial_dims, padding):
     return lambda inp, _: op(inp, num_spatial_dims, padding)
 
-  new_op = _WithSpaceToBatch(input_shape,
-                             dilation_rate,
-                             padding,
-                             build_op,
-                             filter_shape=filter_shape,
-                             spatial_dims=spatial_dims,
-                             data_format=data_format)
+  new_op = _WithSpaceToBatch(
+      input_shape,
+      dilation_rate,
+      padding,
+      build_op,
+      filter_shape=filter_shape,
+      spatial_dims=spatial_dims,
+      data_format=data_format)
   return new_op(input, None)
 
 
@@ -377,9 +392,8 @@ class _WithSpaceToBatch(object):
                spatial_dims=None,
                data_format=None):
     """Helper class for _with_space_to_batch."""
-    dilation_rate = ops.convert_to_tensor(dilation_rate,
-                                          dtypes.int32,
-                                          name="dilation_rate")
+    dilation_rate = ops.convert_to_tensor(
+        dilation_rate, dtypes.int32, name="dilation_rate")
     try:
       rate_shape = dilation_rate.get_shape().with_rank(1)
     except ValueError:
@@ -439,9 +453,7 @@ class _WithSpaceToBatch(object):
       if const_filter_shape is not None:
         filter_shape = const_filter_shape
         self.base_paddings = _with_space_to_batch_base_paddings(
-            const_filter_shape,
-            num_spatial_dims,
-            rate_or_const_rate)
+            const_filter_shape, num_spatial_dims, rate_or_const_rate)
       else:
         self.num_spatial_dims = num_spatial_dims
         self.rate_or_const_rate = rate_or_const_rate
@@ -478,9 +490,7 @@ class _WithSpaceToBatch(object):
       # shape was not fully defined.
       filter_shape = array_ops.shape(filter)
       base_paddings = _with_space_to_batch_base_paddings(
-          filter_shape,
-          self.num_spatial_dims,
-          self.rate_or_const_rate)
+          filter_shape, self.num_spatial_dims, self.rate_or_const_rate)
     paddings, crops = array_ops.required_space_to_batch_paddings(
         input_shape=input_spatial_shape,
         base_paddings=base_paddings,
@@ -491,9 +501,7 @@ class _WithSpaceToBatch(object):
     paddings = _with_space_to_batch_adjust(paddings, 0, spatial_dims)
     crops = _with_space_to_batch_adjust(crops, 0, spatial_dims)
     input_converted = array_ops.space_to_batch_nd(
-        input=inp,
-        block_shape=dilation_rate,
-        paddings=paddings)
+        input=inp, block_shape=dilation_rate, paddings=paddings)
 
     result = self.op(input_converted, filter)
 
@@ -519,17 +527,17 @@ def _with_space_to_batch_base_paddings(filter_shape, num_spatial_dims,
   # Spatial dimensions of the filters and the upsampled filters in which we
   # introduce (rate - 1) zeros between consecutive filter values.
   filter_spatial_shape = filter_shape[:num_spatial_dims]
-  dilated_filter_spatial_shape = (filter_spatial_shape +
-                                  (filter_spatial_shape - 1) *
-                                  (rate_or_const_rate - 1))
+  dilated_filter_spatial_shape = (
+      filter_spatial_shape + (filter_spatial_shape - 1) *
+      (rate_or_const_rate - 1))
   pad_extra_shape = dilated_filter_spatial_shape - 1
 
   # When full_padding_shape is odd, we pad more at end, following the same
   # convention as conv2d.
   pad_extra_start = pad_extra_shape // 2
   pad_extra_end = pad_extra_shape - pad_extra_start
-  base_paddings = array_ops.stack([[pad_extra_start[i], pad_extra_end[i]]
-                                   for i in range(num_spatial_dims)])
+  base_paddings = array_ops.stack(
+      [[pad_extra_start[i], pad_extra_end[i]] for i in range(num_spatial_dims)])
   return base_paddings
 
 
@@ -623,8 +631,8 @@ def _get_strides_and_dilation_rate(num_spatial_dims, strides, dilation_rate):
   if strides is None:
     strides = [1] * num_spatial_dims
   elif len(strides) != num_spatial_dims:
-    raise ValueError("len(strides)=%d but should be %d" %
-                     (len(strides), num_spatial_dims))
+    raise ValueError("len(strides)=%d but should be %d" % (len(strides),
+                                                           num_spatial_dims))
   strides = np.array(strides, dtype=np.int32)
   if np.any(strides < 1):
     raise ValueError("all values of strides must be positive")
@@ -636,9 +644,14 @@ def _get_strides_and_dilation_rate(num_spatial_dims, strides, dilation_rate):
 
 
 @tf_export("nn.convolution")
-def convolution(input, filter,  # pylint: disable=redefined-builtin
-                padding, strides=None, dilation_rate=None,
-                name=None, data_format=None):
+def convolution(
+    input,
+    filter,  # pylint: disable=redefined-builtin
+    padding,
+    strides=None,
+    dilation_rate=None,
+    name=None,
+    data_format=None):
   # pylint: disable=line-too-long
   """Computes sums of N-D convolutions (actually cross-correlation).
 
@@ -757,12 +770,14 @@ def convolution(input, filter,  # pylint: disable=redefined-builtin
     input_shape = input.get_shape()
     filter = ops.convert_to_tensor(filter, name="filter")
     filter_shape = filter.get_shape()
-    op = Convolution(input_shape,
-                     filter_shape,
-                     padding,
-                     strides=strides,
-                     dilation_rate=dilation_rate,
-                     name=name, data_format=data_format)
+    op = Convolution(
+        input_shape,
+        filter_shape,
+        padding,
+        strides=strides,
+        dilation_rate=dilation_rate,
+        name=name,
+        data_format=data_format)
     return op(input, filter)
 
 
@@ -786,8 +801,11 @@ class Convolution(object):
   def __init__(self,
                input_shape,
                filter_shape,
-               padding, strides=None, dilation_rate=None,
-               name=None, data_format=None):
+               padding,
+               strides=None,
+               dilation_rate=None,
+               name=None,
+               data_format=None):
     """Helper function for convolution."""
     num_total_dims = filter_shape.ndims
     if num_total_dims is None:
@@ -809,17 +827,17 @@ class Convolution(object):
 
     if data_format is None or not data_format.startswith("NC"):
       input_channels_dim = input_shape[num_spatial_dims + 1]
-      spatial_dims = range(1, num_spatial_dims+1)
+      spatial_dims = range(1, num_spatial_dims + 1)
     else:
       input_channels_dim = input_shape[1]
-      spatial_dims = range(2, num_spatial_dims+2)
+      spatial_dims = range(2, num_spatial_dims + 2)
 
-    if not input_channels_dim.is_compatible_with(filter_shape[
-        num_spatial_dims]):
+    if not input_channels_dim.is_compatible_with(
+        filter_shape[num_spatial_dims]):
       raise ValueError(
           "number of input channels does not match corresponding dimension of "
-          "filter, {} != {}".format(input_channels_dim, filter_shape[
-              num_spatial_dims]))
+          "filter, {} != {}".format(input_channels_dim,
+                                    filter_shape[num_spatial_dims]))
 
     strides, dilation_rate = _get_strides_and_dilation_rate(
         num_spatial_dims, strides, dilation_rate)
@@ -852,14 +870,15 @@ class Convolution(object):
 
 
 @tf_export("nn.pool")
-def pool(input,  # pylint: disable=redefined-builtin
-         window_shape,
-         pooling_type,
-         padding,
-         dilation_rate=None,
-         strides=None,
-         name=None,
-         data_format=None):
+def pool(
+    input,  # pylint: disable=redefined-builtin
+    window_shape,
+    pooling_type,
+    padding,
+    dilation_rate=None,
+    strides=None,
+    name=None,
+    data_format=None):
   # pylint: disable=line-too-long
   """Performs an N-D pooling operation.
 
@@ -941,8 +960,8 @@ def pool(input,  # pylint: disable=redefined-builtin
 
   """
   # pylint: enable=line-too-long
-  with ops.name_scope(name, "%s_pool" %
-                      (pooling_type.lower()), [input]) as scope:
+  with ops.name_scope(name, "%s_pool" % (pooling_type.lower()),
+                      [input]) as scope:
     input = ops.convert_to_tensor(input, name="input")
 
     num_spatial_dims = len(window_shape)
@@ -963,17 +982,18 @@ def pool(input,  # pylint: disable=redefined-builtin
           "strides > window_shape not supported due to inconsistency between "
           "CPU and GPU implementations")
 
-    pooling_ops = {("MAX", 1): max_pool,
-                   ("MAX", 2): max_pool,
-                   ("MAX", 3): max_pool3d,  # pylint: disable=undefined-variable
-                   ("AVG", 1): avg_pool,
-                   ("AVG", 2): avg_pool,
-                   ("AVG", 3): avg_pool3d,  # pylint: disable=undefined-variable
-                  }
+    pooling_ops = {
+        ("MAX", 1): max_pool,
+        ("MAX", 2): max_pool,
+        ("MAX", 3): max_pool3d,  # pylint: disable=undefined-variable
+        ("AVG", 1): avg_pool,
+        ("AVG", 2): avg_pool,
+        ("AVG", 3): avg_pool3d,  # pylint: disable=undefined-variable
+    }
     op_key = (pooling_type, num_spatial_dims)
     if op_key not in pooling_ops:
-      raise ValueError("%d-D %s pooling is not supported." %
-                       (op_key[1], op_key[0]))
+      raise ValueError("%d-D %s pooling is not supported." % (op_key[1],
+                                                              op_key[0]))
 
     if data_format is None or not data_format.startswith("NC"):
       adjusted_window_shape = [1] + list(window_shape) + [1]
@@ -1000,12 +1020,13 @@ def pool(input,  # pylint: disable=redefined-builtin
       if num_spatial_dims == 1:
         converted_input = array_ops.expand_dims(converted_input,
                                                 spatial_dims[0])
-      result = pooling_ops[op_key](converted_input,
-                                   adjusted_window_shape,
-                                   adjusted_strides,
-                                   converted_padding,
-                                   name=scope,
-                                   **data_format_kwargs)
+      result = pooling_ops[op_key](
+          converted_input,
+          adjusted_window_shape,
+          adjusted_strides,
+          converted_padding,
+          name=scope,
+          **data_format_kwargs)
       if num_spatial_dims == 1:
         result = array_ops.squeeze(result, [spatial_dims[0]])
       return result
@@ -1021,7 +1042,9 @@ def pool(input,  # pylint: disable=redefined-builtin
 
 @tf_export("nn.atrous_conv2d")
 def atrous_conv2d(value, filters, rate, padding, name=None):
-  """Atrous convolution (a.k.a. convolution with holes or dilated convolution).
+  """Atrous convolution (a.k.a.
+
+  convolution with holes or dilated convolution).
 
   This function is a simpler wrapper around the more general
   @{tf.nn.convolution}, and exists only for backwards compatibility. You can
@@ -1065,7 +1088,8 @@ def atrous_conv2d(value, filters, rate, padding, name=None):
   that effectively use atrous convolution in different ways are, among others,
   [OverFeat: Integrated Recognition, Localization and Detection using
   Convolutional Networks](http://arxiv.org/abs/1312.6229) and [Fast Image
-  Scanning with Deep Max-Pooling Convolutional Neural Networks](http://arxiv.org/abs/1302.1700).
+  Scanning with Deep Max-Pooling Convolutional Neural
+  Networks](http://arxiv.org/abs/1302.1700).
   Atrous convolution is also closely related to the so-called noble identities
   in multi-rate signal processing.
 
@@ -1156,13 +1180,14 @@ def atrous_conv2d(value, filters, rate, padding, name=None):
 
 
 @tf_export("nn.conv2d_transpose")
-def conv2d_transpose(value,
-                     filter,  # pylint: disable=redefined-builtin
-                     output_shape,
-                     strides,
-                     padding="SAME",
-                     data_format="NHWC",
-                     name=None):
+def conv2d_transpose(
+    value,
+    filter,  # pylint: disable=redefined-builtin
+    output_shape,
+    strides,
+    padding="SAME",
+    data_format="NHWC",
+    name=None):
   """The transpose of `conv2d`.
 
   This operation is sometimes called "deconvolution" after [Deconvolutional
@@ -1207,15 +1232,16 @@ def conv2d_transpose(value,
 
     output_shape_ = ops.convert_to_tensor(output_shape, name="output_shape")
     if not output_shape_.get_shape().is_compatible_with(tensor_shape.vector(4)):
-      raise ValueError("output_shape must have shape (4,), got {}"
-                       .format(output_shape_.get_shape()))
+      raise ValueError("output_shape must have shape (4,), got {}".format(
+          output_shape_.get_shape()))
 
     if isinstance(output_shape, (list, np.ndarray)):
       # output_shape's shape should be == [4] if reached this point.
       if not filter.get_shape()[2].is_compatible_with(output_shape[axis]):
         raise ValueError(
             "output_shape does not match filter's output channels, "
-            "{} != {}".format(output_shape[axis], filter.get_shape()[2]))
+            "{} != {}".format(output_shape[axis],
+                              filter.get_shape()[2]))
 
     if padding != "VALID" and padding != "SAME":
       raise ValueError("padding must be either VALID or SAME:"
@@ -1281,29 +1307,32 @@ def atrous_conv2d_transpose(value,
     if not value.get_shape()[3].is_compatible_with(filters.get_shape()[3]):
       raise ValueError(
           "value's input channels does not match filters' input channels, "
-          "{} != {}".format(value.get_shape()[3], filters.get_shape()[3]))
+          "{} != {}".format(value.get_shape()[3],
+                            filters.get_shape()[3]))
     if rate < 1:
       raise ValueError("rate {} cannot be less than one".format(rate))
 
     if rate == 1:
-      return conv2d_transpose(value,
-                              filters,
-                              output_shape,
-                              strides=[1, 1, 1, 1],
-                              padding=padding,
-                              data_format="NHWC")
+      return conv2d_transpose(
+          value,
+          filters,
+          output_shape,
+          strides=[1, 1, 1, 1],
+          padding=padding,
+          data_format="NHWC")
 
     output_shape_ = ops.convert_to_tensor(output_shape, name="output_shape")
     if not output_shape_.get_shape().is_compatible_with(tensor_shape.vector(4)):
-      raise ValueError("output_shape must have shape (4,), got {}"
-                       .format(output_shape_.get_shape()))
+      raise ValueError("output_shape must have shape (4,), got {}".format(
+          output_shape_.get_shape()))
 
     if isinstance(output_shape, (list, np.ndarray)):
       # output_shape's shape should be == [4] if reached this point.
       if not filters.get_shape()[2].is_compatible_with(output_shape[3]):
         raise ValueError(
             "output_shape does not match filter's output channels, "
-            "{} != {}".format(output_shape[3], filters.get_shape()[2]))
+            "{} != {}".format(output_shape[3],
+                              filters.get_shape()[2]))
 
     # We have two padding contributions. The first is used for converting "SAME"
     # to "VALID". The second is required so that the height and width of the
@@ -1352,14 +1381,13 @@ def atrous_conv2d_transpose(value,
     # component.
     space_to_batch_pad = [[0, pad_bottom_extra], [0, pad_right_extra]]
 
-    value = array_ops.space_to_batch(input=value,
-                                     paddings=space_to_batch_pad,
-                                     block_size=rate)
+    value = array_ops.space_to_batch(
+        input=value, paddings=space_to_batch_pad, block_size=rate)
 
-    input_sizes = [rate * rate * output_shape[0],
-                   (in_height + pad_bottom_extra) // rate,
-                   (in_width + pad_right_extra) // rate,
-                   output_shape[3]]
+    input_sizes = [
+        rate * rate * output_shape[0], (in_height + pad_bottom_extra) // rate,
+        (in_width + pad_right_extra) // rate, output_shape[3]
+    ]
 
     value = gen_nn_ops.conv2d_backprop_input(
         input_sizes=input_sizes,
@@ -1373,19 +1401,19 @@ def atrous_conv2d_transpose(value,
     batch_to_space_crop = [[pad_top, pad_bottom + pad_bottom_extra],
                            [pad_left, pad_right + pad_right_extra]]
 
-    return array_ops.batch_to_space(input=value,
-                                    crops=batch_to_space_crop,
-                                    block_size=rate)
+    return array_ops.batch_to_space(
+        input=value, crops=batch_to_space_crop, block_size=rate)
 
 
 @tf_export("nn.conv3d_transpose")
-def conv3d_transpose(value,
-                     filter,  # pylint: disable=redefined-builtin
-                     output_shape,
-                     strides,
-                     padding="SAME",
-                     data_format="NDHWC",
-                     name=None):
+def conv3d_transpose(
+    value,
+    filter,  # pylint: disable=redefined-builtin
+    output_shape,
+    strides,
+    padding="SAME",
+    data_format="NDHWC",
+    name=None):
   """The transpose of `conv3d`.
 
   This operation is sometimes called "deconvolution" after [Deconvolutional
@@ -1428,27 +1456,29 @@ def conv3d_transpose(value,
 
     output_shape_ = ops.convert_to_tensor(output_shape, name="output_shape")
     if not output_shape_.get_shape().is_compatible_with(tensor_shape.vector(5)):
-      raise ValueError("output_shape must have shape (5,), got {}"
-                       .format(output_shape_.get_shape()))
+      raise ValueError("output_shape must have shape (5,), got {}".format(
+          output_shape_.get_shape()))
 
     if isinstance(output_shape, (list, np.ndarray)):
       # output_shape's shape should be == [5] if reached this point.
       if not filter.get_shape()[3].is_compatible_with(output_shape[4]):
         raise ValueError(
             "output_shape does not match filter's output channels, "
-            "{} != {}".format(output_shape[4], filter.get_shape()[3]))
+            "{} != {}".format(output_shape[4],
+                              filter.get_shape()[3]))
 
     if padding != "VALID" and padding != "SAME":
       raise ValueError("padding must be either VALID or SAME:"
                        " {}".format(padding))
 
-    return gen_nn_ops.conv3d_backprop_input_v2(input_sizes=output_shape_,
-                                               filter=filter,
-                                               out_backprop=value,
-                                               strides=strides,
-                                               padding=padding,
-                                               data_format=data_format,
-                                               name=name)
+    return gen_nn_ops.conv3d_backprop_input_v2(
+        input_sizes=output_shape_,
+        filter=filter,
+        out_backprop=value,
+        strides=strides,
+        padding=padding,
+        data_format=data_format,
+        name=name)
 
 
 # pylint: disable=protected-access
@@ -1514,7 +1544,9 @@ def crelu(features, name=None, axis=-1):
   Concatenates a ReLU which selects only the positive part of the activation
   with a ReLU which selects only the *negative* part of the activation.
   Note that as a result this non-linearity doubles the depth of the activations.
-  Source: [Understanding and Improving Convolutional Neural Networks via Concatenated Rectified Linear Units. W. Shang, et al.](https://arxiv.org/abs/1603.05201)
+  Source: [Understanding and Improving Convolutional Neural Networks via
+  Concatenated Rectified Linear Units. W. Shang, et
+  al.](https://arxiv.org/abs/1603.05201)
 
   Args:
     features: A `Tensor` with type `float`, `double`, `int32`, `int64`, `uint8`,
@@ -1534,7 +1566,9 @@ def crelu(features, name=None, axis=-1):
 @tf_export("nn.relu6")
 def relu6(features, name=None):
   """Computes Rectified Linear 6: `min(max(features, 0), 6)`.
-  Source: [Convolutional Deep Belief Networks on CIFAR-10. A. Krizhevsky](http://www.cs.utoronto.ca/~kriz/conv-cifar10-aug2010.pdf)
+
+  Source: [Convolutional Deep Belief Networks on CIFAR-10. A.
+  Krizhevsky](http://www.cs.utoronto.ca/~kriz/conv-cifar10-aug2010.pdf)
 
   Args:
     features: A `Tensor` with type `float`, `double`, `int32`, `int64`, `uint8`,
@@ -1622,14 +1656,16 @@ def _softmax(logits, compute_op, dim=-1, name=None):
     InvalidArgumentError: if `logits` is empty or `dim` is beyond the last
       dimension of `logits`.
   """
+
   def _swap_axis(logits, dim_index, last_index, name=None):
     """Swaps logits's dim_index and last_index."""
-    return array_ops.transpose(logits,
-                               array_ops.concat([
-                                   math_ops.range(dim_index), [last_index],
-                                   math_ops.range(dim_index + 1, last_index),
-                                   [dim_index]
-                               ], 0), name=name)
+    return array_ops.transpose(
+        logits,
+        array_ops.concat([
+            math_ops.range(dim_index), [last_index],
+            math_ops.range(dim_index + 1, last_index), [dim_index]
+        ], 0),
+        name=name)
 
   logits = ops.convert_to_tensor(logits)
 
@@ -1746,9 +1782,12 @@ def _ensure_xent_args(name, sentinel, labels, logits):
 
 
 @tf_export("nn.softmax_cross_entropy_with_logits_v2")
-def softmax_cross_entropy_with_logits_v2(_sentinel=None,  # pylint: disable=invalid-name
-                                         labels=None, logits=None,
-                                         dim=-1, name=None):
+def softmax_cross_entropy_with_logits_v2(
+    _sentinel=None,  # pylint: disable=invalid-name
+    labels=None,
+    logits=None,
+    dim=-1,
+    name=None):
   """Computes softmax cross entropy between `logits` and `labels`.
 
   Measures the probability error in discrete classification tasks in which the
@@ -1790,19 +1829,19 @@ def softmax_cross_entropy_with_logits_v2(_sentinel=None,  # pylint: disable=inva
     A 1-D `Tensor` of length `batch_size` of the same type as `logits` with the
     softmax cross entropy loss.
   """
-  _ensure_xent_args("softmax_cross_entropy_with_logits", _sentinel,
-                    labels, logits)
+  _ensure_xent_args("softmax_cross_entropy_with_logits", _sentinel, labels,
+                    logits)
 
   # TODO(pcmurray) Raise an error when the labels do not sum to 1. Note: This
   # could break users who call this with bad labels, but disregard the bad
   # results.
 
-  with ops.name_scope(
-      name, "softmax_cross_entropy_with_logits", [logits, labels]) as name:
+  with ops.name_scope(name, "softmax_cross_entropy_with_logits",
+                      [logits, labels]) as name:
     logits = ops.convert_to_tensor(logits, name="logits")
     labels = ops.convert_to_tensor(labels, name="labels")
-    precise_logits = math_ops.cast(logits, dtypes.float32) if (
-        logits.dtype == dtypes.float16) else logits
+    precise_logits = math_ops.cast(
+        logits, dtypes.float32) if (logits.dtype == dtypes.float16) else logits
     # labels and logits must be of the same type
     labels = math_ops.cast(labels, precise_logits.dtype)
     input_rank = array_ops.rank(precise_logits)
@@ -1811,13 +1850,14 @@ def softmax_cross_entropy_with_logits_v2(_sentinel=None,  # pylint: disable=inva
 
     # Move the dim to the end if dim is not the last dimension.
     if dim is not -1:
+
       def _move_dim_to_end(tensor, dim_index, rank):
-        return array_ops.transpose(tensor,
-                                   array_ops.concat([
-                                       math_ops.range(dim_index),
-                                       math_ops.range(dim_index + 1, rank),
-                                       [dim_index]
-                                   ], 0))
+        return array_ops.transpose(
+            tensor,
+            array_ops.concat([
+                math_ops.range(dim_index),
+                math_ops.range(dim_index + 1, rank), [dim_index]
+            ], 0))
 
       precise_logits = _move_dim_to_end(precise_logits, dim, input_rank)
       labels = _move_dim_to_end(labels, dim, input_rank)
@@ -1862,9 +1902,12 @@ See tf.nn.softmax_cross_entropy_with_logits_v2.
 
 @tf_export("nn.softmax_cross_entropy_with_logits")
 @deprecation.deprecated(date=None, instructions=_XENT_DEPRECATION)
-def softmax_cross_entropy_with_logits(_sentinel=None,  # pylint: disable=invalid-name
-                                      labels=None, logits=None,
-                                      dim=-1, name=None):
+def softmax_cross_entropy_with_logits(
+    _sentinel=None,  # pylint: disable=invalid-name
+    labels=None,
+    logits=None,
+    dim=-1,
+    name=None):
   """Computes softmax cross entropy between `logits` and `labels`.
 
   Measures the probability error in discrete classification tasks in which the
@@ -1906,11 +1949,11 @@ def softmax_cross_entropy_with_logits(_sentinel=None,  # pylint: disable=invalid
     A 1-D `Tensor` of length `batch_size` of the same type as `logits` with the
     softmax cross entropy loss.
   """
-  _ensure_xent_args("softmax_cross_entropy_with_logits", _sentinel,
-                    labels, logits)
+  _ensure_xent_args("softmax_cross_entropy_with_logits", _sentinel, labels,
+                    logits)
 
-  with ops.name_scope(
-      name, "softmax_cross_entropy_with_logits_sg", [logits, labels]) as name:
+  with ops.name_scope(name, "softmax_cross_entropy_with_logits_sg",
+                      [logits, labels]) as name:
     labels = array_ops.stop_gradient(labels, name="labels_stop_gradient")
 
   return softmax_cross_entropy_with_logits_v2(
@@ -1918,9 +1961,11 @@ def softmax_cross_entropy_with_logits(_sentinel=None,  # pylint: disable=invalid
 
 
 @tf_export("nn.sparse_softmax_cross_entropy_with_logits")
-def sparse_softmax_cross_entropy_with_logits(_sentinel=None,  # pylint: disable=invalid-name
-                                             labels=None, logits=None,
-                                             name=None):
+def sparse_softmax_cross_entropy_with_logits(
+    _sentinel=None,  # pylint: disable=invalid-name
+    labels=None,
+    logits=None,
+    name=None):
   """Computes sparse softmax cross entropy between `logits` and `labels`.
 
   Measures the probability error in discrete classification tasks in which the
@@ -1976,15 +2021,15 @@ def sparse_softmax_cross_entropy_with_logits(_sentinel=None,  # pylint: disable=
                       [labels, logits]):
     labels = ops.convert_to_tensor(labels)
     logits = ops.convert_to_tensor(logits)
-    precise_logits = math_ops.cast(logits, dtypes.float32) if (
-        dtypes.as_dtype(logits.dtype) == dtypes.float16) else logits
+    precise_logits = math_ops.cast(logits, dtypes.float32) if (dtypes.as_dtype(
+        logits.dtype) == dtypes.float16) else logits
 
     # Store label shape for result later.
     labels_static_shape = labels.get_shape()
     labels_shape = array_ops.shape(labels)
     if logits.get_shape().ndims is not None and logits.get_shape().ndims == 0:
-      raise ValueError("Logits cannot be scalars - received shape %s." %
-                       logits.get_shape())
+      raise ValueError(
+          "Logits cannot be scalars - received shape %s." % logits.get_shape())
     if logits.get_shape().ndims is not None and (
         labels_static_shape.ndims is not None and
         labels_static_shape.ndims != logits.get_shape().ndims - 1):
@@ -2041,12 +2086,13 @@ def avg_pool(value, ksize, strides, padding, data_format="NHWC", name=None):
   """
   with ops.name_scope(name, "AvgPool", [value]) as name:
     value = ops.convert_to_tensor(value, name="input")
-    return gen_nn_ops._avg_pool(value,
-                                ksize=ksize,
-                                strides=strides,
-                                padding=padding,
-                                data_format=data_format,
-                                name=name)
+    return gen_nn_ops._avg_pool(
+        value,
+        ksize=ksize,
+        strides=strides,
+        padding=padding,
+        data_format=data_format,
+        name=name)
 
 
 @tf_export("nn.max_pool")
@@ -2083,8 +2129,8 @@ def _calc_conv_flops(graph, node):
   """Calculates the compute resources needed for Conv2D."""
   input_shape = graph_util.tensor_shape_from_node_def_name(graph, node.input[0])
   input_shape.assert_is_fully_defined()
-  filter_shape = graph_util.tensor_shape_from_node_def_name(graph,
-                                                            node.input[1])
+  filter_shape = graph_util.tensor_shape_from_node_def_name(
+      graph, node.input[1])
   filter_shape.assert_is_fully_defined()
   output_shape = graph_util.tensor_shape_from_node_def_name(graph, node.name)
   output_shape.assert_is_fully_defined()
@@ -2092,8 +2138,9 @@ def _calc_conv_flops(graph, node):
   filter_width = int(filter_shape[1])
   filter_in_depth = int(filter_shape[2])
   output_count = np.prod(output_shape.as_list())
-  return ops.OpStats("flops", (output_count * filter_in_depth * filter_height *
-                               filter_width * 2))
+  return ops.OpStats(
+      "flops",
+      (output_count * filter_in_depth * filter_height * filter_width * 2))
 
 
 @ops.RegisterStatistics("DepthwiseConv2dNative", "flops")
@@ -2101,8 +2148,8 @@ def _calc_depthwise_conv_flops(graph, node):
   """Calculates the compute resources needed for DepthwiseConv2dNative."""
   input_shape = graph_util.tensor_shape_from_node_def_name(graph, node.input[0])
   input_shape.assert_is_fully_defined()
-  filter_shape = graph_util.tensor_shape_from_node_def_name(graph,
-                                                            node.input[1])
+  filter_shape = graph_util.tensor_shape_from_node_def_name(
+      graph, node.input[1])
   filter_shape.assert_is_fully_defined()
   output_shape = graph_util.tensor_shape_from_node_def_name(graph, node.name)
   output_shape.assert_is_fully_defined()
@@ -2210,9 +2257,8 @@ def dropout(x, keep_prob, noise_shape=None, seed=None, name=None):  # pylint: di
     if isinstance(keep_prob, numbers.Real) and not 0 < keep_prob <= 1:
       raise ValueError("keep_prob must be a scalar tensor or a float in the "
                        "range (0, 1], got %g" % keep_prob)
-    keep_prob = ops.convert_to_tensor(keep_prob,
-                                      dtype=x.dtype,
-                                      name="keep_prob")
+    keep_prob = ops.convert_to_tensor(
+        keep_prob, dtype=x.dtype, name="keep_prob")
     keep_prob.get_shape().assert_is_compatible_with(tensor_shape.scalar())
 
     # Do nothing if we know keep_prob == 1
@@ -2222,9 +2268,8 @@ def dropout(x, keep_prob, noise_shape=None, seed=None, name=None):  # pylint: di
     noise_shape = noise_shape if noise_shape is not None else array_ops.shape(x)
     # uniform [keep_prob, 1.0 + keep_prob)
     random_tensor = keep_prob
-    random_tensor += random_ops.random_uniform(noise_shape,
-                                               seed=seed,
-                                               dtype=x.dtype)
+    random_tensor += random_ops.random_uniform(
+        noise_shape, seed=seed, dtype=x.dtype)
     # 0. if [keep_prob, 1.0) and 1. if [1.0, 1.0 + keep_prob)
     binary_tensor = math_ops.floor(random_tensor)
     ret = math_ops.div(x, keep_prob) * binary_tensor
@@ -2293,13 +2338,21 @@ def nth_element(input, n, reverse=False, name=None):
 
 @tf_export("nn.conv1d")
 @deprecation.deprecated_arg_values(
-    None, "`NCHW` for data_format is deprecated, use `NCW` instead",
-    warn_once=True, data_format="NCHW")
+    None,
+    "`NCHW` for data_format is deprecated, use `NCW` instead",
+    warn_once=True,
+    data_format="NCHW")
 @deprecation.deprecated_arg_values(
-    None, "`NHWC` for data_format is deprecated, use `NWC` instead",
-    warn_once=True, data_format="NHWC")
-def conv1d(value, filters, stride, padding,
-           use_cudnn_on_gpu=None, data_format=None,
+    None,
+    "`NHWC` for data_format is deprecated, use `NWC` instead",
+    warn_once=True,
+    data_format="NHWC")
+def conv1d(value,
+           filters,
+           stride,
+           padding,
+           use_cudnn_on_gpu=None,
+           data_format=None,
            name=None):
   r"""Computes a 1-D convolution given 3-D input and filter tensors.
 
@@ -2358,9 +2411,13 @@ def conv1d(value, filters, stride, padding,
       raise ValueError("data_format must be \"NWC\" or \"NCW\".")
     value = array_ops.expand_dims(value, spatial_start_dim)
     filters = array_ops.expand_dims(filters, 0)
-    result = gen_nn_ops.conv2d(value, filters, strides, padding,
-                               use_cudnn_on_gpu=use_cudnn_on_gpu,
-                               data_format=data_format)
+    result = gen_nn_ops.conv2d(
+        value,
+        filters,
+        strides,
+        padding,
+        use_cudnn_on_gpu=use_cudnn_on_gpu,
+        data_format=data_format)
     return array_ops.squeeze(result, [spatial_start_dim])
 
 
@@ -2466,8 +2523,8 @@ def _calc_dilation2d_flops(graph, node):
   """Calculates the compute resources needed for Dilation2D."""
   input_shape = graph_util.tensor_shape_from_node_def_name(graph, node.input[0])
   input_shape.assert_is_fully_defined()
-  filter_shape = graph_util.tensor_shape_from_node_def_name(graph,
-                                                            node.input[1])
+  filter_shape = graph_util.tensor_shape_from_node_def_name(
+      graph, node.input[1])
   filter_shape.assert_is_fully_defined()
   output_shape = graph_util.tensor_shape_from_node_def_name(graph, node.name)
   output_shape.assert_is_fully_defined()
@@ -2527,12 +2584,13 @@ def erosion2d(value, kernel, strides, rates, padding, name=None):
   with ops.name_scope(name, "erosion2d", [value, kernel]) as name:
     # Reduce erosion to dilation by duality.
     return math_ops.negative(
-        gen_nn_ops.dilation2d(input=math_ops.negative(value),
-                              filter=array_ops.reverse_v2(kernel, [0, 1]),
-                              strides=strides,
-                              rates=rates,
-                              padding=padding,
-                              name=name))
+        gen_nn_ops.dilation2d(
+            input=math_ops.negative(value),
+            filter=array_ops.reverse_v2(kernel, [0, 1]),
+            strides=strides,
+            rates=rates,
+            padding=padding,
+            name=name))
 
 
 @tf_export("nn.in_top_k")
@@ -2565,5 +2623,5 @@ def in_top_k(predictions, targets, k, name=None):
   Returns:
     A `Tensor` of type `bool`. Computed Precision at `k` as a `bool Tensor`.
   """
-  with ops.name_scope(name, 'in_top_k'):
+  with ops.name_scope(name, "in_top_k"):
     return gen_nn_ops._in_top_kv2(predictions, targets, k, name=name)
diff --git a/tensorflow/python/ops/quantized_conv_ops_test.py b/tensorflow/python/ops/quantized_conv_ops_test.py
index 5e9e71002705293403de83276fb70099d8864907..4ac2a8f634bb201c9aaecb74432f2e6e78ee840f 100644
--- a/tensorflow/python/ops/quantized_conv_ops_test.py
+++ b/tensorflow/python/ops/quantized_conv_ops_test.py
@@ -93,7 +93,8 @@ class Conv2DTest(test.TestCase):
     quantized_range = ((quantized_max - quantized_min) * range_adjust)
     range_scale = (quantized_range / number_of_steps)
     lowest_quantized = -(1 << (number_of_bits - 1))
-    result = np.array([(quantized_min + ((float(x) - lowest_quantized) * range_scale))
+    result = np.array([(quantized_min +
+                        ((float(x) - lowest_quantized) * range_scale))
                        for x in quantized.flatten()])
     return result
 
diff --git a/tensorflow/python/ops/quantized_ops_test.py b/tensorflow/python/ops/quantized_ops_test.py
index 4bf3b35e13879069e40162fc50180520a5f855f6..d590bc4be6d520cbaa000d9802b84cbfbf8e90b9 100644
--- a/tensorflow/python/ops/quantized_ops_test.py
+++ b/tensorflow/python/ops/quantized_ops_test.py
@@ -34,7 +34,10 @@ class QuantizedOpsTest(test.TestCase):
   def testQuantizeOp(self):
     expected_output = [1, 1, 2, 127, 255, 255]
     with self.test_session(use_gpu=False) as sess:
-      x = constant_op.constant([1.0, 1.25, 1.75, 127.0, 255.0, 500.0], shape=[6], dtype=dtypes.float32)
+      x = constant_op.constant(
+          [1.0, 1.25, 1.75, 127.0, 255.0, 500.0],
+          shape=[6],
+          dtype=dtypes.float32)
       x_min = 0.0
       x_max = 255.0
       op = array_ops.quantize(x, x_min, x_max, dtypes.quint8, mode="MIN_FIRST")
diff --git a/tensorflow/python/ops/resource_variable_ops.py b/tensorflow/python/ops/resource_variable_ops.py
index 879c206313e476088b388f39a9a112f5cc449152..bdf41cd75d6432750b7b23391c28892e2d6b9ffc 100644
--- a/tensorflow/python/ops/resource_variable_ops.py
+++ b/tensorflow/python/ops/resource_variable_ops.py
@@ -348,11 +348,11 @@ class ResourceVariable(variables.Variable):
     if trainable and ops.GraphKeys.TRAINABLE_VARIABLES not in collections:
       collections = list(collections) + [ops.GraphKeys.TRAINABLE_VARIABLES]
     self._save_slice_info = None
-    self._in_graph_mode = context.in_graph_mode()
     # Save the graph's container prefix for error checking. Reading the value of
     # the ResourceVariable from another Graph in Eager mode is an error.
     self._container_prefix = ops.get_default_graph()._container_prefix  # pylint: disable=protected-access
-    with ops.control_dependencies(None):
+    with ops.init_scope():
+      self._in_graph_mode = context.in_graph_mode()
       with ops.name_scope(name, "Variable", []
                           if init_from_fn else [initial_value]) as name:
         # pylint: disable=protected-access
@@ -835,25 +835,45 @@ class ResourceVariable(variables.Variable):
       return self.value()
 
   def __iadd__(self, unused_other):
-    raise RuntimeError("Variable += value not supported.")
+    raise RuntimeError("Variable += value not supported. Use "
+                       "variable.assign_add(value) to modify the variable "
+                       "value and variable = variable + value to get a new "
+                       "Tensor object.")
 
   def __isub__(self, unused_other):
-    raise RuntimeError("Variable -= value not supported.")
+    raise RuntimeError("Variable -= value not supported. Use "
+                       "variable.assign_sub(value) to modify the variable "
+                       "value and variable = variable - value to get a new "
+                       "Tensor object.")
 
   def __imul__(self, unused_other):
-    raise RuntimeError("Variable *= value not supported.")
+    raise RuntimeError("Variable *= value not supported. Use "
+                       "variable.assign_mul(value) to modify the variable "
+                       "value and variable = variable * value to get a new "
+                       "Tensor object.")
 
   def __idiv__(self, unused_other):
-    raise RuntimeError("Variable /= value not supported.")
+    raise RuntimeError("Variable /= value not supported. Use "
+                       "variable.assign_div(value) to modify the variable "
+                       "value and variable = variable / value to get a new "
+                       "Tensor object.")
 
   def __itruediv__(self, unused_other):
-    raise RuntimeError("Variable /= value not supported.")
+    raise RuntimeError("Variable /= value not supported. Use "
+                       "variable.assign_div(value) to modify the variable "
+                       "value and variable = variable / value to get a new "
+                       "Tensor object.")
 
   def __irealdiv__(self, unused_other):
-    raise RuntimeError("Variable /= value not supported.")
+    raise RuntimeError("Variable /= value not supported. Use "
+                       "variable.assign_div(value) to modify the variable "
+                       "value and variable = variable / value to get a new "
+                       "Tensor object.")
 
   def __ipow__(self, unused_other):
-    raise RuntimeError("Variable **= value not supported.")
+    raise RuntimeError("Variable **= value not supported. Use "
+                       "value and variable = variable ** value to get a new "
+                       "Tensor object.")
 
 
 def _dense_var_to_tensor(var, dtype=None, name=None, as_ref=False):
diff --git a/tensorflow/python/ops/rnn.py b/tensorflow/python/ops/rnn.py
index a1008f1c834f7c01af0ff8b3a0a648f499ce1f8a..e0052b8869dd2cf331c14e2355d4b40dd217c561 100644
--- a/tensorflow/python/ops/rnn.py
+++ b/tensorflow/python/ops/rnn.py
@@ -812,7 +812,10 @@ def _dynamic_rnn_loop(cell,
     return (time + 1, output_ta_t, new_state)
 
   if in_graph_mode:
-    loop_bound = max_sequence_length
+    # Make sure that we run at least 1 step, if necessary, to ensure
+    # the TensorArrays pick up the dynamic shape.
+    loop_bound = math_ops.minimum(
+        time_steps, math_ops.maximum(1, max_sequence_length))
   else:
     # Using max_sequence_length isn't currently supported in the Eager branch.
     loop_bound = time_steps
@@ -1122,6 +1125,12 @@ def raw_rnn(cell, loop_fn,
       def _copy_some_through(current, candidate):
         """Copy some tensors through via array_ops.where."""
         def copy_fn(cur_i, cand_i):
+          # TensorArray and scalar get passed through.
+          if isinstance(cur_i, tensor_array_ops.TensorArray):
+            return cand_i
+          if cur_i.shape.ndims == 0:
+            return cand_i
+          # Otherwise propagate the old or the new value.
           with ops.colocate_with(cand_i):
             return array_ops.where(elements_finished, cur_i, cand_i)
         return nest.map_structure(copy_fn, current, candidate)
diff --git a/tensorflow/python/ops/script_ops.py b/tensorflow/python/ops/script_ops.py
index 4b5072fd6799ae289d3c1a1b2a40878e36604bf4..1b9071ee93c21f8d6bdc9ace11dbf57f3eb3e218 100644
--- a/tensorflow/python/ops/script_ops.py
+++ b/tensorflow/python/ops/script_ops.py
@@ -50,19 +50,21 @@ class EagerFunc(object):
     self._func = func
     self._out_dtypes = Tout
 
-  def __call__(self, *args, **kwargs):
-    """Passes args, kwargs to `self._func`, which is executed eagerly."""
+  def __call__(self, on_gpu, args):
+    """Passes `args` to `self._func`, which is executed eagerly."""
     with context.eager_mode():
-      ret = self._func(*args, **kwargs)
+      ret = self._func(*args)
+      maybe_copy_to_gpu = lambda x: x if not on_gpu else x.gpu()
       if isinstance(ret, (tuple, list)):
         return [
-            ops.convert_to_tensor(x, dtype=dtype)
+            maybe_copy_to_gpu(ops.convert_to_tensor(x, dtype=dtype))
             for (x, dtype) in zip(ret, self._out_dtypes)
         ]
       elif ret is None:
         return ret
       else:
-        return ops.convert_to_tensor(ret, dtype=self._out_dtypes[0])
+        return maybe_copy_to_gpu(
+            ops.convert_to_tensor(ret, dtype=self._out_dtypes[0]))
 
 
 class FuncRegistry(object):
@@ -116,16 +118,29 @@ class FuncRegistry(object):
     else:
       return result
 
-  def __call__(self, token, args):
-    """Calls the registered function for `token` with args."""
+  def __call__(self, token, on_gpu, args):
+    """Calls the registered function for `token` with args.
+
+    Args:
+      token: A key into this `FuncRegistry` identifying which function to call.
+      on_gpu: A boolean indicating whether or not `token`'s corresponding
+        operation was placed on GPU; only used if the function registered for
+        `token` is an `EagerPyFunc`.
+      args: The arguments to pass to the function registered for `token`.
+
+    Returns:
+      The output of the function registered for `token`.
+
+    Raises:
+      ValueError: if no function is registered for `token`.
+    """
     func = self._funcs[token]
     if func is None:
       raise ValueError("callback %s is not found" % token)
-    ret = func(*args)
-
     if isinstance(func, EagerFunc):
-      return ret
+      return func(on_gpu, args)
     else:
+      ret = func(*args)
       # Strings seem to lead to a memory leak here if they're not wrapped in a
       # list.
       if isinstance(ret, six.binary_type):
@@ -302,8 +317,5 @@ def py_func(func, inp, Tout, stateful=True, name=None):
       func=func, inp=inp, Tout=Tout, stateful=stateful, eager=False, name=name)
 
 
-# TODO(akshayka): PyFuncs where the 'eager' attribute is set to True should be
-# differentiable, i.e., the gradient of PyFunc should propagate Nones if the
-# eager attribute is not set, and otherwise, it should return the gradient.
 ops.NotDifferentiable("PyFunc")
 ops.NotDifferentiable("PyFuncStateless")
diff --git a/tensorflow/python/ops/sparse_ops.py b/tensorflow/python/ops/sparse_ops.py
index 3224856d7be0674a2cc064a226bf1a38abb6bc2b..0fbbf5a805f1439d85ad53f02bdb665c04248606 100644
--- a/tensorflow/python/ops/sparse_ops.py
+++ b/tensorflow/python/ops/sparse_ops.py
@@ -227,13 +227,14 @@ def sparse_concat(axis,
             [array_ops.reshape(shape, [1, -1]) for shape in shapes], 0), 0)
     shapes = [
         array_ops.concat([
-            max_shape[:axis], shape[-1:] if axis == -1 else
-            shape[axis:axis + 1], [] if axis == -1 else max_shape[axis + 1:]
+            max_shape[:axis], shape[-1:]
+            if axis == -1 else shape[axis:axis + 1], []
+            if axis == -1 else max_shape[axis + 1:]
         ], 0) for shape in shapes
     ]
 
-  output_ind, output_val, output_shape = (gen_sparse_ops._sparse_concat(
-      inds, vals, shapes, axis, name=name))
+  output_ind, output_val, output_shape = (
+      gen_sparse_ops._sparse_concat(inds, vals, shapes, axis, name=name))
 
   return sparse_tensor.SparseTensor(output_ind, output_val, output_shape)
 
@@ -300,15 +301,14 @@ def sparse_add(a, b, thresh=0):
     b = _convert_to_sparse_tensor(b)
     thresh = ops.convert_to_tensor(
         thresh, dtype=a.values.dtype.real_dtype.base_dtype, name="thresh")
-    output_ind, output_val, output_shape = (gen_sparse_ops._sparse_add(
-        a.indices, a.values, a.dense_shape,
-        b.indices, b.values, b.dense_shape,
-        thresh))
+    output_ind, output_val, output_shape = (
+        gen_sparse_ops._sparse_add(a.indices, a.values, a.dense_shape,
+                                   b.indices, b.values, b.dense_shape, thresh))
 
     # Attempt to get output_shape statically.
     a.get_shape().assert_is_compatible_with(b.get_shape())
-    static_shape = array_ops.broadcast_static_shape(
-        a.get_shape(), b.get_shape())
+    static_shape = array_ops.broadcast_static_shape(a.get_shape(),
+                                                    b.get_shape())
     if static_shape.is_fully_defined():
       output_shape = static_shape.as_list()
 
@@ -317,8 +317,8 @@ def sparse_add(a, b, thresh=0):
     # swap to make `a` the SparseTensor.
     if isinstance(b, sparse_classes):
       a, b = b, a
-    return gen_sparse_ops._sparse_tensor_dense_add(
-        a.indices, a.values, a.dense_shape, b)
+    return gen_sparse_ops._sparse_tensor_dense_add(a.indices, a.values,
+                                                   a.dense_shape, b)
 
 
 def _sparse_cross(inputs, name=None):
@@ -397,19 +397,25 @@ def _sparse_cross_hashed(inputs, num_buckets=0, hash_key=None, name=None):
 _DEFAULT_HASH_KEY = 0xDECAFCAFFE
 
 
-def _sparse_cross_internal(
-    inputs, hashed_output=False, num_buckets=0, hash_key=None, name=None):
+def _sparse_cross_internal(inputs,
+                           hashed_output=False,
+                           num_buckets=0,
+                           hash_key=None,
+                           name=None):
   """See gen_sparse_ops._sparse_cross."""
   if not isinstance(inputs, list):
     raise TypeError("Inputs must be a list")
-  if not all(isinstance(i, sparse_tensor.SparseTensor) or
-             isinstance(i, ops.Tensor) for i in inputs):
+  if not all(
+      isinstance(i, sparse_tensor.SparseTensor) or isinstance(i, ops.Tensor)
+      for i in inputs):
     raise TypeError("All inputs must be SparseTensors")
 
-  sparse_inputs = [i for i in inputs
-                   if isinstance(i, sparse_tensor.SparseTensor)]
-  dense_inputs = [i for i in inputs
-                  if not isinstance(i, sparse_tensor.SparseTensor)]
+  sparse_inputs = [
+      i for i in inputs if isinstance(i, sparse_tensor.SparseTensor)
+  ]
+  dense_inputs = [
+      i for i in inputs if not isinstance(i, sparse_tensor.SparseTensor)
+  ]
 
   indices = [sp_input.indices for sp_input in sparse_inputs]
   values = [sp_input.values for sp_input in sparse_inputs]
@@ -504,8 +510,9 @@ def sparse_reorder(sp_input, name=None):
   """
   sp_input = _convert_to_sparse_tensor(sp_input)
 
-  reordered_ind, reordered_val = (gen_sparse_ops._sparse_reorder(
-      sp_input.indices, sp_input.values, sp_input.dense_shape, name=name))
+  reordered_ind, reordered_val = (
+      gen_sparse_ops._sparse_reorder(
+          sp_input.indices, sp_input.values, sp_input.dense_shape, name=name))
 
   if sp_input.get_shape().is_fully_defined():
     dense_shape = sp_input.get_shape().as_list()
@@ -572,8 +579,8 @@ def sparse_reshape(sp_input, shape, name=None):
         sp_input.indices, sp_input.dense_shape, shape, name=name)
 
     reshaped_shape_const = tensor_util.constant_value(shape)
-    if (reshaped_shape_const is not None
-        and sp_input.get_shape().is_fully_defined()):
+    if (reshaped_shape_const is not None and
+        sp_input.get_shape().is_fully_defined()):
       num_implied = sum((dim == -1) for dim in reshaped_shape_const)
       if num_implied > 1:
         raise ValueError("At most one dimension can be inferred (-1). Found: %s"
@@ -589,15 +596,15 @@ def sparse_reshape(sp_input, shape, name=None):
             in_shape_size // np.prod(non_implied_idx))
       reshaped_size = np.prod(reshaped_shape_const)
       if reshaped_size != in_shape_size:
-        raise ValueError(
-            "Cannot reshape a tensor with %d elements to shape %s "
-            "(%d elements)."
-            % (in_shape_size, original_reshaped_shape, reshaped_size))
+        raise ValueError("Cannot reshape a tensor with %d elements to shape %s "
+                         "(%d elements)." %
+                         (in_shape_size, original_reshaped_shape,
+                          reshaped_size))
       reshaped_shape = reshaped_shape_const
 
-    return sparse_tensor.SparseTensor(
-        reshaped_ind, array_ops.identity(sp_input.values),
-        reshaped_shape)
+    return sparse_tensor.SparseTensor(reshaped_ind,
+                                      array_ops.identity(sp_input.values),
+                                      reshaped_shape)
 
 
 # TODO(aselle): Remove keyword required once for 1.0 final
@@ -610,8 +617,11 @@ class KeywordRequired(object):
 
 @tf_export("sparse_split")
 def sparse_split(keyword_required=KeywordRequired(),
-                 sp_input=None, num_split=None, axis=None,
-                 name=None, split_dim=None):
+                 sp_input=None,
+                 num_split=None,
+                 axis=None,
+                 name=None,
+                 split_dim=None):
   """Split a `SparseTensor` into `num_split` tensors along `axis`.
 
   If the `sp_input.dense_shape[axis]` is not an integer multiple of `num_split`
@@ -660,18 +670,19 @@ def sparse_split(keyword_required=KeywordRequired(),
                                                 split_dim)
   sp_input = _convert_to_sparse_tensor(sp_input)
 
-  output_inds, output_vals, output_shapes = (gen_sparse_ops._sparse_split(
-      axis,
-      sp_input.indices,
-      sp_input.values,
-      sp_input.dense_shape,
-      num_split,
-      name=name))
+  output_inds, output_vals, output_shapes = (
+      gen_sparse_ops._sparse_split(
+          axis,
+          sp_input.indices,
+          sp_input.values,
+          sp_input.dense_shape,
+          num_split,
+          name=name))
   sparse_tensors = []
   for i in range(0, num_split):
     sparse_tensors.append(
-        sparse_tensor.SparseTensor(
-            output_inds[i], output_vals[i], output_shapes[i]))
+        sparse_tensor.SparseTensor(output_inds[i], output_vals[i],
+                                   output_shapes[i]))
   return sparse_tensors
 
 
@@ -713,12 +724,15 @@ def sparse_slice(sp_input, start, size, name=None):
 
   with ops.name_scope(name, "SparseSlice", [sp_input]) as name:
     output_indices, output_values, output_shape = gen_sparse_ops.sparse_slice(
-        sp_input.indices, sp_input.values, sp_input.dense_shape, start, size, name=name)
+        sp_input.indices,
+        sp_input.values,
+        sp_input.dense_shape,
+        start,
+        size,
+        name=name)
 
-    return sparse_tensor.SparseTensor(
-        output_indices,
-        output_values,
-        output_shape)
+    return sparse_tensor.SparseTensor(output_indices, output_values,
+                                      output_shape)
 
 
 @tf_export("sparse_to_dense")
@@ -819,14 +833,14 @@ def sparse_reduce_max(sp_input, axis=None, keep_dims=False,
     The reduced Tensor.
   """
   return gen_sparse_ops.sparse_reduce_max(
-      sp_input.indices, sp_input.values,
-      sp_input.dense_shape,
-      math_ops._ReductionDims(sp_input, axis, reduction_axes),
-      keep_dims)
+      sp_input.indices, sp_input.values, sp_input.dense_shape,
+      math_ops._ReductionDims(sp_input, axis, reduction_axes), keep_dims)
 
 
 @tf_export("sparse_reduce_max_sparse")
-def sparse_reduce_max_sparse(sp_input, axis=None, keep_dims=False,
+def sparse_reduce_max_sparse(sp_input,
+                             axis=None,
+                             keep_dims=False,
                              reduction_axes=None):
   """Computes the max of elements across dimensions of a SparseTensor.
 
@@ -855,10 +869,8 @@ def sparse_reduce_max_sparse(sp_input, axis=None, keep_dims=False,
   """
   output_ind, output_val, output_shape = (
       gen_sparse_ops.sparse_reduce_max_sparse(
-          sp_input.indices, sp_input.values,
-          sp_input.dense_shape, math_ops._ReductionDims(sp_input, axis,
-                                                        reduction_axes),
-          keep_dims))
+          sp_input.indices, sp_input.values, sp_input.dense_shape,
+          math_ops._ReductionDims(sp_input, axis, reduction_axes), keep_dims))
 
   return sparse_tensor.SparseTensor(output_ind, output_val, output_shape)
 
@@ -905,14 +917,14 @@ def sparse_reduce_sum(sp_input, axis=None, keep_dims=False,
     The reduced Tensor.
   """
   return gen_sparse_ops.sparse_reduce_sum(
-      sp_input.indices, sp_input.values,
-      sp_input.dense_shape,
-      math_ops._ReductionDims(sp_input, axis, reduction_axes),
-      keep_dims)
+      sp_input.indices, sp_input.values, sp_input.dense_shape,
+      math_ops._ReductionDims(sp_input, axis, reduction_axes), keep_dims)
 
 
 @tf_export("sparse_reduce_sum_sparse")
-def sparse_reduce_sum_sparse(sp_input, axis=None, keep_dims=False,
+def sparse_reduce_sum_sparse(sp_input,
+                             axis=None,
+                             keep_dims=False,
                              reduction_axes=None):
   """Computes the sum of elements across dimensions of a SparseTensor.
 
@@ -941,10 +953,8 @@ def sparse_reduce_sum_sparse(sp_input, axis=None, keep_dims=False,
   """
   output_ind, output_val, output_shape = (
       gen_sparse_ops.sparse_reduce_sum_sparse(
-          sp_input.indices, sp_input.values,
-          sp_input.dense_shape, math_ops._ReductionDims(sp_input, axis,
-                                                        reduction_axes),
-          keep_dims))
+          sp_input.indices, sp_input.values, sp_input.dense_shape,
+          math_ops._ReductionDims(sp_input, axis, reduction_axes), keep_dims))
 
   return sparse_tensor.SparseTensor(output_ind, output_val, output_shape)
 
@@ -1053,8 +1063,8 @@ def sparse_to_indicator(sp_input, vocab_size, name=None):
   with ops.name_scope(name, "SparseToIndicator", [sp_input]) as name:
     num_entries = array_ops.shape(sp_input.indices)[0]
     new_values = array_ops.fill(array_ops.expand_dims(num_entries, 0), True)
-    sp_values = sparse_tensor.SparseTensor(
-        sp_input.indices, new_values, sp_input.dense_shape)
+    sp_values = sparse_tensor.SparseTensor(sp_input.indices, new_values,
+                                           sp_input.dense_shape)
 
     sp_new = sparse_merge(sp_input, sp_values, vocab_size, name)
 
@@ -1174,8 +1184,7 @@ def sparse_merge(sp_ids, sp_values, vocab_size, name=None,
       raise TypeError("vocab_size has to be a list of Tensors or Python ints. "
                       "Found %s" % type(vocab_size))
     for dim in vocab_size:
-      if not (isinstance(dim, ops.Tensor) or
-              isinstance(dim, numbers.Integral)):
+      if not (isinstance(dim, ops.Tensor) or isinstance(dim, numbers.Integral)):
         raise TypeError(
             "vocab_size has to be a list of Tensors or Python ints. Found %s" %
             type(dim))
@@ -1326,24 +1335,23 @@ def sparse_reset_shape(sp_input, new_shape=None):
     # error before the sparse_tensor.SparseTensor catches it.
     output_shape_tensor.get_shape()[0].merge_with(in_shape.get_shape()[0])
 
-    output_shape_tensor_const = tensor_util.constant_value(
-        output_shape_tensor)
+    output_shape_tensor_const = tensor_util.constant_value(output_shape_tensor)
     # For cases where all shapes are known during graph construction
-    if (output_shape_tensor_const is not None
-        and sp_input.get_shape().is_fully_defined()):
+    if (output_shape_tensor_const is not None and
+        sp_input.get_shape().is_fully_defined()):
       in_shape_const = np.array(sp_input.get_shape().as_list())
       if not np.all(in_shape_const <= output_shape_tensor_const):
         raise ValueError(
             "Requested new_shape should have dimension sizes >= sp_input.shape."
-            "  Found new_shape (%s), sp_input.shape (%s)."
-            % (in_shape_const, output_shape_tensor_const))
+            "  Found new_shape (%s), sp_input.shape (%s)." %
+            (in_shape_const, output_shape_tensor_const))
       output_shape_tensor = output_shape_tensor_const
     else:
       # For cases where shape is not known during graph construction.
-      output_shape_tensor = control_flow_ops.with_dependencies(
-          [check_ops.assert_equal(
-              array_ops.shape(in_shape), array_ops.shape(output_shape_tensor))],
-          output_shape_tensor)
+      output_shape_tensor = control_flow_ops.with_dependencies([
+          check_ops.assert_equal(
+              array_ops.shape(in_shape), array_ops.shape(output_shape_tensor))
+      ], output_shape_tensor)
       output_shape_tensor = control_flow_ops.with_dependencies(
           [check_ops.assert_less_equal(in_shape, output_shape_tensor)],
           output_shape_tensor)
@@ -1409,10 +1417,10 @@ def sparse_fill_empty_rows(sp_input, default_value, name=None):
          values=sp_input.values,
          dense_shape=sp_input.dense_shape,
          default_value=default_value)
-    return (sparse_tensor.SparseTensor(indices=output_indices,
-                                       values=output_values,
-                                       dense_shape=sp_input.dense_shape),
-            empty_row_indicator)
+    return (sparse_tensor.SparseTensor(
+        indices=output_indices,
+        values=output_values,
+        dense_shape=sp_input.dense_shape), empty_row_indicator)
 
 
 @tf_export("serialize_sparse")
@@ -1880,8 +1888,8 @@ def sparse_softmax(sp_input, name=None):
                       [sp_input.indices, sp_input.values]) as name:
     out_vals = gen_sparse_ops.sparse_softmax(sp_input.indices, sp_input.values,
                                              sp_input.dense_shape)
-    return sparse_tensor.SparseTensor(
-        sp_input.indices, out_vals, sp_input.dense_shape)
+    return sparse_tensor.SparseTensor(sp_input.indices, out_vals,
+                                      sp_input.dense_shape)
 
 
 @tf_export("sparse_maximum")
@@ -1907,9 +1915,9 @@ def sparse_maximum(sp_a, sp_b, name=None):
   Returns:
     output: the output SparseTensor.
   """
-  with ops.name_scope(name, "SparseSparseMaximum", [sp_a.indices, sp_a.values,
-                                                    sp_b.indices,
-                                                    sp_b.values]) as name:
+  with ops.name_scope(
+      name, "SparseSparseMaximum",
+      [sp_a.indices, sp_a.values, sp_b.indices, sp_b.values]) as name:
     out_indices, out_values = gen_sparse_ops.sparse_sparse_maximum(
         sp_a.indices,
         sp_a.values,
@@ -1944,9 +1952,9 @@ def sparse_minimum(sp_a, sp_b, name=None):
   Returns:
     output: the output SparseTensor.
   """
-  with ops.name_scope(name, "SparseSparseMinimum", [sp_a.indices, sp_a.values,
-                                                    sp_b.indices,
-                                                    sp_b.values]) as name:
+  with ops.name_scope(
+      name, "SparseSparseMinimum",
+      [sp_a.indices, sp_a.values, sp_b.indices, sp_b.values]) as name:
     out_indices, out_values = gen_sparse_ops.sparse_sparse_minimum(
         sp_a.indices,
         sp_a.values,
@@ -2010,14 +2018,15 @@ def sparse_transpose(sp_input, perm=None, name=None):
       dense_shape = sp_input.dense_shape
       transposed_dense_shape = array_ops.gather(dense_shape, perm)
     transposed_st = sparse_tensor.SparseTensor(
-        transposed_indices, sp_input.values,
-        transposed_dense_shape)
+        transposed_indices, sp_input.values, transposed_dense_shape)
     transposed_st = sparse_reorder(transposed_st)
     return transposed_st
 
 
-def _add_sparse_to_tensors_map(sp_input, container=None,
-                               shared_name=None, name=None):
+def _add_sparse_to_tensors_map(sp_input,
+                               container=None,
+                               shared_name=None,
+                               name=None):
   """Add a `SparseTensor` to a `SparseTensorsMap` and return its handle.
 
   Args:
@@ -2038,12 +2047,18 @@ def _add_sparse_to_tensors_map(sp_input, container=None,
   sp_input = _convert_to_sparse_tensor(sp_input)
 
   return gen_sparse_ops._add_sparse_to_tensors_map(
-      sp_input.indices, sp_input.values, sp_input.dense_shape,
-      container=container, shared_name=shared_name, name=name)
+      sp_input.indices,
+      sp_input.values,
+      sp_input.dense_shape,
+      container=container,
+      shared_name=shared_name,
+      name=name)
 
 
-def _add_many_sparse_to_tensors_map(sp_input, container=None,
-                                    shared_name=None, name=None):
+def _add_many_sparse_to_tensors_map(sp_input,
+                                    container=None,
+                                    shared_name=None,
+                                    name=None):
   """Add a minibatch `SparseTensor` to a `SparseTensorsMap`, return `N` handles.
 
   The `SparseTensor` must have rank `R` greater than 1, and the first dimension
@@ -2072,12 +2087,18 @@ def _add_many_sparse_to_tensors_map(sp_input, container=None,
   sp_input = _convert_to_sparse_tensor(sp_input)
 
   return gen_sparse_ops._add_many_sparse_to_tensors_map(
-      sp_input.indices, sp_input.values, sp_input.dense_shape,
-      container=container, shared_name=shared_name, name=name)
+      sp_input.indices,
+      sp_input.values,
+      sp_input.dense_shape,
+      container=container,
+      shared_name=shared_name,
+      name=name)
 
 
-def _take_many_sparse_from_tensors_map(
-    sparse_map_op, sparse_handles, rank=None, name=None):
+def _take_many_sparse_from_tensors_map(sparse_map_op,
+                                       sparse_handles,
+                                       rank=None,
+                                       name=None):
   """Read `SparseTensors` from a `SparseTensorsMap` and concatenate them.
 
   The input `sparse_handles` must be a string matrix of shape `[N, 1]` where
@@ -2140,16 +2161,18 @@ def _take_many_sparse_from_tensors_map(
     raise TypeError("sparse_map_op be an Operation")
   if sparse_map_op.type not in ("AddSparseToTensorsMap",
                                 "AddManySparseToTensorsMap"):
-    raise TypeError("sparse_map_op must be one of AddSparseToTensorsMap or "
-                    "AddSparseToTensorsMap. Instead, found `%s`." %
-                    sparse_map_op.type)
+    raise TypeError(
+        "sparse_map_op must be one of AddSparseToTensorsMap or "
+        "AddSparseToTensorsMap. Instead, found `%s`." % sparse_map_op.type)
   with ops.colocate_with(sparse_map_op):
     shared_name = sparse_map_op.get_attr("shared_name") or sparse_map_op.name
     output_indices, output_values, output_shape = (
         gen_sparse_ops._take_many_sparse_from_tensors_map(
-            sparse_handles, dtype=sparse_map_op.get_attr("T"),
+            sparse_handles,
+            dtype=sparse_map_op.get_attr("T"),
             container=sparse_map_op.get_attr("container"),
-            shared_name=shared_name, name=name))
+            shared_name=shared_name,
+            name=name))
 
   # Feed rank data back in, if available
   output_indices.set_shape([None, rank])
diff --git a/tensorflow/python/ops/special_math_ops.py b/tensorflow/python/ops/special_math_ops.py
index 19900870725f5f01c4ba12979265a5533297d4c3..6d7eaababcd94d687ff20dddc35c68a98320a19b 100644
--- a/tensorflow/python/ops/special_math_ops.py
+++ b/tensorflow/python/ops/special_math_ops.py
@@ -155,27 +155,24 @@ def einsum(equation, *inputs, **kwargs):
         indices in its subscript, or
       - the input shapes are inconsistent along a particular axis.
   """
-  name = kwargs.pop("name", None)
+  name = kwargs.pop('name', None)
   if kwargs:
-    raise TypeError("invalid keyword arguments for this function: " +
-                    ", ".join([format(key)
-                               for key in sorted(list(kwargs.keys()))]))
-  with ops.name_scope(name, "einsum", [equation, inputs]) as name:
+    raise TypeError('invalid keyword arguments for this function: ' + ', '.join(
+        [format(key) for key in sorted(list(kwargs.keys()))]))
+  with ops.name_scope(name, 'einsum', [equation, inputs]) as name:
     if '...' in equation:
       raise ValueError('Subscripts with ellipses are not yet supported.')
 
     match = re.match('([a-z,]+)(->[a-z]*)?', equation)
     if not match:
-      raise ValueError(
-          'Indices have incorrect format: %s' % equation
-      )
+      raise ValueError('Indices have incorrect format: %s' % equation)
 
     inputs = list(inputs)
     input_axis_labels = match.group(1).split(',')
 
     if len(inputs) != len(input_axis_labels):
-      raise ValueError('Got %d arguments for equation "%s", expecting %d' % (
-          len(inputs), equation, len(input_axis_labels)))
+      raise ValueError('Got %d arguments for equation "%s", expecting %d' %
+                       (len(inputs), equation, len(input_axis_labels)))
 
     axis_labels = set(''.join(input_axis_labels))
     if match.group(2):
@@ -188,37 +185,36 @@ def einsum(equation, *inputs, **kwargs):
         for ax in axes_:
           counts[ax] += 1
 
-      output_axis_labels = ''.join(sorted(
-          ax for ax in indices
-          if counts[ax] == 1
-      ))
+      output_axis_labels = ''.join(
+          sorted(ax for ax in indices if counts[ax] == 1))
 
     for a in axis_labels:
       input_count = sum(1 for s in input_axis_labels if a in s)
       if input_count > 2 and a not in output_axis_labels:
         logging.warn(
-            'Falling back to exponential-space implementation of einsum() because'
-            ' index "%s" is summed over more than two inputs.', a)
+            'Falling back to exponential-space implementation of einsum()'
+            ' because index "%s" is summed over more than two inputs.', a)
         return _exponential_space_einsum(equation, *inputs)
 
     temp = inputs[0]
     temp_axis_labels = input_axis_labels[0]
-    for i in xrange(len(inputs)-1):
-      axes_to_sum = (set(temp_axis_labels) & set(input_axis_labels[i+1])
-                     - set(output_axis_labels))
-      temp, temp_axis_labels = _einsum_reduction(temp,
-                                                 temp_axis_labels,
-                                                 inputs[i+1],
-                                                 input_axis_labels[i+1],
-                                                 axes_to_sum)
+    for i in xrange(len(inputs) - 1):
+      axes_to_sum = (
+          set(temp_axis_labels) &
+          set(input_axis_labels[i + 1]) - set(output_axis_labels))
+      temp, temp_axis_labels = _einsum_reduction(
+          temp, temp_axis_labels, inputs[i + 1], input_axis_labels[i + 1],
+          axes_to_sum)
 
     missing_indices = set(temp_axis_labels) - set(output_axis_labels)
     if missing_indices:
-      reduction_indices = [i for i, a in enumerate(temp_axis_labels)
-                           if a not in output_axis_labels]
+      reduction_indices = [
+          i for i, a in enumerate(temp_axis_labels)
+          if a not in output_axis_labels
+      ]
       temp = math_ops.reduce_sum(temp, reduction_indices=reduction_indices)
-      temp_axis_labels = ''.join(a for a in temp_axis_labels
-                                 if a in output_axis_labels)
+      temp_axis_labels = ''.join(
+          a for a in temp_axis_labels if a in output_axis_labels)
 
     if sorted(temp_axis_labels) != sorted(output_axis_labels):
       raise ValueError('Invalid equation: %s' % equation)
@@ -296,8 +292,10 @@ def _einsum_reduction(t0, t0_axis_labels, t1, t1_axis_labels, axes_to_sum):
       return (1, a)
 
   axis_labels = [t0_axis_labels, t1_axis_labels]
-  sorted_axes = [sorted(sym_list, key=lambda a: sort_key(i, a))
-                 for i, sym_list in enumerate(axis_labels)]
+  sorted_axes = [
+      sorted(sym_list, key=lambda a: sort_key(i, a))
+      for i, sym_list in enumerate(axis_labels)
+  ]
   inputs = [t0, t1]
   for i, axes_str in enumerate(axis_labels):
     perm = [axes_str.find(a) for a in sorted_axes[i]]
@@ -325,30 +323,30 @@ def _einsum_reduction(t0, t0_axis_labels, t1, t1_axis_labels, axes_to_sum):
     num_broadcast_elements_t0 = _total_size(
         t0_shape[len(preserved_axes):-len(axes_to_sum)])
     num_summed_elements = _total_size(t0_shape[-len(axes_to_sum):])
-    new_shape = (t0_shape[:len(preserved_axes)]
-                 + [num_broadcast_elements_t0, num_summed_elements])
+    new_shape = (
+        t0_shape[:len(preserved_axes)] +
+        [num_broadcast_elements_t0, num_summed_elements])
     t0 = _reshape_if_necessary(t0, new_shape)
 
     t1_shape = _get_shape(t1)
     num_broadcast_elements_t1 = _total_size(
-        t1_shape[len(preserved_axes)+len(axes_to_sum):])
-    new_shape = (t1_shape[:len(preserved_axes)]
-                 + [num_summed_elements, num_broadcast_elements_t1])
+        t1_shape[len(preserved_axes) + len(axes_to_sum):])
+    new_shape = (
+        t1_shape[:len(preserved_axes)] +
+        [num_summed_elements, num_broadcast_elements_t1])
     t1 = _reshape_if_necessary(t1, new_shape)
 
     product = math_ops.matmul(t0, t1)
 
     # Undo compaction of broadcast axes
     uncompacted_shape = (
-        t0_shape[:len(preserved_axes)+len(broadcast_axes[0])]
-        + t1_shape[len(t1_shape)-len(broadcast_axes[1]):]
-    )
+        t0_shape[:len(preserved_axes) + len(broadcast_axes[0])] +
+        t1_shape[len(t1_shape) - len(broadcast_axes[1]):])
     product = _reshape_if_necessary(product, uncompacted_shape)
 
     product_axes = (
-        sorted_axes[0][:len(preserved_axes)+len(broadcast_axes[0])] +
-        sorted_axes[1][len(sorted_axes[1])-len(broadcast_axes[1]):]
-    )
+        sorted_axes[0][:len(preserved_axes) + len(broadcast_axes[0])] +
+        sorted_axes[1][len(sorted_axes[1]) - len(broadcast_axes[1]):])
 
     return product, ''.join(product_axes)
 
@@ -402,13 +400,11 @@ def _total_size(shape_values):
 def _exponential_space_einsum(equation, *inputs):
   """Fallback implementation that supports summing an index over > 2 inputs."""
   if '...' in equation:
-    raise ValueError("Subscripts with ellipses are not yet supported.")
+    raise ValueError('Subscripts with ellipses are not yet supported.')
 
   match = re.match('([a-z,]+)(->[a-z]*)?', equation)
   if not match:
-    raise ValueError(
-        'Indices have incorrect format: %s' % equation
-    )
+    raise ValueError('Indices have incorrect format: %s' % equation)
 
   inputs = list(inputs)
   idx_in = match.group(1).split(',')
@@ -425,21 +421,15 @@ def _exponential_space_einsum(equation, *inputs):
       for ax in axes_:
         counts[ax] += 1
 
-    idx_out = ''.join(sorted(
-        ax for ax in indices
-        if counts[ax] == 1
-    ))
+    idx_out = ''.join(sorted(ax for ax in indices if counts[ax] == 1))
 
   if len(idx_in) != len(inputs):
-    raise ValueError(
-        'Expected %d inputs but got %d' % (len(idx_in), len(inputs))
-    )
+    raise ValueError('Expected %d inputs but got %d' % (len(idx_in),
+                                                        len(inputs)))
 
   missing_idx = set(idx_out).difference(idx_all)
   if missing_idx:
-    raise ValueError(
-        'Unknown output axes: %s' % missing_idx
-    )
+    raise ValueError('Unknown output axes: %s' % missing_idx)
 
   axis_order = {}
   for ax in indices:
@@ -452,18 +442,17 @@ def _exponential_space_einsum(equation, *inputs):
   for i, (input_, axes_) in enumerate(zip(inputs, idx_in)):
     if input_.get_shape().ndims != len(axes_):
       raise ValueError(
-        'Input %d with axes %s has incorrect' \
-        ' number of dimensions (expected %d, got %d)' % (
-          i, axes_, len(axes_), input_.get_shape().ndims
-        )
+          'Input %d with axes %s has incorrect' \
+          ' number of dimensions (expected %d, got %d)' % (
+              i, axes_, len(axes_), input_.get_shape().ndims
+          )
       )
 
     sorted_idx = sorted(axes_, key=axis_order.get)
 
     if len(set(axes_)) != len(axes_):
       raise ValueError(
-          'Subscript not supported: an axis appears more than once: %s' % axes_
-      )
+          'Subscript not supported: an axis appears more than once: %s' % axes_)
 
     if list(axes_) != sorted_idx:
       permuted = [axes_.find(ax) for ax in sorted_idx]
@@ -487,16 +476,15 @@ def _exponential_space_einsum(equation, *inputs):
           dims.append(dim)
 
     if len(set(dims)) > 1:
-      raise ValueError(
-          'Dimension mismatch on axis: %s' % ax
-      )
+      raise ValueError('Dimension mismatch on axis: %s' % ax)
 
     if ax not in idx_out:
       reduction_idx.append(j)
 
   # reshape, multiply
-  expanded_inputs = [array_ops.reshape(input_, shape)
-                     for input_, shape in zip(inputs, shapes)]
+  expanded_inputs = [
+      array_ops.reshape(input_, shape) for input_, shape in zip(inputs, shapes)
+  ]
   expanded_output = 1
   for input_ in expanded_inputs:
     expanded_output *= input_
diff --git a/tensorflow/python/ops/special_math_ops_test.py b/tensorflow/python/ops/special_math_ops_test.py
index c1a66717d86dd8278dbe676f1714d226351c245f..2c212f45483eacfd3fd27eecb8d7b2c846b5fe96 100644
--- a/tensorflow/python/ops/special_math_ops_test.py
+++ b/tensorflow/python/ops/special_math_ops_test.py
@@ -39,8 +39,9 @@ class LBetaTest(test.TestCase):
     x_one_half = [2, 1.]
     with self.test_session(use_gpu=True):
       self.assertAllClose(1, math_ops.exp(special_math_ops.lbeta(x_one)).eval())
-      self.assertAllClose(
-          0.5, math_ops.exp(special_math_ops.lbeta(x_one_half)).eval())
+      self.assertAllClose(0.5,
+                          math_ops.exp(
+                              special_math_ops.lbeta(x_one_half)).eval())
       self.assertEqual([], special_math_ops.lbeta(x_one).get_shape())
 
   def test_one_dimensional_arg_dynamic(self):
@@ -70,8 +71,9 @@ class LBetaTest(test.TestCase):
     # Should evaluate to 1/2.
     x_one_half = [[2, 1.], [2, 1.]]
     with self.test_session(use_gpu=True):
-      self.assertAllClose(
-          [0.5, 0.5], math_ops.exp(special_math_ops.lbeta(x_one_half)).eval())
+      self.assertAllClose([0.5, 0.5],
+                          math_ops.exp(
+                              special_math_ops.lbeta(x_one_half)).eval())
       self.assertEqual((2,), special_math_ops.lbeta(x_one_half).get_shape())
 
   def test_two_dimensional_arg_dynamic(self):
@@ -86,10 +88,12 @@ class LBetaTest(test.TestCase):
     # Should evaluate to 1/2.
     x_one_half = [[2, 1.], [2, 1.]]
     with self.test_session(use_gpu=True):
-      self.assertAllClose(
-          [0.5, 0.5], math_ops.exp(special_math_ops.lbeta(x_one_half)).eval())
+      self.assertAllClose([0.5, 0.5],
+                          math_ops.exp(
+                              special_math_ops.lbeta(x_one_half)).eval())
       self.assertEqual(
-          (2,), array_ops.shape(special_math_ops.lbeta(x_one_half)).eval())
+          (2,),
+          array_ops.shape(special_math_ops.lbeta(x_one_half)).eval())
       self.assertEqual(
           tensor_shape.TensorShape([2]),
           special_math_ops.lbeta(x_one_half).get_shape())
@@ -97,8 +101,8 @@ class LBetaTest(test.TestCase):
   def test_complicated_shape(self):
     with self.test_session(use_gpu=True):
       x = ops.convert_to_tensor(np.random.rand(3, 2, 2))
-      self.assertAllEqual(
-          (3, 2), array_ops.shape(special_math_ops.lbeta(x)).eval())
+      self.assertAllEqual((3, 2),
+                          array_ops.shape(special_math_ops.lbeta(x)).eval())
       self.assertEqual(
           tensor_shape.TensorShape([3, 2]),
           special_math_ops.lbeta(x).get_shape())
@@ -155,7 +159,6 @@ class EinsumTest(test.TestCase):
       'ijk->i',
       'ijk->kji',
       'ji,kj->ik',
-
       'ikl,kji->kl',
       'klj,lki->ij',
       'ijk,ilj->kli',
@@ -164,7 +167,6 @@ class EinsumTest(test.TestCase):
       'i,ijk,j->k',
       'ij,ij,jk,kl->il',
       'ij,kj,il,jm->ml',
-
       'a,ab,abc->abc',
       'a,b,ab->ab',
       'ab,ab,c->',
@@ -173,25 +175,21 @@ class EinsumTest(test.TestCase):
       'ab,ab,cd,cd->ac',
       'ab,ab,cd,cd->cd',
       'ab,ab,cd,cd,ef,ef->',
-
       'ab,cd,ef->abcdef',
       'ab,cd,ef->acdf',
       'ab,cd,de->abcde',
       'ab,cd,de->be',
       'ab,bcd,cd->abcd',
       'ab,bcd,cd->abd',
-
       'eb,cb,fb->cef',
       'abcd,ad',
       'bd,db,eac->ace',
       'ba,ac,da->bcd',
-
       'ab,ab',
       'ab,ba',
       'abc,abc',
       'abc,bac',
       'abc,cba',
-
       'dba,ead,cad->bce',
       'aef,fbc,dca->bde',
   ]
@@ -234,10 +232,8 @@ class EinsumTest(test.TestCase):
   def test_invalid(self):
     for axes in self.invalid_cases:
       inputs = [
-          array_ops.placeholder(
-              dtypes.float32, shape=(3, 4)),
-          array_ops.placeholder(
-              dtypes.float32, shape=(3, 4)),
+          array_ops.placeholder(dtypes.float32, shape=(3, 4)),
+          array_ops.placeholder(dtypes.float32, shape=(3, 4)),
       ]
       with self.assertRaises(ValueError):
         _ = special_math_ops.einsum(axes, *inputs)
@@ -245,16 +241,22 @@ class EinsumTest(test.TestCase):
   def test_invalid_keyword_arguments(self):
     m0 = array_ops.placeholder(dtypes.int32, shape=(1, None))
     m1 = array_ops.placeholder(dtypes.int32, shape=(None, 1))
-    with self.assertRaisesRegexp(TypeError,
+    with self.assertRaisesRegexp(
+        TypeError,
         'invalid keyword arguments for this function: invalid1, invalid2'):
-      _ = special_math_ops.einsum('ij,jk->ik', m0, m1, name="name",
-                                  invalid1="value1", invalid2="value2")
+      _ = special_math_ops.einsum(
+          'ij,jk->ik',
+          m0,
+          m1,
+          name='name',
+          invalid1='value1',
+          invalid2='value2')
 
   def test_dim_mismatch(self):
     for axes, input_shapes in self.dim_mismatch_cases:
       inputs = [
-          array_ops.placeholder(
-              dtypes.float32, shape=shape) for shape in input_shapes
+          array_ops.placeholder(dtypes.float32, shape=shape)
+          for shape in input_shapes
       ]
       with self.assertRaises(ValueError):
         _ = special_math_ops.einsum(axes, *inputs)
@@ -291,8 +293,8 @@ class EinsumTest(test.TestCase):
             m0: [[1, 2, 3]],
             m1: [[2], [1], [1]],
         }
-        np.testing.assert_almost_equal(
-            [[7]], sess.run(out, feed_dict=feed_dict))
+        np.testing.assert_almost_equal([[7]], sess.run(
+            out, feed_dict=feed_dict))
 
     with ops.Graph().as_default():
       m0 = array_ops.placeholder(dtypes.int32, shape=(None, 3))
@@ -312,11 +314,11 @@ class EinsumTest(test.TestCase):
       out = special_math_ops.einsum('ijk,kl->ijl', m0, m1)
       with session.Session() as sess:
         feed_dict = {
-            m0: [[[1,2]]],
+            m0: [[[1, 2]]],
             m1: [[3], [2]],
         }
-        np.testing.assert_almost_equal(
-            [[[7]]], sess.run(out, feed_dict=feed_dict))
+        np.testing.assert_almost_equal([[[7]]],
+                                       sess.run(out, feed_dict=feed_dict))
 
     with ops.Graph().as_default():
       m0 = array_ops.placeholder(dtypes.int32, shape=(2, 1))
@@ -325,10 +327,10 @@ class EinsumTest(test.TestCase):
       with session.Session() as sess:
         feed_dict = {
             m0: [[3], [2]],
-            m1: [[[1,2]]],
+            m1: [[[1, 2]]],
         }
-        np.testing.assert_almost_equal(
-            [[[7]]], sess.run(out, feed_dict=feed_dict))
+        np.testing.assert_almost_equal([[[7]]],
+                                       sess.run(out, feed_dict=feed_dict))
 
     with ops.Graph().as_default():
       m0 = array_ops.placeholder(dtypes.int32, shape=(None, None, 2))
@@ -339,8 +341,8 @@ class EinsumTest(test.TestCase):
             m0: [[[1, 2]]],
             m1: [3, 2],
         }
-        np.testing.assert_almost_equal(
-           [[7]], sess.run(out, feed_dict=feed_dict))
+        np.testing.assert_almost_equal([[7]], sess.run(
+            out, feed_dict=feed_dict))
 
     with ops.Graph().as_default():
       m0 = array_ops.placeholder(dtypes.int32, shape=(None, 2, None, 2))
@@ -351,8 +353,8 @@ class EinsumTest(test.TestCase):
             m0: [[[[1, 2]], [[2, 1]]]],
             m1: [[3, 2]],
         }
-        np.testing.assert_almost_equal(
-            [[[7, 8]]], sess.run(out, feed_dict=feed_dict))
+        np.testing.assert_almost_equal([[[7, 8]]],
+                                       sess.run(out, feed_dict=feed_dict))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/ops/standard_ops.py b/tensorflow/python/ops/standard_ops.py
index 30bf4e4ef1b96ea68e9020621f37551ac619a3c2..737b923415f4a36ee7886ec0eb2e6dfa78be1de4 100644
--- a/tensorflow/python/ops/standard_ops.py
+++ b/tensorflow/python/ops/standard_ops.py
@@ -26,6 +26,7 @@ import sys as _sys
 from tensorflow.python.ops import array_grad
 from tensorflow.python.ops import data_flow_grad
 from tensorflow.python.ops import math_grad
+from tensorflow.python.ops import manip_grad
 from tensorflow.python.ops import sparse_grad
 from tensorflow.python.ops import spectral_grad
 from tensorflow.python.ops import state_grad
@@ -59,6 +60,7 @@ from tensorflow.python.ops.logging_ops import Print
 from tensorflow.python.ops.logging_ops import get_summary_op
 from tensorflow.python.ops.lookup_ops import initialize_all_tables
 from tensorflow.python.ops.lookup_ops import tables_initializer
+from tensorflow.python.ops.manip_ops import *
 from tensorflow.python.ops.math_ops import *
 from tensorflow.python.ops.numerics import *
 from tensorflow.python.ops.parsing_ops import *
@@ -105,6 +107,7 @@ from tensorflow.python.ops import init_ops as _init_ops
 from tensorflow.python.ops import io_ops as _io_ops
 from tensorflow.python.ops import linalg_ops as _linalg_ops
 from tensorflow.python.ops import logging_ops as _logging_ops
+from tensorflow.python.ops import manip_ops as _manip_ops
 from tensorflow.python.ops import math_ops as _math_ops
 from tensorflow.python.ops import numerics as _numerics
 from tensorflow.python.ops import parsing_ops as _parsing_ops
@@ -280,6 +283,7 @@ remove_undocumented(__name__, _allowed_symbols,
                      _io_ops,
                      _linalg_ops,
                      _logging_ops,
+                     _manip_ops,
                      _math_ops,
                      _numerics,
                      _parsing_ops,
diff --git a/tensorflow/python/ops/template.py b/tensorflow/python/ops/template.py
index 84449e00beb4d2901f57c7cd41a4e755fe343c8c..806fdd3da7aa6de01b7cd4d9d36dbf43f6139db6 100644
--- a/tensorflow/python/ops/template.py
+++ b/tensorflow/python/ops/template.py
@@ -140,7 +140,7 @@ def make_template(name_, func_, create_scope_now_=False, unique_name_=None,
     re-enter the scope and reuse those variables.
 
   Raises:
-    ValueError: if the name is None.
+    ValueError: if `name_` is None.
   """
   return make_template_internal(
       name_,
@@ -176,16 +176,14 @@ def make_template_internal(name_,
     custom_getter_: Optional custom getter for variables used in `func_`. See
       the @{tf.get_variable} `custom_getter` documentation for
       more information.
-    create_graph_function_: When True, the first invocation of the template will
-      execute `func_` as is, to allow for variable creation; however, the second
-      invocation and every invocation thereafter will execute func as a graph
-      function.  In particular, this implies that `func_` must satisfy the
-      properties that `function.defun` requires of functions: See the
-      documentation of `function.defun` for details. When executing eagerly,
-      setting this flag to True can improve performance. Regardless of whether
-      eager execution is enabled, enabling this flag gives the caller access to
-      graph-function semantics, i.e., accesses to variables are totally ordered
-      and side-effecting ops are not pruned.
+    create_graph_function_: When True, `func_` will be executed as a graph
+      function. This implies that `func_` must satisfy the properties that
+      `function.defun` requires of functions: See the documentation of
+      `function.defun` for details. When executing eagerly, setting this flag to
+      True can improve performance. Regardless of whether eager execution is
+      enabled, enabling this flag gives the caller access to graph-function
+      semantics, i.e., accesses to variables are totally ordered and
+      side-effecting ops are not pruned.
     **kwargs: Keyword arguments to apply to `func_`.
 
   Returns:
@@ -198,8 +196,8 @@ def make_template_internal(name_,
     re-enter the scope and reuse those variables.
 
   Raises:
-    ValueError: if the name is None.
-    ValueError: if unique_name_ is not None and eager execution is enabled.
+    ValueError: if `name_` is None.
+    ValueError: if `unique_name_` is not None and eager execution is enabled.
   """
 
   if kwargs:
@@ -266,18 +264,18 @@ class Template(object):
         template of the same scope/unique_name already exists and reuse is
         false, an error is raised. Defaults to None.
       custom_getter: optional custom getter to pass to `variable_scope()`
-      create_graph_function: When True, the first invocation of the template
-        will execute `func` as is, to allow for variable creation; however, the
-        second invocation and every invocation thereafter will execute `func` as
-        a graph function. Enabling this flag gives the caller access to
-        graph-function semantics, i.e., accesses to variables are totally
-        ordered and side-effecting ops are not pruned.
-
+      create_graph_function: When True, `func` will be executed as a graph
+        function. Enabling this flag gives the caller access to graph-function
+        semantics, i.e., accesses to variables are totally ordered and
+        side-effecting ops are not pruned.
 
     Raises:
-      ValueError: if the name is None.
+      ValueError: if `name` is None.
     """
-    self._func = func
+    if create_graph_function:
+      self._func = function.defun(func)
+    else:
+      self._func = func
     self._stacktrace = traceback.format_stack()[:-2]
     self._name = name
     self._unique_name = unique_name
@@ -295,19 +293,13 @@ class Template(object):
     # This variable keeps track of whether the template has been called yet,
     # which is not the same as whether the scope has been created.
     self._variables_created = False
-    self._create_graph_function = create_graph_function
 
   def _call_func(self, args, kwargs):
     try:
       vars_at_start = len(ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES))
       trainable_at_start = len(
           ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES))
-
       result = self._func(*args, **kwargs)
-      if self._create_graph_function and not self._variables_created:
-        # Only execute self._func as a graph function once variables are
-        # created.
-        self._func = function.defun(self._func)
 
       if self._variables_created:
         # Variables were previously created, implying this is not the first
@@ -542,14 +534,11 @@ class EagerTemplate(Template):
         names of all created Tensors. If set to False, the scope will be created
         at the first call location.
       custom_getter: optional custom getter to pass to `variable_scope()`
-      create_graph_function: When True, the first invocation of the template
-        will execute `func` as is, to allow for variable creation; however, the
-        second invocation and every invocation thereafter will execute `func` as
-        a graph function. Enabling this flag allows the caller to reap the
-        performance benefits associated with executing graphs, at the cost of
-        sacrificing debuggability; however, not all functions can be compiled
-        into graph functions. See the documentation for `function.defun` for
-        details.
+      create_graph_function: When True, `func` will be executed as a graph
+        function. Enabling this flag allows the caller to reap the performance
+        benefits associated with executing graphs, at the cost of sacrificing
+        debuggability; however, not all Python functions can be compiled into
+        graph functions. See the documentation for `function.defun` for details.
 
     Raises:
       RuntimeError: if eager execution is not enabled.
@@ -573,12 +562,7 @@ class EagerTemplate(Template):
     try:
       vars_at_start = self._template_store.variables()
       trainable_at_start = self._template_store.trainable_variables()
-
       result = self._func(*args, **kwargs)
-      if self._create_graph_function and not self._variables_created:
-        # Only execute self._func as a graph function once variables are
-        # created.
-        self._func = function.defun(self._func)
 
       if self._variables_created:
         # Variables were previously created, implying this is not the first
diff --git a/tensorflow/python/ops/variable_scope.py b/tensorflow/python/ops/variable_scope.py
index c52d5fff5d7dd250157627bdcfee51382faef1b5..81565a63774da49628d100ef071b02f6311f6af2 100644
--- a/tensorflow/python/ops/variable_scope.py
+++ b/tensorflow/python/ops/variable_scope.py
@@ -27,6 +27,7 @@ import sys
 import traceback
 
 import six
+from six import iteritems
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
 from tensorflow.python.eager import context
@@ -770,8 +771,8 @@ class _VariableStore(object):
     if initializer is None:
       initializer, initializing_from_value = self._get_default_initializer(
           name=name, shape=shape, dtype=dtype)
-    # Clear control dependencies while creating the initializer.
-    with ops.control_dependencies(None):
+    # Enter an init scope when creating the initializer.
+    with ops.init_scope():
       if initializing_from_value:
         init_val = initializer
         variable_dtype = None
@@ -1242,6 +1243,36 @@ class EagerVariableStore(object):
                   key=lambda x: x.name)
     # pylint: enable=protected-access
 
+  def copy(self):
+    """Copy this variable store and all of its contents.
+
+    Variables contained in this store will be copied over to the new variable
+    store, meaning that they can be modified without affecting the variables in
+    this store.
+
+    Returns:
+      A new EagerVariableStore instance containing copied variables.
+    """
+    # pylint: disable=protected-access
+    new_store = EagerVariableStore()
+    for key, var in iteritems(self._store._vars):
+      # Strip device out of variable name.
+      try:
+        index = var.name.index(":")
+      except ValueError:
+        stripped_var_name = var.name
+      else:
+        stripped_var_name = var.name[:index]
+
+      # Create new variable with same value, name, and "trainable" flag.
+      new_var = resource_variable_ops.ResourceVariable(
+          var.read_value(),
+          name=stripped_var_name,
+          trainable=var._trainable)
+      new_store._store._vars[key] = new_var
+    return new_store
+    # pylint: enable=protected-access
+
 
 @tf_export("get_variable")
 def get_variable(name,
diff --git a/tensorflow/python/ops/variables.py b/tensorflow/python/ops/variables.py
index 7d7fa646c08523c5f572f8f4593c1d8fe8615c67..19e3298e4019f94132db25ab0dae5ed458bfbeb3 100644
--- a/tensorflow/python/ops/variables.py
+++ b/tensorflow/python/ops/variables.py
@@ -28,6 +28,7 @@ from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import state_ops
+from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import compat
 from tensorflow.python.util import tf_should_use
 from tensorflow.python.util.deprecation import deprecated
@@ -211,6 +212,7 @@ class Variable(object):
     if not context.in_graph_mode():
       raise RuntimeError("tf.Variable not supported in Eager mode. "
                          "Please use tfe.Variable instead")
+    self._in_graph_mode = context.in_graph_mode()
     if variable_def:
       # If variable_def is provided, recreates the variable from its fields.
       if initial_value:
@@ -306,7 +308,7 @@ class Variable(object):
 
     if trainable and ops.GraphKeys.TRAINABLE_VARIABLES not in collections:
       collections = list(collections) + [ops.GraphKeys.TRAINABLE_VARIABLES]
-    with ops.control_dependencies(None):
+    with ops.init_scope():
       with ops.name_scope(name, "Variable", [] if init_from_fn else
                           [initial_value]) as name:
 
@@ -377,8 +379,8 @@ class Variable(object):
         else:
           with ops.colocate_with(self._variable.op):
             self._snapshot = array_ops.identity(self._variable, name="read")
+      ops.add_to_collections(collections, self)
 
-    ops.add_to_collections(collections, self)
     self._caching_device = caching_device
     self._save_slice_info = None
     self._constraint = constraint
@@ -552,7 +554,7 @@ class Variable(object):
       A `Tensor` holding the value of this variable after its initializer
       has run.
     """
-    with ops.control_dependencies(None):
+    with ops.init_scope():
       return control_flow_ops.cond(is_variable_initialized(self),
                                    self.read_value,
                                    lambda: self.initial_value)
@@ -1021,6 +1023,61 @@ class Variable(object):
     return Variable(variable_def=variable_def,
                     import_scope=import_scope)
 
+  def __iadd__(self, other):
+    logging.log_first_n(
+        logging.WARN,
+        "Variable += will be deprecated. Use variable.assign_add"
+        " if you want assignment to the variable value or 'x = x + y'"
+        " if you want a new python Tensor object.", 1)
+    return self + other
+
+  def __isub__(self, other):
+    logging.log_first_n(
+        logging.WARN,
+        "Variable -= will be deprecated. Use variable.assign_sub"
+        " if you want assignment to the variable value or 'x = x - y'"
+        " if you want a new python Tensor object.", 1)
+    return self - other
+
+  def __imul__(self, other):
+    logging.log_first_n(
+        logging.WARN,
+        "Variable *= will be deprecated. Use variable.assign_mul"
+        " if you want assignment to the variable value or 'x = x * y'"
+        " if you want a new python Tensor object.", 1)
+    return self * other
+
+  def __idiv__(self, other):
+    logging.log_first_n(
+        logging.WARN,
+        "Variable /= will be deprecated. Use variable.assign_div"
+        " if you want assignment to the variable value or 'x = x / y'"
+        " if you want a new python Tensor object.", 1)
+    return self / other
+
+  def __itruediv__(self, other):
+    logging.log_first_n(
+        logging.WARN,
+        "Variable /= will be deprecated. Use variable.assign_div"
+        " if you want assignment to the variable value or 'x = x / y'"
+        " if you want a new python Tensor object.", 1)
+    return self / other
+
+  def __irealdiv__(self, other):
+    logging.log_first_n(
+        logging.WARN,
+        "Variable /= will be deprecated. Use variable.assign_div"
+        " if you want assignment to the variable value or 'x = x / y'"
+        " if you want a new python Tensor object.", 1)
+    return self / other
+
+  def __ipow__(self, other):
+    logging.log_first_n(
+        logging.WARN,
+        "Variable **= will be deprecated. Use 'x = x ** y'"
+        " if you want a new python Tensor object.", 1)
+    return self ** other
+
   class SaveSliceInfo(object):
     """Information on how to save this Variable as a slice.
 
diff --git a/tensorflow/python/platform/app.py b/tensorflow/python/platform/app.py
index 9b92d9a18005ca5e6be3820427e3a3ba60a8ec2d..cce64c0ccafc29a9d0d0b51b4c97c5673264657b 100644
--- a/tensorflow/python/platform/app.py
+++ b/tensorflow/python/platform/app.py
@@ -23,6 +23,7 @@ import sys as _sys
 
 from tensorflow.python.platform import flags
 from tensorflow.python.util.all_util import remove_undocumented
+from tensorflow.python.util.tf_export import tf_export
 
 
 def _usage(shorthelp):
@@ -108,6 +109,7 @@ def _define_help_flags():
     _define_help_flags_called = True
 
 
+@tf_export('app.run')
 def run(main=None, argv=None):
   """Runs the program with an optional 'main' function and 'argv' list."""
 
diff --git a/tensorflow/python/platform/resource_loader.py b/tensorflow/python/platform/resource_loader.py
index 2455acb4c0c469acbb928c4ec44571e50e06de1f..8f7b12e2b2b92d9b2bfe397d0e7cba59e11bc1f6 100644
--- a/tensorflow/python/platform/resource_loader.py
+++ b/tensorflow/python/platform/resource_loader.py
@@ -29,8 +29,10 @@ import sys as _sys
 
 from tensorflow.python.util import tf_inspect as _inspect
 from tensorflow.python.util.all_util import remove_undocumented
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('resource_loader.load_resource')
 def load_resource(path):
   """Load the resource at given path, where path is relative to tensorflow/.
 
@@ -52,6 +54,7 @@ def load_resource(path):
 
 
 # pylint: disable=protected-access
+@tf_export('resource_loader.get_data_files_path')
 def get_data_files_path():
   """Get a direct path to the data files colocated with the script.
 
@@ -62,6 +65,7 @@ def get_data_files_path():
   return _os.path.dirname(_inspect.getfile(_sys._getframe(1)))
 
 
+@tf_export('resource_loader.get_root_dir_with_all_resources')
 def get_root_dir_with_all_resources():
   """Get a root directory containing all the data attributes in the build rule.
 
@@ -101,6 +105,7 @@ def get_root_dir_with_all_resources():
   return data_files_dir or script_dir
 
 
+@tf_export('resource_loader.get_path_to_datafile')
 def get_path_to_datafile(path):
   """Get the path to the specified file in the data dependencies.
 
@@ -120,6 +125,7 @@ def get_path_to_datafile(path):
   return _os.path.join(data_files_path, path)
 
 
+@tf_export('resource_loader.readahead_file_path')
 def readahead_file_path(path, readahead='128M'):  # pylint: disable=unused-argument
   """Readahead files not implemented; simply returns given path."""
   return path
diff --git a/tensorflow/python/platform/stacktrace_handler_test.py b/tensorflow/python/platform/stacktrace_handler_test.py
index 3f0e534f4cbd97ecbd7db1fae3b48af72310c24f..f2071f9d54ceb99831999ec08ab71d63862f1c36 100644
--- a/tensorflow/python/platform/stacktrace_handler_test.py
+++ b/tensorflow/python/platform/stacktrace_handler_test.py
@@ -57,7 +57,8 @@ class StacktraceHandlerTest(test.TestCase):
 
     # Capture its output. capture both stdout and stderr and append them.
     # We are not worried about timing or order of messages in this test.
-    child_output = child_process.stdout.read() + child_process.stderr.read()
+    child_stdout, child_stderr = child_process.communicate()
+    child_output = child_stdout + child_stderr
 
     # Make sure the child process is dead before we proceed.
     child_process.wait()
diff --git a/tensorflow/python/platform/tf_logging.py b/tensorflow/python/platform/tf_logging.py
index 85ed4f071c7022801f20db75d538e5917b8eea66..22aabfd7121ac9b2eebeae2693f174e044d504ef 100644
--- a/tensorflow/python/platform/tf_logging.py
+++ b/tensorflow/python/platform/tf_logging.py
@@ -35,6 +35,7 @@ import threading
 import six
 
 from tensorflow.python.util.all_util import remove_undocumented
+from tensorflow.python.util.tf_export import tf_export
 
 
 # Don't use this directly. Use _get_logger() instead.
@@ -90,30 +91,37 @@ def _get_logger():
     _logger_lock.release()
 
 
+@tf_export('logging.log')
 def log(level, msg, *args, **kwargs):
   _get_logger().log(level, msg, *args, **kwargs)
 
 
+@tf_export('logging.debug')
 def debug(msg, *args, **kwargs):
   _get_logger().debug(msg, *args, **kwargs)
 
 
+@tf_export('logging.error')
 def error(msg, *args, **kwargs):
   _get_logger().error(msg, *args, **kwargs)
 
 
+@tf_export('logging.fatal')
 def fatal(msg, *args, **kwargs):
   _get_logger().fatal(msg, *args, **kwargs)
 
 
+@tf_export('logging.info')
 def info(msg, *args, **kwargs):
   _get_logger().info(msg, *args, **kwargs)
 
 
+@tf_export('logging.warn')
 def warn(msg, *args, **kwargs):
   _get_logger().warn(msg, *args, **kwargs)
 
 
+@tf_export('logging.warning')
 def warning(msg, *args, **kwargs):
   _get_logger().warning(msg, *args, **kwargs)
 
@@ -136,15 +144,18 @@ _log_prefix = None  # later set to google2_log_prefix
 _log_counter_per_token = {}
 
 
+@tf_export('logging.TaskLevelStatusMessage')
 def TaskLevelStatusMessage(msg):
   error(msg)
 
 
+@tf_export('logging.flush')
 def flush():
   raise NotImplementedError()
 
 
 # Code below is taken from pyglib/logging
+@tf_export('logging.vlog')
 def vlog(level, msg, *args, **kwargs):
   _get_logger().log(level, msg, *args, **kwargs)
 
@@ -164,6 +175,7 @@ def _GetNextLogCountPerToken(token):
   return _log_counter_per_token[token]
 
 
+@tf_export('logging.log_every_n')
 def log_every_n(level, msg, n, *args):
   """Log 'msg % args' at level 'level' once per 'n' times.
 
@@ -180,6 +192,7 @@ def log_every_n(level, msg, n, *args):
   log_if(level, msg, not (count % n), *args)
 
 
+@tf_export('logging.log_first_n')
 def log_first_n(level, msg, n, *args):  # pylint: disable=g-bad-name
   """Log 'msg % args' at level 'level' only first 'n' times.
 
@@ -195,6 +208,7 @@ def log_first_n(level, msg, n, *args):  # pylint: disable=g-bad-name
   log_if(level, msg, count < n, *args)
 
 
+@tf_export('logging.log_if')
 def log_if(level, msg, condition, *args):
   """Log 'msg % args' at level 'level' only if condition is fulfilled."""
   if condition:
@@ -251,11 +265,13 @@ def google2_log_prefix(level, timestamp=None, file_and_line=None):
   return s
 
 
+@tf_export('logging.get_verbosity')
 def get_verbosity():
   """Return how much logging output will be produced."""
   return _get_logger().getEffectiveLevel()
 
 
+@tf_export('logging.set_verbosity')
 def set_verbosity(v):
   """Sets the threshold for what messages will be logged."""
   _get_logger().setLevel(v)
@@ -296,4 +312,10 @@ _allowed_symbols = [
     'warning',
 ]
 
+tf_export('logging.DEBUG').export_constant(__name__, 'DEBUG')
+tf_export('logging.ERROR').export_constant(__name__, 'ERROR')
+tf_export('logging.FATAL').export_constant(__name__, 'FATAL')
+tf_export('logging.INFO').export_constant(__name__, 'INFO')
+tf_export('logging.WARN').export_constant(__name__, 'WARN')
+
 remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/python/profiler/model_analyzer.py b/tensorflow/python/profiler/model_analyzer.py
index 8f780545607f7ba2337c83ad2c3740f542b802f6..0e20ca35bba606079ed5b0f225dd3029772b5af3 100644
--- a/tensorflow/python/profiler/model_analyzer.py
+++ b/tensorflow/python/profiler/model_analyzer.py
@@ -33,6 +33,7 @@ from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.profiler import option_builder
 from tensorflow.python.profiler import tfprof_logger
+from tensorflow.python.util.tf_export import tf_export
 
 _DEFAULT_PROFILE_OPTIONS = 0
 _DEFAULT_ADVISE_OPTIONS = 0
@@ -121,6 +122,7 @@ def _build_advisor_options(options):
   return opts
 
 
+@tf_export('profiler.Profiler')
 class Profiler(object):
   """TensorFlow multi-step profiler.
 
@@ -304,6 +306,7 @@ class Profiler(object):
     print_mdl.WriteProfile(filename)
 
 
+@tf_export('profiler.profile')
 def profile(graph=None,
             run_meta=None,
             op_log=None,
@@ -378,6 +381,7 @@ def profile(graph=None,
   return tfprof_node
 
 
+@tf_export('profiler.advise')
 def advise(graph=None, run_meta=None, options=_DEFAULT_ADVISE_OPTIONS):
   """Auto profile and advise.
 
diff --git a/tensorflow/python/profiler/model_analyzer_test.py b/tensorflow/python/profiler/model_analyzer_test.py
index 915385558889e64277611bd71251f8f937a18159..04ba28c219e276e1ca79bd4e20e7d1b6ee700db5 100644
--- a/tensorflow/python/profiler/model_analyzer_test.py
+++ b/tensorflow/python/profiler/model_analyzer_test.py
@@ -224,15 +224,15 @@ class PrintModelAnalysisTest(test.TestCase):
         # pylint: disable=line-too-long
         with gfile.Open(outfile, 'r') as f:
           lines = f.read().split('\n')
+          self.assertGreater(len(lines), 5)
           result = '\n'.join([l[:min(len(l), 80)] for l in lines])
-          self.assertEqual(
-              compat.as_bytes(
-                  'node name | # parameters | # float_ops\n_TFProfRoot (--/2.84k params, --/168.86k flops)\n  model_analyzer_testlib.py:63:BuildFullModel (0/1.80k params, 0/45.37k flops)\n    model_analyzer_testlib.py:40:BuildSmallModel (0/0 params, 0/0 flops)\n    model_analyzer_testlib.py:44:BuildSmallModel (0/4 params, 0/8 flops)\n    model_analyzer_testlib.py:48:BuildSmallModel (0/648 params, 0/1.30k flops)\n    model_analyzer_testlib.py:49:BuildSmallModel (0/0 params, 0/23.33k flops)\n    model_analyzer_testlib.py:53:BuildSmallModel (0/1.15k params, 0/2.30k flops)\n    model_analyzer_testlib.py:54:BuildSmallModel (0/0 params, 0/18.43k flops)\n  model_analyzer_testlib.py:63:BuildFullModel (gradient) (0/0 params, 0/67.39k f\n    model_analyzer_testlib.py:49:BuildSmallModel (gradient) (0/0 params, 0/46.66\n    model_analyzer_testlib.py:54:BuildSmallModel (gradient) (0/0 params, 0/20.74\n  model_analyzer_testlib.py:67:BuildFullModel (0/1.04k params, 0/18.58k flops)\n  model_analyzer_testlib.py:67:BuildFullModel (gradient) (0/0 params, 0/37.00k f\n  model_analyzer_testlib.py:69:BuildFullModel (0/0 params, 0/0 flops)\n  model_analyzer_testlib.py:70:BuildFullModel (0/0 params, 0/258 flops)\n  model_analyzer_testlib.py:70:BuildFullModel (gradient) (0/0 params, 0/129 flop\n  model_analyzer_testlib.py:72:BuildFullModel (0/0 params, 0/141 flops)\n'
-              ), compat.as_bytes(lib.CheckAndRemoveDoc(result)))
+          self.assertTrue(
+              compat.as_text(lib.CheckAndRemoveDoc(result))
+              .startswith('node name | # parameters | # float_ops'))
 
         self.assertLess(0, tfprof_node.total_exec_micros)
         self.assertEqual(2844, tfprof_node.total_parameters)
-        self.assertEqual(168863, tfprof_node.total_float_ops)
+        self.assertLess(168800, tfprof_node.total_float_ops)
         self.assertEqual(8, len(tfprof_node.children))
         self.assertEqual('_TFProfRoot', tfprof_node.name)
         self.assertEqual(
diff --git a/tensorflow/python/profiler/option_builder.py b/tensorflow/python/profiler/option_builder.py
index 13942ad6a2adc1f1d1cad778ebd280d358f64a59..957ebe6dddc26118024f71cadef861f38f1805e0 100644
--- a/tensorflow/python/profiler/option_builder.py
+++ b/tensorflow/python/profiler/option_builder.py
@@ -20,8 +20,10 @@ from __future__ import print_function
 import copy
 
 from tensorflow.python.profiler import tfprof_logger
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('profiler.ProfileOptionBuilder')
 class ProfileOptionBuilder(object):
   # pylint: disable=line-too-long
   """Option Builder for Profiling API.
diff --git a/tensorflow/python/profiler/tfprof_logger.py b/tensorflow/python/profiler/tfprof_logger.py
index ffda7ddad759ce68bf718bcfa6e568cfadd59b53..8d121064967f2f87cd0aefaa361bfd6f387a3e6e 100644
--- a/tensorflow/python/profiler/tfprof_logger.py
+++ b/tensorflow/python/profiler/tfprof_logger.py
@@ -30,6 +30,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.platform import gfile
 from tensorflow.python.profiler.internal import flops_registry  # pylint: disable=unused-import
+from tensorflow.python.util.tf_export import tf_export
 
 TRAINABLE_VARIABLES = '_trainable_variables'
 REGISTERED_FLOP_STATS = 'flops'
@@ -187,6 +188,7 @@ def merge_default_with_oplog(graph, op_log=None, run_meta=None,
   return tmp_op_log
 
 
+@tf_export('profiler.write_op_log')
 def write_op_log(graph, log_dir, op_log=None, run_meta=None, add_trace=True):
   """Log provided 'op_log', and add additional model information below.
 
diff --git a/tensorflow/python/saved_model/loader_impl.py b/tensorflow/python/saved_model/loader_impl.py
index 5ff954fd9f83989565e007cad3f0f66913e0a4dd..6e85df0cbf5623691d38b030036958e5955399ee 100644
--- a/tensorflow/python/saved_model/loader_impl.py
+++ b/tensorflow/python/saved_model/loader_impl.py
@@ -232,13 +232,9 @@ def load(sess, tags, export_dir, **saver_kwargs):
     asset_tensors_dictionary = _get_asset_tensors(export_dir,
                                                   meta_graph_def_to_load)
 
-    main_op_tensor = _get_main_op_tensor(meta_graph_def_to_load)
+    main_op_tensor = (_get_main_op_tensor(meta_graph_def_to_load) or
+                      (_get_legacy_init_op_tensor(meta_graph_def_to_load)))
     if main_op_tensor is not None:
       sess.run(fetches=[main_op_tensor], feed_dict=asset_tensors_dictionary)
-    else:
-      legacy_init_op_tensor = _get_legacy_init_op_tensor(meta_graph_def_to_load)
-      if legacy_init_op_tensor is not None:
-        sess.run(
-            fetches=[legacy_init_op_tensor], feed_dict=asset_tensors_dictionary)
 
     return meta_graph_def_to_load
diff --git a/tensorflow/python/saved_model/saved_model_test.py b/tensorflow/python/saved_model/saved_model_test.py
index 1ea619ff55dea00f8ee09024ab45dcd324a2ddce..f92247d52e4150b2347a95e84d4bbf9c6ffc258a 100644
--- a/tensorflow/python/saved_model/saved_model_test.py
+++ b/tensorflow/python/saved_model/saved_model_test.py
@@ -54,8 +54,14 @@ def tearDownModule():
   file_io.delete_recursively(test.get_temp_dir())
 
 
+@test_util.with_c_api
 class SavedModelTest(test.TestCase):
 
+  def _get_export_dir(self, label):
+    if ops._USE_C_API:
+      label += "_c_api"
+    return os.path.join(test.get_temp_dir(), label)
+
   def _init_and_validate_variable(self, sess, variable_name, variable_value):
     v = variables.Variable(variable_value, name=variable_name)
     sess.run(variables.global_variables_initializer())
@@ -123,8 +129,7 @@ class SavedModelTest(test.TestCase):
     self.assertFalse(loader.maybe_saved_model_directory(base_path))
 
   def testBadSavedModelFileFormat(self):
-    export_dir = os.path.join(test.get_temp_dir(),
-                              "test_bad_saved_model_file_format")
+    export_dir = self._get_export_dir("test_bad_saved_model_file_format")
     # Attempt to load a SavedModel from an export directory that does not exist.
     with self.test_session(graph=ops.Graph()) as sess:
       with self.assertRaisesRegexp(IOError,
@@ -157,8 +162,7 @@ class SavedModelTest(test.TestCase):
         loader.load(sess, ["foo"], export_dir)
 
   def testVerifySessionGraphUsage(self):
-    export_dir = os.path.join(test.get_temp_dir(),
-                              "test_verify_session_graph_usage")
+    export_dir = self._get_export_dir("test_verify_session_graph_usage")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
     with self.test_session(graph=ops.Graph()) as sess:
@@ -178,7 +182,7 @@ class SavedModelTest(test.TestCase):
           42, ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)[0].eval())
 
   def testSequence(self):
-    export_dir = os.path.join(test.get_temp_dir(), "test_sequence")
+    export_dir = self._get_export_dir("test_sequence")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
     # Expect an assertion error since add_meta_graph_and_variables() should be
@@ -195,7 +199,7 @@ class SavedModelTest(test.TestCase):
                         sess, ["baz"])
 
   def testTags(self):
-    export_dir = os.path.join(test.get_temp_dir(), "test_tags")
+    export_dir = self._get_export_dir("test_tags")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
     # Graph with a single variable. SavedModel invoked to:
@@ -284,7 +288,7 @@ class SavedModelTest(test.TestCase):
                         export_dir)
 
   def testVariables(self):
-    export_dir = os.path.join(test.get_temp_dir(), "test_variables")
+    export_dir = self._get_export_dir("test_variables")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
     # Graph with two variables. SavedModel invoked to:
@@ -336,7 +340,7 @@ class SavedModelTest(test.TestCase):
                         export_dir)
 
   def testGraphWithoutVariables(self):
-    export_dir = os.path.join(test.get_temp_dir(), "test_graph_has_variables")
+    export_dir = self._get_export_dir("test_graph_has_variables")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
     # Graph with no variables.
@@ -371,7 +375,7 @@ class SavedModelTest(test.TestCase):
       self.assertEqual(30.0, sess.run(c))
 
   def testNoOverwrite(self):
-    export_dir = os.path.join(test.get_temp_dir(), "test_no_overwrite")
+    export_dir = self._get_export_dir("test_no_overwrite")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
     # Graph with a single variable. SavedModel invoked to:
@@ -395,7 +399,7 @@ class SavedModelTest(test.TestCase):
                       export_dir)
 
   def testSaveAsText(self):
-    export_dir = os.path.join(test.get_temp_dir(), "test_astext")
+    export_dir = self._get_export_dir("test_astext")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
     # Graph with a single variable. SavedModel invoked to:
@@ -426,7 +430,7 @@ class SavedModelTest(test.TestCase):
           42, ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)[0].eval())
 
   def testCollections(self):
-    export_dir = os.path.join(test.get_temp_dir(), "test_collections")
+    export_dir = self._get_export_dir("test_collections")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
     # Graph with a single variable added to a collection. SavedModel invoked to:
@@ -476,7 +480,7 @@ class SavedModelTest(test.TestCase):
       self.assertEqual(len(ops.get_collection("foo_vars")), 0)
 
   def testSignatureDefs(self):
-    export_dir = os.path.join(test.get_temp_dir(), "test_signature_defs")
+    export_dir = self._get_export_dir("test_signature_defs")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
     # Graph with a single variable and a single entry in the signature def map.
@@ -536,8 +540,7 @@ class SavedModelTest(test.TestCase):
       self.assertEqual("foo_new", bar_signature["foo_key"].method_name)
 
   def testSignatureDefValidation(self):
-    export_dir = os.path.join(test.get_temp_dir(),
-                              "test_signature_def_validation")
+    export_dir = self._get_export_dir("test_signature_def_validation")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
     tensor_without_name = meta_graph_pb2.TensorInfo()
@@ -555,7 +558,7 @@ class SavedModelTest(test.TestCase):
     self._validate_outputs_tensor_info(builder, tensor_empty)
 
   def testAssets(self):
-    export_dir = os.path.join(test.get_temp_dir(), "test_assets")
+    export_dir = self._get_export_dir("test_assets")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
     with self.test_session(graph=ops.Graph()) as sess:
@@ -588,7 +591,7 @@ class SavedModelTest(test.TestCase):
       self.assertFalse(file_io.file_exists(ignored_asset_path))
 
   def testCustomMainOp(self):
-    export_dir = os.path.join(test.get_temp_dir(), "test_main_op")
+    export_dir = self._get_export_dir("test_main_op")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
     with self.test_session(graph=ops.Graph()) as sess:
@@ -623,7 +626,7 @@ class SavedModelTest(test.TestCase):
       self.assertEqual(3, ops.get_collection("v")[2].eval())
 
   def testLegacyInitOp(self):
-    export_dir = os.path.join(test.get_temp_dir(), "test_legacy_init_op")
+    export_dir = self._get_export_dir("test_legacy_init_op")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
     with self.test_session(graph=ops.Graph()) as sess:
@@ -657,8 +660,8 @@ class SavedModelTest(test.TestCase):
       self.assertEqual(3, ops.get_collection("v")[2].eval())
 
   def testLegacyInitOpWithNonEmptyCollection(self):
-    export_dir = os.path.join(test.get_temp_dir(),
-                              "test_legacy_init_op_with_non_empty_collection")
+    export_dir = self._get_export_dir(
+        "test_legacy_init_op_with_non_empty_collection")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
     with self.test_session(graph=ops.Graph()) as sess:
@@ -685,7 +688,7 @@ class SavedModelTest(test.TestCase):
             sess, ["foo"], legacy_init_op=legacy_init_op)
 
   def testMultipleAssets(self):
-    export_dir = os.path.join(test.get_temp_dir(), "test_multiple_assets")
+    export_dir = self._get_export_dir("test_multiple_assets")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
     with self.test_session(graph=ops.Graph()) as sess:
@@ -727,7 +730,7 @@ class SavedModelTest(test.TestCase):
                                       "asset_file_tensor:0")
 
   def testDuplicateAssets(self):
-    export_dir = os.path.join(test.get_temp_dir(), "test_duplicate_assets")
+    export_dir = self._get_export_dir("test_duplicate_assets")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
     with self.test_session(graph=ops.Graph()) as sess:
@@ -775,7 +778,7 @@ class SavedModelTest(test.TestCase):
                                       "asset_file_tensor:0")
 
   def testOp(self):
-    export_dir = os.path.join(test.get_temp_dir(), "test_op")
+    export_dir = self._get_export_dir("test_op")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
     with session.Session(
@@ -818,7 +821,7 @@ class SavedModelTest(test.TestCase):
       self.assertEqual(3, ops.get_collection("v")[2].eval())
 
   def testCustomSaveable(self):
-    export_dir = os.path.join(test.get_temp_dir(), "custom_saveable")
+    export_dir = self._get_export_dir("custom_saveable")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
     with session.Session(
@@ -847,7 +850,7 @@ class SavedModelTest(test.TestCase):
       self.assertEqual(3.0, v1.values().eval())
 
   def testClearDevices(self):
-    export_dir = os.path.join(test.get_temp_dir(), "test_clear_devices")
+    export_dir = self._get_export_dir("test_clear_devices")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
     # Specify a device and save a variable.
@@ -871,7 +874,9 @@ class SavedModelTest(test.TestCase):
           42, ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)[0].eval())
 
   def testStripDefaultAttrs(self):
-    export_dir = os.path.join(test.get_temp_dir(), "test_strip_default_attrs")
+    if ops._USE_C_API: return  # TODO(skyewm): get this working
+
+    export_dir = self._get_export_dir("test_strip_default_attrs")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
     # Add a graph with two float32 variables and a Complex Op composing them
@@ -941,8 +946,10 @@ class SavedModelTest(test.TestCase):
     self.assertIn("Tout", node_def.attr)
 
   def testStripDefaultAttrsInconsistentConsumerDefaults(self):
-    export_dir = os.path.join(test.get_temp_dir(),
-                              "test_strip_default_attrs_no_consumer_defaults")
+    if ops._USE_C_API: return  # TODO(skyewm): get this working
+
+    export_dir = self._get_export_dir(
+        "test_strip_default_attrs_no_consumer_defaults")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
     # Add a graph with two float32 variables and a Complex Op composing them
diff --git a/tensorflow/python/saved_model/simple_save.py b/tensorflow/python/saved_model/simple_save.py
index 9a81e5cd80705482865e05b040d712418a993da1..1e4cc733706c774107184c2fb19037da3d4e733d 100644
--- a/tensorflow/python/saved_model/simple_save.py
+++ b/tensorflow/python/saved_model/simple_save.py
@@ -40,17 +40,20 @@ def simple_save(session, export_dir, inputs, outputs, legacy_init_op=None):
     - It will be treated as a graph for inference / serving (i.e. uses the tag
       `tag_constants.SERVING`)
     - The SavedModel will load in TensorFlow Serving and supports the
-      [Predict API](https://github.com/tensorflow/serving/blob/master/tensorflow_serving/apis/predict.proto).
+      [Predict
+      API](https://github.com/tensorflow/serving/blob/master/tensorflow_serving/apis/predict.proto).
       To use the Classify, Regress, or MultiInference APIs, please
       use either
       [tf.Estimator](https://www.tensorflow.org/api_docs/python/tf/estimator/Estimator)
       or the lower level
-      [SavedModel APIs](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md).
+      [SavedModel
+      APIs](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md).
     - Some TensorFlow ops depend on information on disk or other information
       called "assets". These are generally handled automatically by adding the
       assets to the `GraphKeys.ASSET_FILEPATHS` collection. Only assets in that
       collection are exported; if you need more custom behavior, you'll need to
-      use the [SavedModelBuilder](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/builder.py).
+      use the
+      [SavedModelBuilder](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/builder.py).
 
   More information about SavedModel and signatures can be found here:
   https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md.
diff --git a/tensorflow/python/summary/writer/writer.py b/tensorflow/python/summary/writer/writer.py
index 12f120116f4439059f42c7212469ee835cc13ef4..1f3f2287043c021d636113b5a8807c9f4adf77aa 100644
--- a/tensorflow/python/summary/writer/writer.py
+++ b/tensorflow/python/summary/writer/writer.py
@@ -32,6 +32,7 @@ from tensorflow.python.platform import gfile
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.summary import plugin_asset
 from tensorflow.python.summary.writer.event_file_writer import EventFileWriter
+from tensorflow.python.util.tf_export import tf_export
 
 _PLUGINS_DIR = "plugins"
 
@@ -276,6 +277,7 @@ class SummaryToEventTransformer(object):
     self.event_writer.add_event(event)
 
 
+@tf_export("summary.FileWriter")
 class FileWriter(SummaryToEventTransformer):
   """Writes `Summary` protocol buffers to event files.
 
diff --git a/tensorflow/python/summary/writer/writer_cache.py b/tensorflow/python/summary/writer/writer_cache.py
index bad289303c0fd0de7836b03a6762d04505521a89..645fa28a37fb125b6b1224961251bc8879d5fe6d 100644
--- a/tensorflow/python/summary/writer/writer_cache.py
+++ b/tensorflow/python/summary/writer/writer_cache.py
@@ -22,8 +22,10 @@ import threading
 
 from tensorflow.python.framework import ops
 from tensorflow.python.summary.writer.writer import FileWriter
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('summary.FileWriterCache')
 class FileWriterCache(object):
   """Cache for file writers.
 
diff --git a/tensorflow/python/tools/freeze_graph.py b/tensorflow/python/tools/freeze_graph.py
index 0ddf09260b6865b4bac5b580459e6080dae7ada0..fd78f44c999e41f404a6499a0addd2131da6d287 100644
--- a/tensorflow/python/tools/freeze_graph.py
+++ b/tensorflow/python/tools/freeze_graph.py
@@ -72,7 +72,8 @@ def freeze_graph_with_def_protos(input_graph_def,
                                  variable_names_blacklist="",
                                  input_meta_graph_def=None,
                                  input_saved_model_dir=None,
-                                 saved_model_tags=None):
+                                 saved_model_tags=None,
+                                 checkpoint_version=saver_pb2.SaverDef.V2):
   """Converts all variables in a graph and checkpoint into constants."""
   del restore_op_name, filename_tensor_name  # Unused by updated loading code.
 
@@ -100,7 +101,8 @@ def freeze_graph_with_def_protos(input_graph_def,
     _ = importer.import_graph_def(input_graph_def, name="")
   with session.Session() as sess:
     if input_saver_def:
-      saver = saver_lib.Saver(saver_def=input_saver_def)
+      saver = saver_lib.Saver(saver_def=input_saver_def,
+                              write_version=checkpoint_version)
       saver.restore(sess, input_checkpoint)
     elif input_meta_graph_def:
       restorer = saver_lib.import_meta_graph(
@@ -124,7 +126,8 @@ def freeze_graph_with_def_protos(input_graph_def,
           # 'global_step' or a similar housekeeping element) so skip it.
           continue
         var_list[key] = tensor
-      saver = saver_lib.Saver(var_list=var_list)
+      saver = saver_lib.Saver(var_list=var_list,
+                              write_version=checkpoint_version)
       saver.restore(sess, input_checkpoint)
       if initializer_nodes:
         sess.run(initializer_nodes.split(","))
@@ -217,7 +220,8 @@ def freeze_graph(input_graph,
                  variable_names_blacklist="",
                  input_meta_graph=None,
                  input_saved_model_dir=None,
-                 saved_model_tags=tag_constants.SERVING):
+                 saved_model_tags=tag_constants.SERVING,
+                 checkpoint_version=saver_pb2.SaverDef.V2):
   """Converts all variables in a graph and checkpoint into constants."""
   input_graph_def = None
   if input_saved_model_dir:
@@ -236,7 +240,8 @@ def freeze_graph(input_graph,
       input_graph_def, input_saver_def, input_checkpoint, output_node_names,
       restore_op_name, filename_tensor_name, output_graph, clear_devices,
       initializer_nodes, variable_names_whitelist, variable_names_blacklist,
-      input_meta_graph_def, input_saved_model_dir, saved_model_tags.split(","))
+      input_meta_graph_def, input_saved_model_dir,
+      saved_model_tags.split(","), checkpoint_version=checkpoint_version)
 
 
 def main(unused_args):
@@ -246,7 +251,7 @@ def main(unused_args):
                FLAGS.output_graph, FLAGS.clear_devices, FLAGS.initializer_nodes,
                FLAGS.variable_names_whitelist, FLAGS.variable_names_blacklist,
                FLAGS.input_meta_graph, FLAGS.input_saved_model_dir,
-               FLAGS.saved_model_tags)
+               FLAGS.saved_model_tags, FLAGS.checkpoint_version)
 
 
 if __name__ == "__main__":
@@ -267,6 +272,11 @@ if __name__ == "__main__":
       type=str,
       default="",
       help="TensorFlow variables file to load.")
+  parser.add_argument(
+      "--checkpoint_version",
+      type=int,
+      default=saver_pb2.SaverDef.V2,
+      help="Tensorflow variable file format")
   parser.add_argument(
       "--output_graph",
       type=str,
diff --git a/tensorflow/python/tools/freeze_graph_test.py b/tensorflow/python/tools/freeze_graph_test.py
index feeed7102cd49a79d0280cc04431de00ad3286d5..342732465d48f40a4ffeac97146fb1b6d564c568 100644
--- a/tensorflow/python/tools/freeze_graph_test.py
+++ b/tensorflow/python/tools/freeze_graph_test.py
@@ -86,7 +86,8 @@ class FreezeGraphTest(test_util.TensorFlowTestCase):
     freeze_graph.freeze_graph(
         input_graph_path, input_saver_def_path, input_binary, checkpoint_path,
         output_node_names, restore_op_name, filename_tensor_name,
-        output_graph_path, clear_devices, "", "", input_meta_graph)
+        output_graph_path, clear_devices, "", "", input_meta_graph,
+        checkpoint_version=saver_write_version)
 
     # Now we make sure the variable is now a constant, and that the graph still
     # produces the expected result.
diff --git a/tensorflow/python/tools/inspect_checkpoint.py b/tensorflow/python/tools/inspect_checkpoint.py
index 8716058e619d8e970834ec4d57e4d8ff21559d5c..dd876cbe7fcd64a8de70eb28f67996df9de1dd7d 100644
--- a/tensorflow/python/tools/inspect_checkpoint.py
+++ b/tensorflow/python/tools/inspect_checkpoint.py
@@ -97,8 +97,9 @@ def parse_numpy_printoption(kv_str):
     raise argparse.ArgumentTypeError(
         "Setting '%s' from the command line is not supported." % k)
   try:
-    v = (v_type(v_str) if v_type is not bool
-         else flags.BooleanParser().parse(v_str))
+    v = (
+        v_type(v_str)
+        if v_type is not bool else flags.BooleanParser().parse(v_str))
   except ValueError as e:
     raise argparse.ArgumentTypeError(e.message)
   np.set_printoptions(**{k: v})
@@ -121,9 +122,12 @@ if __name__ == "__main__":
   parser = argparse.ArgumentParser()
   parser.register("type", "bool", lambda v: v.lower() == "true")
   parser.add_argument(
-      "--file_name", type=str, default="", help="Checkpoint filename. "
-                    "Note, if using Checkpoint V2 format, file_name is the "
-                    "shared prefix between all files in the checkpoint.")
+      "--file_name",
+      type=str,
+      default="",
+      help="Checkpoint filename. "
+      "Note, if using Checkpoint V2 format, file_name is the "
+      "shared prefix between all files in the checkpoint.")
   parser.add_argument(
       "--tensor_name",
       type=str,
diff --git a/tensorflow/python/tools/optimize_for_inference_lib.py b/tensorflow/python/tools/optimize_for_inference_lib.py
index c2687bf557b03ff588fd369771077c92ba012a15..9c1927122252f45ddfa8092045c7589fa0f45532 100644
--- a/tensorflow/python/tools/optimize_for_inference_lib.py
+++ b/tensorflow/python/tools/optimize_for_inference_lib.py
@@ -349,6 +349,7 @@ def fold_batch_norms(input_graph_def):
     bias_add_op.op = "BiasAdd"
     bias_add_op.name = node.name
     bias_add_op.attr["T"].CopyFrom(conv_op.attr["T"])
+    bias_add_op.attr["data_format"].CopyFrom(conv_op.attr["data_format"])
     bias_add_op.input.extend([new_conv_op.name, offset_op.name])
     new_ops.extend([scaled_weights_op, new_conv_op, offset_op, bias_add_op])
 
diff --git a/tensorflow/python/tools/optimize_for_inference_test.py b/tensorflow/python/tools/optimize_for_inference_test.py
index 6dd24c0dca1d326592e4f33eba4e6233248dac5f..2ef612473b4bb64f611983f87f8cdd619a2d8a38 100644
--- a/tensorflow/python/tools/optimize_for_inference_test.py
+++ b/tensorflow/python/tools/optimize_for_inference_test.py
@@ -29,6 +29,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import importer
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_nn_ops
 from tensorflow.python.ops import image_ops
@@ -38,6 +39,7 @@ from tensorflow.python.platform import test
 from tensorflow.python.tools import optimize_for_inference_lib
 
 
+@test_util.with_c_api
 class OptimizeForInferenceTest(test.TestCase):
 
   def create_node_def(self, op, name, inputs):
@@ -145,7 +147,7 @@ class OptimizeForInferenceTest(test.TestCase):
           np.array([0.1, 0.6]), shape=[2], dtype=dtypes.float32)
       gamma_op = constant_op.constant(
           np.array([1.0, 2.0]), shape=[2], dtype=dtypes.float32)
-      ops.get_default_graph().graph_def_versions.producer = 8
+      test_util.set_producer_version(ops.get_default_graph(), 8)
       gen_nn_ops._batch_norm_with_global_normalization(
           conv_op,
           mean_op,
@@ -171,48 +173,53 @@ class OptimizeForInferenceTest(test.TestCase):
       self.assertNotEqual("BatchNormWithGlobalNormalization", node.op)
 
   def testFoldFusedBatchNorms(self):
-    with self.test_session() as sess:
-      inputs = [1, 4, 2, 5, 3, 6, -1, -4, -2, -5, -3, -6]
-      input_op = constant_op.constant(
-          np.array(inputs), shape=[1, 1, 6, 2], dtype=dtypes.float32)
-      weights = [1, 2, 3, 4, 0.1, 0.2, 0.3, 0.4]
-      weights_op = constant_op.constant(
-          np.array(weights), shape=[1, 2, 2, 2], dtype=dtypes.float32)
-      conv_op = nn_ops.conv2d(
-          input_op, weights_op, [1, 1, 1, 1], padding="SAME", name="conv_op")
-      mean_op = constant_op.constant(
-          np.array([10, 20]), shape=[2], dtype=dtypes.float32)
-      variance_op = constant_op.constant(
-          np.array([0.25, 0.5]), shape=[2], dtype=dtypes.float32)
-      beta_op = constant_op.constant(
-          np.array([0.1, 0.6]), shape=[2], dtype=dtypes.float32)
-      gamma_op = constant_op.constant(
-          np.array([1.0, 2.0]), shape=[2], dtype=dtypes.float32)
-      ops.get_default_graph().graph_def_versions.producer = 9
-      gen_nn_ops._fused_batch_norm(
-          conv_op,
-          gamma_op,
-          beta_op,
-          mean_op,
-          variance_op,
-          0.00001,
-          is_training=False,
-          name="output")
-      original_graph_def = sess.graph_def
-      original_result = sess.run(["output:0"])
-    optimized_graph_def = optimize_for_inference_lib.fold_batch_norms(
-        original_graph_def)
-
-    with self.test_session() as sess:
-      _ = importer.import_graph_def(
-          optimized_graph_def, input_map={}, name="optimized")
-      optimized_result = sess.run(["optimized/output:0"])
-
-    self.assertAllClose(
-        original_result, optimized_result, rtol=1e-04, atol=1e-06)
-
-    for node in optimized_graph_def.node:
-      self.assertNotEqual("FusedBatchNorm", node.op)
+    for data_format, use_gpu in [("NHWC", False), ("NCHW", True)]:
+      with self.test_session(use_gpu=use_gpu) as sess:
+        inputs = [1, 4, 2, 5, 3, 6, -1, -4, -2, -5, -3, -6]
+        input_op = constant_op.constant(
+            np.array(inputs),
+            shape=[1, 1, 6, 2] if data_format == "NHWC" else [1, 2, 1, 6],
+            dtype=dtypes.float32)
+        weights = [1, 2, 3, 4, 0.1, 0.2, 0.3, 0.4]
+        weights_op = constant_op.constant(
+            np.array(weights), shape=[1, 2, 2, 2], dtype=dtypes.float32)
+        conv_op = nn_ops.conv2d(
+            input_op, weights_op, [1, 1, 1, 1], padding="SAME",
+            data_format=data_format, name="conv_op")
+        mean_op = constant_op.constant(
+            np.array([10, 20]), shape=[2], dtype=dtypes.float32)
+        variance_op = constant_op.constant(
+            np.array([0.25, 0.5]), shape=[2], dtype=dtypes.float32)
+        beta_op = constant_op.constant(
+            np.array([0.1, 0.6]), shape=[2], dtype=dtypes.float32)
+        gamma_op = constant_op.constant(
+            np.array([1.0, 2.0]), shape=[2], dtype=dtypes.float32)
+        ops.get_default_graph().graph_def_versions.producer = 9
+        gen_nn_ops._fused_batch_norm(
+            conv_op,
+            gamma_op,
+            beta_op,
+            mean_op,
+            variance_op,
+            0.00001,
+            is_training=False,
+            data_format=data_format,
+            name="output")
+        original_graph_def = sess.graph_def
+        original_result = sess.run(["output:0"])
+      optimized_graph_def = optimize_for_inference_lib.fold_batch_norms(
+          original_graph_def)
+
+      with self.test_session(use_gpu=use_gpu) as sess:
+        _ = importer.import_graph_def(
+            optimized_graph_def, input_map={}, name="optimized")
+        optimized_result = sess.run(["optimized/output:0"])
+
+      self.assertAllClose(
+          original_result, optimized_result, rtol=1e-04, atol=1e-06)
+
+      for node in optimized_graph_def.node:
+        self.assertNotEqual("FusedBatchNorm", node.op)
 
   def testFuseResizePadAndConv(self):
     with self.test_session() as sess:
diff --git a/tensorflow/python/tools/saved_model_cli.py b/tensorflow/python/tools/saved_model_cli.py
index 21e8e803fcb3d12a2e41b5f9e2810742ec220be8..5b0a584c10ea33f345c09324cbd47eb1789466e0 100644
--- a/tensorflow/python/tools/saved_model_cli.py
+++ b/tensorflow/python/tools/saved_model_cli.py
@@ -31,6 +31,7 @@ import warnings
 
 import numpy as np
 
+from six import integer_types
 from tensorflow.contrib.saved_model.python.saved_model import reader
 from tensorflow.contrib.saved_model.python.saved_model import signature_def_utils
 from tensorflow.core.example import example_pb2
@@ -440,7 +441,7 @@ def _create_example_string(example_dict):
     elif isinstance(feature_list[0], str):
       example.features.feature[feature_name].bytes_list.value.extend(
           feature_list)
-    elif isinstance(feature_list[0], (int, long)):
+    elif isinstance(feature_list[0], integer_types):
       example.features.feature[feature_name].int64_list.value.extend(
           feature_list)
     else:
diff --git a/tensorflow/python/training/adadelta.py b/tensorflow/python/training/adadelta.py
index 13c07cfd7bf4333fee3edc3c3ad9d2fb7bcbaad2..c08e3cca007dc17f1112d53bf729c1accf61b5df 100644
--- a/tensorflow/python/training/adadelta.py
+++ b/tensorflow/python/training/adadelta.py
@@ -22,8 +22,10 @@ from tensorflow.python.framework import ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.training import optimizer
 from tensorflow.python.training import training_ops
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("train.AdadeltaOptimizer")
 class AdadeltaOptimizer(optimizer.Optimizer):
   """Optimizer that implements the Adadelta algorithm.
 
diff --git a/tensorflow/python/training/adagrad.py b/tensorflow/python/training/adagrad.py
index afa192f7cc6e0ecd629fd94252d26961f1407183..deb4e6f546379eff330235dbc302a30c44193830 100644
--- a/tensorflow/python/training/adagrad.py
+++ b/tensorflow/python/training/adagrad.py
@@ -25,8 +25,10 @@ from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.training import optimizer
 from tensorflow.python.training import training_ops
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("train.AdagradOptimizer")
 class AdagradOptimizer(optimizer.Optimizer):
   """Optimizer that implements the Adagrad algorithm.
 
diff --git a/tensorflow/python/training/adagrad_da.py b/tensorflow/python/training/adagrad_da.py
index b3f9ea323c2bb4fd9ecee93863fbc7955b47a947..5ba403554f570d9df33a5d525a40de2eb0d11138 100644
--- a/tensorflow/python/training/adagrad_da.py
+++ b/tensorflow/python/training/adagrad_da.py
@@ -23,8 +23,10 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.training import optimizer
 from tensorflow.python.training import training_ops
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("train.AdagradDAOptimizer")
 class AdagradDAOptimizer(optimizer.Optimizer):
   """Adagrad Dual Averaging algorithm for sparse linear models.
 
diff --git a/tensorflow/python/training/adam.py b/tensorflow/python/training/adam.py
index 0c69f8bf3997452f0eeb71c93f4fcf98eb27d8f9..c92f6fc3015960a2b821651231bb94713e0d53dd 100644
--- a/tensorflow/python/training/adam.py
+++ b/tensorflow/python/training/adam.py
@@ -26,8 +26,10 @@ from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.training import optimizer
 from tensorflow.python.training import training_ops
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("train.AdamOptimizer")
 class AdamOptimizer(optimizer.Optimizer):
   """Optimizer that implements the Adam algorithm.
 
diff --git a/tensorflow/python/training/basic_loops.py b/tensorflow/python/training/basic_loops.py
index 52b0f4210612bad4a2e838153ac9cbdb1023bf66..7af821c81928e67e0f258bc064d582a4186995c1 100644
--- a/tensorflow/python/training/basic_loops.py
+++ b/tensorflow/python/training/basic_loops.py
@@ -18,8 +18,10 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.framework import errors
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("train.basic_train_loop")
 def basic_train_loop(supervisor, train_step_fn, args=None,
                      kwargs=None, master=""):
   """Basic loop to train a model.
diff --git a/tensorflow/python/training/basic_session_run_hooks.py b/tensorflow/python/training/basic_session_run_hooks.py
index 752d585cd17e1b1a89abbae7c9e61fa966ad7f93..aae757b99aa9abb2fca112dcc781fc31e367649d 100644
--- a/tensorflow/python/training/basic_session_run_hooks.py
+++ b/tensorflow/python/training/basic_session_run_hooks.py
@@ -47,6 +47,7 @@ from tensorflow.python.training import session_run_hook
 from tensorflow.python.training import training_util
 from tensorflow.python.training.session_run_hook import SessionRunArgs
 from tensorflow.python.training.summary_io import SummaryWriterCache
+from tensorflow.python.util.tf_export import tf_export
 
 
 class _HookTimer(object):
@@ -85,6 +86,7 @@ class _HookTimer(object):
     raise NotImplementedError
 
 
+@tf_export("train.SecondOrStepTimer")
 class SecondOrStepTimer(_HookTimer):
   """Timer that triggers at most once every N seconds or once every N steps.
   """
@@ -164,6 +166,7 @@ class NeverTriggerTimer(_HookTimer):
     return None
 
 
+@tf_export("train.LoggingTensorHook")
 class LoggingTensorHook(session_run_hook.SessionRunHook):
   """Prints the given tensors every N local steps, every N seconds, or at end.
 
@@ -262,6 +265,7 @@ class LoggingTensorHook(session_run_hook.SessionRunHook):
       self._log_tensors(values)
 
 
+@tf_export("train.StopAtStepHook")
 class StopAtStepHook(session_run_hook.SessionRunHook):
   """Hook that requests stop at a specified step."""
 
@@ -317,6 +321,7 @@ class StopAtStepHook(session_run_hook.SessionRunHook):
         run_context.request_stop()
 
 
+@tf_export("train.CheckpointSaverListener")
 class CheckpointSaverListener(object):
   """Interface for listeners that take action before or after checkpoint save.
 
@@ -331,7 +336,7 @@ class CheckpointSaverListener(object):
   `CheckpointSaverHook`, as in this example:
 
   ```python
-  class ExampleCheckpointSaverListerner(CheckpointSaverListener):
+  class ExampleCheckpointSaverListener(CheckpointSaverListener):
     def begin(self):
       # You can add ops to the graph here.
       print('Starting the session.')
@@ -347,7 +352,7 @@ class CheckpointSaverListener(object):
       print('Done with the session.')
 
   ...
-  listener = ExampleCheckpointSaverListerner()
+  listener = ExampleCheckpointSaverListener()
   saver_hook = tf.train.CheckpointSaverHook(
       checkpoint_dir, listeners=[listener])
   with tf.train.MonitoredTrainingSession(chief_only_hooks=[saver_hook]):
@@ -375,6 +380,7 @@ class CheckpointSaverListener(object):
     pass
 
 
+@tf_export("train.CheckpointSaverHook")
 class CheckpointSaverHook(session_run_hook.SessionRunHook):
   """Saves checkpoints every N steps or seconds."""
 
@@ -497,6 +503,7 @@ class CheckpointSaverHook(session_run_hook.SessionRunHook):
     return savers[0]
 
 
+@tf_export("train.StepCounterHook")
 class StepCounterHook(session_run_hook.SessionRunHook):
   """Hook that counts steps per second."""
 
@@ -575,12 +582,14 @@ class StepCounterHook(session_run_hook.SessionRunHook):
     self._last_global_step = stale_global_step
 
 
+@tf_export("train.NanLossDuringTrainingError")
 class NanLossDuringTrainingError(RuntimeError):
 
   def __str__(self):
     return "NaN loss during training."
 
 
+@tf_export("train.NanTensorHook")
 class NanTensorHook(session_run_hook.SessionRunHook):
   """Monitors the loss tensor and stops training if loss is NaN.
 
@@ -612,6 +621,7 @@ class NanTensorHook(session_run_hook.SessionRunHook):
         run_context.request_stop()
 
 
+@tf_export("train.SummarySaverHook")
 class SummarySaverHook(session_run_hook.SessionRunHook):
   """Saves summaries every N steps."""
 
@@ -720,6 +730,7 @@ class SummarySaverHook(session_run_hook.SessionRunHook):
     return summary_op
 
 
+@tf_export("train.GlobalStepWaiterHook")
 class GlobalStepWaiterHook(session_run_hook.SessionRunHook):
   """Delays execution until global step reaches `wait_until_step`.
 
@@ -767,6 +778,7 @@ class GlobalStepWaiterHook(session_run_hook.SessionRunHook):
       time.sleep(0.5)
 
 
+@tf_export("train.FinalOpsHook")
 class FinalOpsHook(session_run_hook.SessionRunHook):
   """A hook which evaluates `Tensors` at the end of a session."""
 
@@ -793,6 +805,7 @@ class FinalOpsHook(session_run_hook.SessionRunHook):
                                            feed_dict=self._final_ops_feed_dict)
 
 
+@tf_export("train.FeedFnHook")
 class FeedFnHook(session_run_hook.SessionRunHook):
   """Runs `feed_fn` and sets the `feed_dict` accordingly."""
 
@@ -810,6 +823,7 @@ class FeedFnHook(session_run_hook.SessionRunHook):
         fetches=None, feed_dict=self.feed_fn())
 
 
+@tf_export("train.ProfilerHook")
 class ProfilerHook(session_run_hook.SessionRunHook):
   """Captures CPU/GPU profiling information every N steps or seconds.
 
diff --git a/tensorflow/python/training/checkpoint_utils.py b/tensorflow/python/training/checkpoint_utils.py
index 5054873bc1c7751e6164a868b91b8ef7be0a5c79..fa3de6fad27b6cc773f9f2e86e9f95395eb7c285 100644
--- a/tensorflow/python/training/checkpoint_utils.py
+++ b/tensorflow/python/training/checkpoint_utils.py
@@ -29,6 +29,7 @@ from tensorflow.python.ops import variables
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import saver
+from tensorflow.python.util.tf_export import tf_export
 
 
 __all__ = [
@@ -36,6 +37,7 @@ __all__ = [
 ]
 
 
+@tf_export("train.load_checkpoint")
 def load_checkpoint(ckpt_dir_or_file):
   """Returns `CheckpointReader` for checkpoint found in `ckpt_dir_or_file`.
 
@@ -60,6 +62,7 @@ def load_checkpoint(ckpt_dir_or_file):
   return pywrap_tensorflow.NewCheckpointReader(filename)
 
 
+@tf_export("train.load_variable")
 def load_variable(ckpt_dir_or_file, name):
   """Returns the tensor value of the given variable in the checkpoint.
 
@@ -77,6 +80,7 @@ def load_variable(ckpt_dir_or_file, name):
   return reader.get_tensor(name)
 
 
+@tf_export("train.list_variables")
 def list_variables(ckpt_dir_or_file):
   """Returns list of all variables in the checkpoint.
 
@@ -95,6 +99,7 @@ def list_variables(ckpt_dir_or_file):
   return result
 
 
+@tf_export("train.init_from_checkpoint")
 def init_from_checkpoint(ckpt_dir_or_file, assignment_map):
   """Initializes current variables with tensors loaded from given checkpoint.
 
@@ -176,7 +181,8 @@ def init_from_checkpoint(ckpt_dir_or_file, assignment_map):
   ckpt_file = _get_checkpoint_filename(ckpt_dir_or_file)
   reader = load_checkpoint(ckpt_dir_or_file)
   variable_map = reader.get_variable_to_shape_map()
-  for tensor_name_in_ckpt, current_var_or_name in six.iteritems(assignment_map):
+  for tensor_name_in_ckpt, current_var_or_name in sorted(
+      six.iteritems(assignment_map)):
     var = None
     # Check if this is Variable object or list of Variable objects (in case of
     # partitioned variables).
@@ -233,7 +239,7 @@ def init_from_checkpoint(ckpt_dir_or_file, assignment_map):
           if "/part_" in var_name:
             var_name = var_name[:var_name.index("/part_")]
           scope_variables.add(var_name)
-      for var_name in scope_variables:
+      for var_name in sorted(scope_variables):
         # Lookup name with specified prefix and suffix from current variable.
         # If tensor_name given is '/' (root), don't use it for full name.
         full_tensor_name = var_name[len(scopes):]
@@ -241,6 +247,9 @@ def init_from_checkpoint(ckpt_dir_or_file, assignment_map):
           full_tensor_name = full_tensor_name[1:]
         if tensor_name_in_ckpt != "/":
           full_tensor_name = tensor_name_in_ckpt + full_tensor_name
+        # Remove trailing '/', if any, in the full_tensor_name
+        if full_tensor_name.endswith("/"):
+          full_tensor_name = full_tensor_name[:-1]
         if full_tensor_name not in variable_map:
           raise ValueError(
               "Tensor %s (%s in %s) is not found in %s checkpoint" % (
diff --git a/tensorflow/python/training/coordinator.py b/tensorflow/python/training/coordinator.py
index 0e31255b74f64657cffc4a2f58798835513f0444..0ff97d85e37e6167f1200ba56940f4a663c259a2 100644
--- a/tensorflow/python/training/coordinator.py
+++ b/tensorflow/python/training/coordinator.py
@@ -27,8 +27,10 @@ import six
 from tensorflow.python.framework import errors
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import compat
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("train.Coordinator")
 class Coordinator(object):
   """A coordinator for threads.
 
@@ -406,6 +408,7 @@ class Coordinator(object):
 
 
 # Threads for the standard services.
+@tf_export("train.LooperThread")
 class LooperThread(threading.Thread):
   """A thread that runs code repeatedly, optionally on a timer.
 
diff --git a/tensorflow/python/training/coordinator_test.py b/tensorflow/python/training/coordinator_test.py
index 149d3eed414d53f46dcab403b7b4822ffa66e644..3e4ac1dfff9708fd1a5cd8bdf23f99d8f963bd16 100644
--- a/tensorflow/python/training/coordinator_test.py
+++ b/tensorflow/python/training/coordinator_test.py
@@ -85,8 +85,8 @@ class CoordinatorTest(test.TestCase):
     self.assertFalse(coord.wait_for_stop(0.1))
     wait_for_stop_ev = threading.Event()
     has_stopped_ev = threading.Event()
-    t = threading.Thread(target=StopOnEvent,
-                         args=(coord, wait_for_stop_ev, has_stopped_ev))
+    t = threading.Thread(
+        target=StopOnEvent, args=(coord, wait_for_stop_ev, has_stopped_ev))
     t.start()
     self.assertFalse(coord.should_stop())
     self.assertFalse(coord.wait_for_stop(0.01))
@@ -100,7 +100,8 @@ class CoordinatorTest(test.TestCase):
     threads = [
         threading.Thread(target=SleepABit, args=(0.01,)),
         threading.Thread(target=SleepABit, args=(0.02,)),
-        threading.Thread(target=SleepABit, args=(0.01,))]
+        threading.Thread(target=SleepABit, args=(0.01,))
+    ]
     for t in threads:
       t.start()
     coord.join(threads)
@@ -112,7 +113,8 @@ class CoordinatorTest(test.TestCase):
     threads = [
         threading.Thread(target=SleepABit, args=(0.01, coord)),
         threading.Thread(target=SleepABit, args=(0.02, coord)),
-        threading.Thread(target=SleepABit, args=(0.01, coord))]
+        threading.Thread(target=SleepABit, args=(0.01, coord))
+    ]
     for t in threads:
       t.start()
     WaitForThreadsToRegister(coord, 3)
@@ -125,7 +127,8 @@ class CoordinatorTest(test.TestCase):
     threads = [
         threading.Thread(target=SleepABit, args=(0.01, coord)),
         threading.Thread(target=SleepABit, args=(0.02,)),
-        threading.Thread(target=SleepABit, args=(0.01, coord))]
+        threading.Thread(target=SleepABit, args=(0.01, coord))
+    ]
     for t in threads:
       t.start()
     WaitForThreadsToRegister(coord, 2)
@@ -135,14 +138,17 @@ class CoordinatorTest(test.TestCase):
       self.assertFalse(t.is_alive())
 
   def testJoinGraceExpires(self):
+
     def TestWithGracePeriod(stop_grace_period):
       coord = coordinator.Coordinator()
       wait_for_stop_ev = threading.Event()
       has_stopped_ev = threading.Event()
       threads = [
-          threading.Thread(target=StopOnEvent,
-                           args=(coord, wait_for_stop_ev, has_stopped_ev)),
-          threading.Thread(target=SleepABit, args=(10.0,))]
+          threading.Thread(
+              target=StopOnEvent,
+              args=(coord, wait_for_stop_ev, has_stopped_ev)),
+          threading.Thread(target=SleepABit, args=(10.0,))
+      ]
       for t in threads:
         t.daemon = True
         t.start()
@@ -150,6 +156,7 @@ class CoordinatorTest(test.TestCase):
       has_stopped_ev.wait()
       with self.assertRaisesRegexp(RuntimeError, "threads still running"):
         coord.join(threads, stop_grace_period_secs=stop_grace_period)
+
     TestWithGracePeriod(1e-10)
     TestWithGracePeriod(0.002)
     TestWithGracePeriod(1.0)
@@ -159,16 +166,16 @@ class CoordinatorTest(test.TestCase):
     wait_for_stop_ev = threading.Event()
     has_stopped_ev = threading.Event()
     threads = [
-        threading.Thread(target=StopOnEvent,
-                         args=(coord, wait_for_stop_ev, has_stopped_ev)),
-        threading.Thread(target=SleepABit, args=(10.0,))]
+        threading.Thread(
+            target=StopOnEvent, args=(coord, wait_for_stop_ev, has_stopped_ev)),
+        threading.Thread(target=SleepABit, args=(10.0,))
+    ]
     for t in threads:
       t.daemon = True
       t.start()
     wait_for_stop_ev.set()
     has_stopped_ev.wait()
-    coord.join(
-        threads, stop_grace_period_secs=1., ignore_live_threads=True)
+    coord.join(threads, stop_grace_period_secs=1., ignore_live_threads=True)
 
   def testJoinRaiseReportExcInfo(self):
     coord = coordinator.Coordinator()
@@ -180,7 +187,8 @@ class CoordinatorTest(test.TestCase):
             args=(coord, ev_1, ev_2, RuntimeError("First"), False)),
         threading.Thread(
             target=RaiseOnEvent,
-            args=(coord, ev_2, None, RuntimeError("Too late"), False))]
+            args=(coord, ev_2, None, RuntimeError("Too late"), False))
+    ]
     for t in threads:
       t.start()
 
@@ -199,7 +207,8 @@ class CoordinatorTest(test.TestCase):
             args=(coord, ev_1, ev_2, RuntimeError("First"), True)),
         threading.Thread(
             target=RaiseOnEvent,
-            args=(coord, ev_2, None, RuntimeError("Too late"), True))]
+            args=(coord, ev_2, None, RuntimeError("Too late"), True))
+    ]
     for t in threads:
       t.start()
 
@@ -214,9 +223,8 @@ class CoordinatorTest(test.TestCase):
         threading.Thread(
             target=RaiseOnEvent,
             args=(coord, ev_1, None,
-                  errors_impl.OutOfRangeError(None, None, "First"),
-                  True))
-        ]
+                  errors_impl.OutOfRangeError(None, None, "First"), True))
+    ]
     for t in threads:
       t.start()
 
@@ -230,7 +238,7 @@ class CoordinatorTest(test.TestCase):
         threading.Thread(
             target=RaiseOnEvent,
             args=(coord, ev_1, None, ValueError("Clean stop"), True))
-        ]
+    ]
     for t in threads:
       t.start()
 
@@ -247,7 +255,8 @@ class CoordinatorTest(test.TestCase):
             args=(coord, ev_1, ev_2, RuntimeError("First"))),
         threading.Thread(
             target=RaiseOnEventUsingContextHandler,
-            args=(coord, ev_2, None, RuntimeError("Too late")))]
+            args=(coord, ev_2, None, RuntimeError("Too late")))
+    ]
     for t in threads:
       t.start()
 
@@ -262,7 +271,7 @@ class CoordinatorTest(test.TestCase):
         threading.Thread(
             target=RaiseOnEvent,
             args=(coord, ev_1, None, RuntimeError("First"), True)),
-        ]
+    ]
     for t in threads:
       t.start()
 
@@ -274,7 +283,7 @@ class CoordinatorTest(test.TestCase):
         threading.Thread(
             target=RaiseOnEvent,
             args=(coord, ev_1, None, RuntimeError("Second"), True)),
-        ]
+    ]
     for t in threads:
       t.start()
     with self.assertRaisesRegexp(RuntimeError, "Second"):
@@ -337,24 +346,29 @@ class LooperTest(test.TestCase):
   def testTargetArgs(self):
     n = [3]
     coord = coordinator.Coordinator()
-    thread = coordinator.LooperThread.loop(coord, 0, target=_StopAt0,
-                                        args=(coord, n))
+    thread = coordinator.LooperThread.loop(
+        coord, 0, target=_StopAt0, args=(coord, n))
     coord.join([thread])
     self.assertEqual(0, n[0])
 
   def testTargetKwargs(self):
     n = [3]
     coord = coordinator.Coordinator()
-    thread = coordinator.LooperThread.loop(coord, 0, target=_StopAt0,
-                                        kwargs={"coord": coord, "n": n})
+    thread = coordinator.LooperThread.loop(
+        coord, 0, target=_StopAt0, kwargs={
+            "coord": coord,
+            "n": n
+        })
     coord.join([thread])
     self.assertEqual(0, n[0])
 
   def testTargetMixedArgs(self):
     n = [3]
     coord = coordinator.Coordinator()
-    thread = coordinator.LooperThread.loop(coord, 0, target=_StopAt0,
-                                        args=(coord,), kwargs={"n": n})
+    thread = coordinator.LooperThread.loop(
+        coord, 0, target=_StopAt0, args=(coord,), kwargs={
+            "n": n
+        })
     coord.join([thread])
     self.assertEqual(0, n[0])
 
diff --git a/tensorflow/python/training/device_setter.py b/tensorflow/python/training/device_setter.py
index 37ab625779f788b1b8e270a15db3244ea6f1bef3..689088bb41edfd94a1d483ed2b5f7447e9e060e7 100644
--- a/tensorflow/python/training/device_setter.py
+++ b/tensorflow/python/training/device_setter.py
@@ -23,6 +23,7 @@ from tensorflow.core.framework import node_def_pb2
 from tensorflow.python.framework import device as pydev
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import server_lib
+from tensorflow.python.util.tf_export import tf_export
 
 
 class _RoundRobinStrategy(object):
@@ -121,6 +122,7 @@ class _ReplicaDeviceChooser(object):
     return worker_device.to_string()
 
 
+@tf_export("train.replica_device_setter")
 def replica_device_setter(ps_tasks=0, ps_device="/job:ps",
                           worker_device="/job:worker", merge_devices=True,
                           cluster=None, ps_ops=None, ps_strategy=None):
diff --git a/tensorflow/python/training/ftrl.py b/tensorflow/python/training/ftrl.py
index c64a1b3f799e776c7bbbbcfb691bdd97e4a34466..9d02e694db15637126f37ee5575638908b351def 100644
--- a/tensorflow/python/training/ftrl.py
+++ b/tensorflow/python/training/ftrl.py
@@ -22,8 +22,10 @@ from tensorflow.python.framework import ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.training import optimizer
 from tensorflow.python.training import training_ops
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("train.FtrlOptimizer")
 class FtrlOptimizer(optimizer.Optimizer):
   """Optimizer that implements the FTRL algorithm.
 
@@ -265,4 +267,3 @@ class FtrlOptimizer(optimizer.Optimizer):
                         grad.dtype),
           math_ops.cast(self._learning_rate_power_tensor, grad.dtype),
           use_locking=self._use_locking)
-
diff --git a/tensorflow/python/training/gradient_descent.py b/tensorflow/python/training/gradient_descent.py
index 5a536e27297f054671e7e44a9e5d20a8b36580b7..380e14e02497fbe3681d6bae03fe9c636c5d13aa 100644
--- a/tensorflow/python/training/gradient_descent.py
+++ b/tensorflow/python/training/gradient_descent.py
@@ -23,8 +23,10 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.training import optimizer
 from tensorflow.python.training import training_ops
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("train.GradientDescentOptimizer")
 class GradientDescentOptimizer(optimizer.Optimizer):
   """Optimizer that implements the gradient descent algorithm.
   """
diff --git a/tensorflow/python/training/input.py b/tensorflow/python/training/input.py
index 331a51e8bc848917967fed06632fe0d1c5bcad9c..bd9985a7c5c181c0431e0c0a91186bc36b11c787 100644
--- a/tensorflow/python/training/input.py
+++ b/tensorflow/python/training/input.py
@@ -44,6 +44,7 @@ from tensorflow.python.ops import sparse_ops
 from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.summary import summary
 from tensorflow.python.training import queue_runner
+from tensorflow.python.util.tf_export import tf_export
 
 
 # pylint: disable=protected-access
@@ -53,9 +54,12 @@ _restore_sparse = sparse_ops._take_many_sparse_from_tensors_map
 # pylint: enable=protected-access
 
 
+@tf_export("train.match_filenames_once")
 def match_filenames_once(pattern, name=None):
   """Save the list of files matching pattern, so it is only computed once.
 
+  NOTE: The order of the files returned can be non-deterministic.
+
   Args:
     pattern: A file pattern (glob), or 1D tensor of file patterns.
     name: A name for the operations (optional).
@@ -70,6 +74,7 @@ def match_filenames_once(pattern, name=None):
         collections=[ops.GraphKeys.LOCAL_VARIABLES])
 
 
+@tf_export("train.limit_epochs")
 def limit_epochs(tensor, num_epochs=None, name=None):
   """Returns tensor `num_epochs` times and then raises an `OutOfRange` error.
 
@@ -102,6 +107,7 @@ def limit_epochs(tensor, num_epochs=None, name=None):
       return array_ops.identity(tensor, name=name)
 
 
+@tf_export("train.input_producer")
 def input_producer(input_tensor,
                    element_shape=None,
                    num_epochs=None,
@@ -184,6 +190,7 @@ def input_producer(input_tensor,
     return q
 
 
+@tf_export("train.string_input_producer")
 def string_input_producer(string_tensor,
                           num_epochs=None,
                           shuffle=True,
@@ -253,6 +260,7 @@ def string_input_producer(string_tensor,
         cancel_op=cancel_op)
 
 
+@tf_export("train.range_input_producer")
 def range_input_producer(limit, num_epochs=None, shuffle=True, seed=None,
                          capacity=32, shared_name=None, name=None):
   """Produces the integers from 0 to limit-1 in a queue.
@@ -290,6 +298,7 @@ def range_input_producer(limit, num_epochs=None, shuffle=True, seed=None,
         shared_name, "fraction_of_%d_full" % capacity, name)
 
 
+@tf_export("train.slice_input_producer")
 def slice_input_producer(tensor_list, num_epochs=None, shuffle=True, seed=None,
                          capacity=32, shared_name=None, name=None):
   """Produces a slice of each `Tensor` in `tensor_list`.
@@ -885,6 +894,7 @@ def _shuffle_batch_join(tensors_list, batch_size, capacity,
 # Batching functions ----------------------------------------------------------
 
 
+@tf_export("train.batch")
 def batch(tensors, batch_size, num_threads=1, capacity=32,
           enqueue_many=False, shapes=None, dynamic_pad=False,
           allow_smaller_final_batch=False, shared_name=None, name=None):
@@ -979,6 +989,7 @@ def batch(tensors, batch_size, num_threads=1, capacity=32,
       name=name)
 
 
+@tf_export("train.maybe_batch")
 def maybe_batch(tensors, keep_input, batch_size, num_threads=1, capacity=32,
                 enqueue_many=False, shapes=None, dynamic_pad=False,
                 allow_smaller_final_batch=False, shared_name=None, name=None):
@@ -1031,6 +1042,7 @@ def maybe_batch(tensors, keep_input, batch_size, num_threads=1, capacity=32,
       name=name)
 
 
+@tf_export("train.batch_join")
 def batch_join(tensors_list, batch_size, capacity=32, enqueue_many=False,
                shapes=None, dynamic_pad=False, allow_smaller_final_batch=False,
                shared_name=None, name=None):
@@ -1136,6 +1148,7 @@ def batch_join(tensors_list, batch_size, capacity=32, enqueue_many=False,
       name=name)
 
 
+@tf_export("train.maybe_batch_join")
 def maybe_batch_join(tensors_list, keep_input, batch_size, capacity=32,
                      enqueue_many=False, shapes=None, dynamic_pad=False,
                      allow_smaller_final_batch=False, shared_name=None,
@@ -1188,6 +1201,7 @@ def maybe_batch_join(tensors_list, keep_input, batch_size, capacity=32,
       name=name)
 
 
+@tf_export("train.shuffle_batch")
 def shuffle_batch(tensors, batch_size, capacity, min_after_dequeue,
                   num_threads=1, seed=None, enqueue_many=False, shapes=None,
                   allow_smaller_final_batch=False, shared_name=None, name=None):
@@ -1287,6 +1301,7 @@ def shuffle_batch(tensors, batch_size, capacity, min_after_dequeue,
       name=name)
 
 
+@tf_export("train.maybe_shuffle_batch")
 def maybe_shuffle_batch(tensors, batch_size, capacity, min_after_dequeue,
                         keep_input, num_threads=1, seed=None,
                         enqueue_many=False, shapes=None,
@@ -1346,6 +1361,7 @@ def maybe_shuffle_batch(tensors, batch_size, capacity, min_after_dequeue,
       name=name)
 
 
+@tf_export("train.shuffle_batch_join")
 def shuffle_batch_join(tensors_list, batch_size, capacity,
                        min_after_dequeue, seed=None, enqueue_many=False,
                        shapes=None, allow_smaller_final_batch=False,
@@ -1439,6 +1455,7 @@ def shuffle_batch_join(tensors_list, batch_size, capacity,
       name=name)
 
 
+@tf_export("train.maybe_shuffle_batch_join")
 def maybe_shuffle_batch_join(tensors_list, batch_size, capacity,
                              min_after_dequeue, keep_input, seed=None,
                              enqueue_many=False, shapes=None,
diff --git a/tensorflow/python/training/learning_rate_decay.py b/tensorflow/python/training/learning_rate_decay.py
index 3ee49650e01bd31d7d34fe1e109599531626058c..10ab4c1137ff226d88902143d4f2281ad77de531 100644
--- a/tensorflow/python/training/learning_rate_decay.py
+++ b/tensorflow/python/training/learning_rate_decay.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Various learning rate decay functions."""
 from __future__ import absolute_import
 from __future__ import division
@@ -26,10 +25,16 @@ from tensorflow.python.framework import ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
+from tensorflow.python.util.tf_export import tf_export
 
 
-def exponential_decay(learning_rate, global_step, decay_steps, decay_rate,
-                      staircase=False, name=None):
+@tf_export("train.exponential_decay")
+def exponential_decay(learning_rate,
+                      global_step,
+                      decay_steps,
+                      decay_rate,
+                      staircase=False,
+                      name=None):
   """Applies exponential decay to the learning rate.
 
   When training a model, it is often recommended to lower the learning rate as
@@ -85,9 +90,9 @@ def exponential_decay(learning_rate, global_step, decay_steps, decay_rate,
   """
   if global_step is None:
     raise ValueError("global_step is required for exponential_decay.")
-  with ops.name_scope(name, "ExponentialDecay",
-                      [learning_rate, global_step,
-                       decay_steps, decay_rate]) as name:
+  with ops.name_scope(
+      name, "ExponentialDecay",
+      [learning_rate, global_step, decay_steps, decay_rate]) as name:
     learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate")
     dtype = learning_rate.dtype
     global_step = math_ops.cast(global_step, dtype)
@@ -96,10 +101,11 @@ def exponential_decay(learning_rate, global_step, decay_steps, decay_rate,
     p = global_step / decay_steps
     if staircase:
       p = math_ops.floor(p)
-    return math_ops.multiply(learning_rate, math_ops.pow(decay_rate, p),
-                             name=name)
+    return math_ops.multiply(
+        learning_rate, math_ops.pow(decay_rate, p), name=name)
 
 
+@tf_export("train.piecewise_constant")
 def piecewise_constant(x, boundaries, values, name=None):
   """Piecewise constant from boundaries and interval values.
 
@@ -156,15 +162,15 @@ def piecewise_constant(x, boundaries, values, name=None):
           boundaries[i] = b
         else:
           raise ValueError(
-              "Boundaries (%s) must have the same dtype as x (%s)." % (
-                  b.dtype.base_dtype, x.dtype.base_dtype))
+              "Boundaries (%s) must have the same dtype as x (%s)." %
+              (b.dtype.base_dtype, x.dtype.base_dtype))
     # TODO(rdipietro): Ensure that boundaries' elements are strictly increasing.
     values = ops.convert_n_to_tensor(values)
     for v in values[1:]:
       if v.dtype.base_dtype != values[0].dtype.base_dtype:
         raise ValueError(
-            "Values must have elements all with the same dtype (%s vs %s)." % (
-                values[0].dtype.base_dtype, v.dtype.base_dtype))
+            "Values must have elements all with the same dtype (%s vs %s)." %
+            (values[0].dtype.base_dtype, v.dtype.base_dtype))
     pred_fn_pairs = []
     pred_fn_pairs.append((x <= boundaries[0], lambda: values[0]))
     pred_fn_pairs.append((x > boundaries[-1], lambda: values[-1]))
@@ -179,9 +185,14 @@ def piecewise_constant(x, boundaries, values, name=None):
     return control_flow_ops.case(pred_fn_pairs, default, exclusive=True)
 
 
-def polynomial_decay(learning_rate, global_step, decay_steps,
-                     end_learning_rate=0.0001, power=1.0,
-                     cycle=False, name=None):
+@tf_export("train.polynomial_decay")
+def polynomial_decay(learning_rate,
+                     global_step,
+                     decay_steps,
+                     end_learning_rate=0.0001,
+                     power=1.0,
+                     cycle=False,
+                     name=None):
   """Applies a polynomial decay to the learning rate.
 
   It is commonly observed that a monotonically decreasing learning rate, whose
@@ -255,9 +266,10 @@ def polynomial_decay(learning_rate, global_step, decay_steps,
   """
   if global_step is None:
     raise ValueError("global_step is required for polynomial_decay.")
-  with ops.name_scope(name, "PolynomialDecay",
-                      [learning_rate, global_step,
-                       decay_steps, end_learning_rate, power]) as name:
+  with ops.name_scope(
+      name, "PolynomialDecay",
+      [learning_rate, global_step, decay_steps, end_learning_rate, power
+      ]) as name:
     learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate")
     dtype = learning_rate.dtype
     global_step = math_ops.cast(global_step, dtype)
@@ -267,23 +279,29 @@ def polynomial_decay(learning_rate, global_step, decay_steps,
     if cycle:
       # Find the first multiple of decay_steps that is bigger than global_step.
       # If global_step is zero set the multiplier to 1
-      multiplier = control_flow_ops.cond(math_ops.equal(global_step, 0),
-                                         lambda: 1.0,
-                                         lambda: math_ops.ceil(
-                                             global_step / decay_steps))
+      multiplier = control_flow_ops.cond(
+          math_ops.equal(global_step, 0), lambda: 1.0,
+          lambda: math_ops.ceil(global_step / decay_steps))
       decay_steps = math_ops.multiply(decay_steps, multiplier)
     else:
       # Make sure that the global_step used is not bigger than decay_steps.
       global_step = math_ops.minimum(global_step, decay_steps)
 
     p = math_ops.div(global_step, decay_steps)
-    return math_ops.add(math_ops.multiply(learning_rate - end_learning_rate,
-                                          math_ops.pow(1 - p, power)),
-                        end_learning_rate, name=name)
-
-
-def natural_exp_decay(learning_rate, global_step, decay_steps, decay_rate,
-                      staircase=False, name=None):
+    return math_ops.add(
+        math_ops.multiply(learning_rate - end_learning_rate,
+                          math_ops.pow(1 - p, power)),
+        end_learning_rate,
+        name=name)
+
+
+@tf_export("train.natural_exp_decay")
+def natural_exp_decay(learning_rate,
+                      global_step,
+                      decay_steps,
+                      decay_rate,
+                      staircase=False,
+                      name=None):
   """Applies natural exponential decay to the initial learning rate.
 
   When training a model, it is often recommended to lower the learning rate as
@@ -349,8 +367,13 @@ def natural_exp_decay(learning_rate, global_step, decay_steps, decay_rate,
     return math_ops.multiply(learning_rate, exponent, name=name)
 
 
-def inverse_time_decay(learning_rate, global_step, decay_steps, decay_rate,
-                       staircase=False, name=None):
+@tf_export("train.inverse_time_decay")
+def inverse_time_decay(learning_rate,
+                       global_step,
+                       decay_steps,
+                       decay_rate,
+                       staircase=False,
+                       name=None):
   """Applies inverse time decay to the initial learning rate.
 
   When training a model, it is often recommended to lower the learning rate as
@@ -362,13 +385,15 @@ def inverse_time_decay(learning_rate, global_step, decay_steps, decay_rate,
   The function returns the decayed learning rate.  It is computed as:
 
   ```python
-  decayed_learning_rate = learning_rate / (1 + decay_rate * global_step / decay_step)
+  decayed_learning_rate = learning_rate / (1 + decay_rate * global_step /
+  decay_step)
   ```
 
   or, if `staircase` is `True`, as:
 
   ```python
-  decayed_learning_rate = learning_rate / (1 + decay_rate * floor(global_step / decay_step))
+  decayed_learning_rate = learning_rate / (1 + decay_rate * floor(global_step /
+  decay_step))
   ```
 
   Example: decay 1/t with a rate of 0.5:
@@ -379,7 +404,8 @@ def inverse_time_decay(learning_rate, global_step, decay_steps, decay_rate,
   learning_rate = 0.1
   decay_steps = 1.0
   decay_rate = 0.5
-  learning_rate = tf.train.inverse_time_decay(learning_rate, global_step, decay_steps, decay_rate)
+  learning_rate = tf.train.inverse_time_decay(learning_rate, global_step,
+  decay_steps, decay_rate)
 
   # Passing global_step to minimize() will increment it at each step.
   learning_step = (
@@ -424,8 +450,8 @@ def inverse_time_decay(learning_rate, global_step, decay_steps, decay_rate,
     return math_ops.div(learning_rate, denom, name=name)
 
 
-def cosine_decay(learning_rate, global_step, decay_steps, alpha=0.0,
-                 name=None):
+@tf_export("train.cosine_decay")
+def cosine_decay(learning_rate, global_step, decay_steps, alpha=0.0, name=None):
   """Applies cosine decay to the learning rate.
 
   See [Loshchilov & Hutter, ICLR2016], SGDR: Stochastic Gradient Descent
@@ -484,8 +510,14 @@ def cosine_decay(learning_rate, global_step, decay_steps, alpha=0.0,
     return math_ops.multiply(learning_rate, decayed)
 
 
-def cosine_decay_restarts(learning_rate, global_step, first_decay_steps,
-                          t_mul=2.0, m_mul=1.0, alpha=0.0, name=None):
+@tf_export("train.cosine_decay_restarts")
+def cosine_decay_restarts(learning_rate,
+                          global_step,
+                          first_decay_steps,
+                          t_mul=2.0,
+                          m_mul=1.0,
+                          alpha=0.0,
+                          name=None):
   """Applies cosine decay with restarts to the learning rate.
 
   See [Loshchilov & Hutter, ICLR2016], SGDR: Stochastic Gradient Descent
@@ -532,10 +564,9 @@ def cosine_decay_restarts(learning_rate, global_step, first_decay_steps,
   """
   if global_step is None:
     raise ValueError("cosine decay restarts requires global_step")
-  with ops.name_scope(name, "SGDRDecay",
-                      [learning_rate, global_step]) as name:
-    learning_rate = ops.convert_to_tensor(learning_rate,
-                                          name="initial_learning_rate")
+  with ops.name_scope(name, "SGDRDecay", [learning_rate, global_step]) as name:
+    learning_rate = ops.convert_to_tensor(
+        learning_rate, name="initial_learning_rate")
     dtype = learning_rate.dtype
     global_step = math_ops.cast(global_step, dtype)
     first_decay_steps = math_ops.cast(first_decay_steps, dtype)
@@ -547,11 +578,12 @@ def cosine_decay_restarts(learning_rate, global_step, first_decay_steps,
 
     def compute_step(completed_fraction, geometric=False):
       if geometric:
-        i_restart = math_ops.floor(math_ops.log(1.0 - completed_fraction * (
-            1.0 - t_mul)) / math_ops.log(t_mul))
+        i_restart = math_ops.floor(
+            math_ops.log(1.0 - completed_fraction * (1.0 - t_mul)) /
+            math_ops.log(t_mul))
 
-        sum_r = (1.0 - t_mul ** i_restart) / (1.0 - t_mul)
-        completed_fraction = (completed_fraction - sum_r) / t_mul ** i_restart
+        sum_r = (1.0 - t_mul**i_restart) / (1.0 - t_mul)
+        completed_fraction = (completed_fraction - sum_r) / t_mul**i_restart
 
       else:
         i_restart = math_ops.floor(completed_fraction)
@@ -564,16 +596,21 @@ def cosine_decay_restarts(learning_rate, global_step, first_decay_steps,
         lambda: compute_step(completed_fraction, geometric=False),
         lambda: compute_step(completed_fraction, geometric=True))
 
-    m_fac = m_mul ** i_restart
-    cosine_decayed = 0.5 * m_fac * (1.0 + math_ops.cos(
-        constant_op.constant(math.pi) * completed_fraction))
+    m_fac = m_mul**i_restart
+    cosine_decayed = 0.5 * m_fac * (
+        1.0 + math_ops.cos(constant_op.constant(math.pi) * completed_fraction))
     decayed = (1 - alpha) * cosine_decayed + alpha
 
   return math_ops.multiply(learning_rate, decayed, name=name)
 
 
-def linear_cosine_decay(learning_rate, global_step, decay_steps,
-                        num_periods=0.5, alpha=0.0, beta=0.001,
+@tf_export("train.linear_cosine_decay")
+def linear_cosine_decay(learning_rate,
+                        global_step,
+                        decay_steps,
+                        num_periods=0.5,
+                        alpha=0.0,
+                        beta=0.001,
                         name=None):
   """Applies linear cosine decay to the learning rate.
 
@@ -651,9 +688,15 @@ def linear_cosine_decay(learning_rate, global_step, decay_steps,
     return math_ops.multiply(learning_rate, linear_cosine_decayed, name=name)
 
 
-def noisy_linear_cosine_decay(learning_rate, global_step, decay_steps,
-                              initial_variance=1.0, variance_decay=0.55,
-                              num_periods=0.5, alpha=0.0, beta=0.001,
+@tf_export("train.noisy_linear_cosine_decay")
+def noisy_linear_cosine_decay(learning_rate,
+                              global_step,
+                              decay_steps,
+                              initial_variance=1.0,
+                              variance_decay=0.55,
+                              num_periods=0.5,
+                              alpha=0.0,
+                              beta=0.001,
                               name=None):
   """Applies noisy linear cosine decay to the learning rate.
 
@@ -734,8 +777,8 @@ def noisy_linear_cosine_decay(learning_rate, global_step, decay_steps,
         math_ops.pow(1.0 + global_step, variance_decay))
     std = math_ops.sqrt(variance)
     noisy_linear_decayed = (
-        linear_decayed + random_ops.random_normal(
-            linear_decayed.shape, stddev=std))
+        linear_decayed +
+        random_ops.random_normal(linear_decayed.shape, stddev=std))
 
     completed_fraction = global_step / decay_steps
     fraction = 2.0 * num_periods * completed_fraction
diff --git a/tensorflow/python/training/momentum.py b/tensorflow/python/training/momentum.py
index cf9530d87c46783b517884610b644b076bef6807..bd9fa79d8feac68c149f787ee8501bdddb173d33 100644
--- a/tensorflow/python/training/momentum.py
+++ b/tensorflow/python/training/momentum.py
@@ -22,8 +22,10 @@ from tensorflow.python.framework import ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.training import optimizer
 from tensorflow.python.training import training_ops
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("train.MomentumOptimizer")
 class MomentumOptimizer(optimizer.Optimizer):
   """Optimizer that implements the Momentum algorithm.
 
diff --git a/tensorflow/python/training/monitored_session.py b/tensorflow/python/training/monitored_session.py
index fa3517db27be4581deb85f77f022406b8b30ec56..6c5c9e01a76d539b550420134b09090b89beed46 100644
--- a/tensorflow/python/training/monitored_session.py
+++ b/tensorflow/python/training/monitored_session.py
@@ -41,6 +41,7 @@ from tensorflow.python.training import queue_runner
 from tensorflow.python.training import saver as training_saver
 from tensorflow.python.training import session_manager as sm
 from tensorflow.python.training import session_run_hook
+from tensorflow.python.util.tf_export import tf_export
 
 
 # The list of exceptions that we should recover from. Exceptions not in this
@@ -52,6 +53,7 @@ _PREEMPTION_ERRORS = (errors.AbortedError, errors.UnavailableError)
 USE_DEFAULT = object()
 
 
+@tf_export('train.Scaffold')
 class Scaffold(object):
   """Structure to create or gather pieces commonly needed to train a model.
 
@@ -272,6 +274,7 @@ class Scaffold(object):
         resources.initialize_resources(resources.local_resources()))
 
 
+@tf_export('train.MonitoredTrainingSession')
 def MonitoredTrainingSession(master='',  # pylint: disable=invalid-name
                              is_chief=True,
                              checkpoint_dir=None,
@@ -381,6 +384,7 @@ def MonitoredTrainingSession(master='',  # pylint: disable=invalid-name
                           stop_grace_period_secs=stop_grace_period_secs)
 
 
+@tf_export('train.SessionCreator')
 class SessionCreator(object):
   """A factory for tf.Session."""
 
@@ -390,6 +394,7 @@ class SessionCreator(object):
         'create_session is not implemented for {}.'.format(self))
 
 
+@tf_export('train.ChiefSessionCreator')
 class ChiefSessionCreator(SessionCreator):
   """Creates a tf.Session for a chief."""
 
@@ -441,6 +446,7 @@ class ChiefSessionCreator(SessionCreator):
         init_fn=self._scaffold.init_fn)
 
 
+@tf_export('train.WorkerSessionCreator')
 class WorkerSessionCreator(SessionCreator):
   """Creates a tf.Session for a worker."""
 
@@ -706,6 +712,7 @@ class _MonitoredSession(object):
     return self._coordinated_creator.tf_sess
 
 
+@tf_export('train.MonitoredSession')
 class MonitoredSession(_MonitoredSession):
   """Session-like object that handles initialization, recovery and hooks.
 
@@ -788,6 +795,7 @@ class MonitoredSession(_MonitoredSession):
         stop_grace_period_secs=stop_grace_period_secs)
 
 
+@tf_export('train.SingularMonitoredSession')
 class SingularMonitoredSession(_MonitoredSession):
   """Session-like object that handles initialization, restoring, and hooks.
 
diff --git a/tensorflow/python/training/moving_averages.py b/tensorflow/python/training/moving_averages.py
index e34c759e894c86a103f0228163f7bae2ffc7fb61..2d89082ad75ff8b39575711bdbbc3f454f99a70d 100644
--- a/tensorflow/python/training/moving_averages.py
+++ b/tensorflow/python/training/moving_averages.py
@@ -26,6 +26,7 @@ from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.training import slot_creator
+from tensorflow.python.util.tf_export import tf_export
 
 
 # TODO(touts): switch to variables.Variable.
@@ -187,7 +188,7 @@ def _zero_debias(unbiased_var, value, decay):
   with variable_scope.variable_scope(
       unbiased_var.op.name, values=[unbiased_var, value, decay]) as scope:
     with ops.colocate_with(unbiased_var):
-      with ops.control_dependencies(None):
+      with ops.init_scope():
         biased_initializer = init_ops.zeros_initializer(
             dtype=unbiased_var.dtype)(unbiased_var.get_shape())
         local_step_initializer = init_ops.zeros_initializer()
@@ -230,6 +231,7 @@ def _zero_debias(unbiased_var, value, decay):
       return unbiased_ema_delta
 
 
+@tf_export("train.ExponentialMovingAverage")
 class ExponentialMovingAverage(object):
   """Maintains moving averages of variables by employing an exponential decay.
 
@@ -385,7 +387,7 @@ class ExponentialMovingAverage(object):
       # For variables: to lower communication bandwidth across devices we keep
       # the moving averages on the same device as the variables. For other
       # tensors, we rely on the existing device allocation mechanism.
-      with ops.control_dependencies(None):
+      with ops.init_scope():
         if isinstance(var, variables.Variable):
           avg = slot_creator.create_slot(var,
                                          var.initialized_value(),
diff --git a/tensorflow/python/training/optimizer.py b/tensorflow/python/training/optimizer.py
index 038469b1bac9d2fabce788340278ea165f2f9249..425dbd831367ca1d450bb23dc6ab56feb147c0b6 100644
--- a/tensorflow/python/training/optimizer.py
+++ b/tensorflow/python/training/optimizer.py
@@ -36,6 +36,7 @@ from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.training import slot_creator
 from tensorflow.python.util import nest
+from tensorflow.python.util.tf_export import tf_export
 
 
 def _get_variable_for(v):
@@ -187,6 +188,7 @@ def _get_processor(v):
   raise NotImplementedError("Trying to optimize unsupported type ", v)
 
 
+@tf_export("train.Optimizer")
 class Optimizer(object):
   """Base class for optimizers.
 
@@ -514,7 +516,7 @@ class Optimizer(object):
     if not var_list:
       raise ValueError("No gradients provided for any variable: %s." %
                        ([str(v) for _, _, v in converted_grads_and_vars],))
-    with ops.control_dependencies(None):
+    with ops.init_scope():
       self._create_slots([_get_variable_for(v) for v in var_list])
     update_ops = []
     with ops.name_scope(name, self._name) as name:
@@ -533,7 +535,15 @@ class Optimizer(object):
       else:
         with ops.control_dependencies([self._finish(update_ops, "update")]):
           with ops.colocate_with(global_step):
-            apply_updates = state_ops.assign_add(global_step, 1, name=name)
+            if isinstance(global_step, resource_variable_ops.ResourceVariable):
+              # TODO(apassos): the implicit read in assign_add is slow; consider
+              # making it less so.
+              apply_updates = resource_variable_ops.assign_add_variable_op(
+                  global_step.handle,
+                  ops.convert_to_tensor(1, dtype=global_step.dtype),
+                  name=name)
+            else:
+              apply_updates = state_ops.assign_add(global_step, 1, name=name)
 
       if context.in_graph_mode():
         if isinstance(apply_updates, ops.Tensor):
diff --git a/tensorflow/python/training/proximal_adagrad.py b/tensorflow/python/training/proximal_adagrad.py
index da31ab325d5e45e1943f554c45717cceb4dc638f..9bd677b8efcd447f74ec2a3cbe94d63eeb9a4dd1 100644
--- a/tensorflow/python/training/proximal_adagrad.py
+++ b/tensorflow/python/training/proximal_adagrad.py
@@ -23,8 +23,10 @@ from tensorflow.python.framework import ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.training import optimizer
 from tensorflow.python.training import training_ops
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("train.ProximalAdagradOptimizer")
 class ProximalAdagradOptimizer(optimizer.Optimizer):
   # pylint: disable=line-too-long
   """Optimizer that implements the Proximal Adagrad algorithm.
diff --git a/tensorflow/python/training/proximal_gradient_descent.py b/tensorflow/python/training/proximal_gradient_descent.py
index 53e9dc2ef2c86a20070fdbdc690b39d2c0e9df06..369b6cbb50e5c621737c095a24eeb473f3870534 100644
--- a/tensorflow/python/training/proximal_gradient_descent.py
+++ b/tensorflow/python/training/proximal_gradient_descent.py
@@ -24,8 +24,10 @@ from tensorflow.python.ops import math_ops
 # pylint: enable=unused-import
 from tensorflow.python.training import optimizer
 from tensorflow.python.training import training_ops
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("train.ProximalGradientDescentOptimizer")
 class ProximalGradientDescentOptimizer(optimizer.Optimizer):
   # pylint: disable=line-too-long
   """Optimizer that implements the proximal gradient descent algorithm.
diff --git a/tensorflow/python/training/queue_runner_impl.py b/tensorflow/python/training/queue_runner_impl.py
index 4e7c81d7b2913d71a23dcaa3751db2aaffdc67cf..07afba79abf4d636c9ec2d53bcf2641594a35733 100644
--- a/tensorflow/python/training/queue_runner_impl.py
+++ b/tensorflow/python/training/queue_runner_impl.py
@@ -27,8 +27,10 @@ from tensorflow.python.eager import context
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("train.queue_runner.QueueRunner", "train.QueueRunner")
 class QueueRunner(object):
   """Holds a list of enqueue operations for a queue, each to be run in a thread.
 
@@ -384,6 +386,7 @@ class QueueRunner(object):
                        import_scope=import_scope)
 
 
+@tf_export("train.queue_runner.add_queue_runner", "train.add_queue_runner")
 def add_queue_runner(qr, collection=ops.GraphKeys.QUEUE_RUNNERS):
   """Adds a `QueueRunner` to a collection in the graph.
 
@@ -402,6 +405,8 @@ def add_queue_runner(qr, collection=ops.GraphKeys.QUEUE_RUNNERS):
   ops.add_to_collection(collection, qr)
 
 
+@tf_export("train.queue_runner.start_queue_runners",
+           "train.start_queue_runners")
 def start_queue_runners(sess=None, coord=None, daemon=True, start=True,
                         collection=ops.GraphKeys.QUEUE_RUNNERS):
   """Starts all queue runners collected in the graph.
diff --git a/tensorflow/python/training/rmsprop.py b/tensorflow/python/training/rmsprop.py
index ebec725b7b98e9a078f5558af85355988e8aca67..89d1099a49fedf2cd2ae372cb9c5f7422d43acc2 100644
--- a/tensorflow/python/training/rmsprop.py
+++ b/tensorflow/python/training/rmsprop.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """One-line documentation for rmsprop module.
 
 rmsprop algorithm [tieleman2012rmsprop]
@@ -47,12 +46,15 @@ from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.training import optimizer
 from tensorflow.python.training import training_ops
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("train.RMSPropOptimizer")
 class RMSPropOptimizer(optimizer.Optimizer):
   """Optimizer that implements the RMSProp algorithm.
 
-  See the [paper](http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf).
+  See the
+  [paper](http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf).
   """
 
   def __init__(self,
@@ -113,13 +115,12 @@ class RMSPropOptimizer(optimizer.Optimizer):
       self._zeros_slot(v, "momentum", self._name)
 
   def _prepare(self):
-    self._learning_rate_tensor = ops.convert_to_tensor(self._learning_rate,
-                                                       name="learning_rate")
+    self._learning_rate_tensor = ops.convert_to_tensor(
+        self._learning_rate, name="learning_rate")
     self._decay_tensor = ops.convert_to_tensor(self._decay, name="decay")
-    self._momentum_tensor = ops.convert_to_tensor(self._momentum,
-                                                  name="momentum")
-    self._epsilon_tensor = ops.convert_to_tensor(self._epsilon,
-                                                 name="epsilon")
+    self._momentum_tensor = ops.convert_to_tensor(
+        self._momentum, name="momentum")
+    self._epsilon_tensor = ops.convert_to_tensor(self._epsilon, name="epsilon")
 
   def _apply_dense(self, grad, var):
     rms = self.get_slot(var, "rms")
diff --git a/tensorflow/python/training/saver.py b/tensorflow/python/training/saver.py
index 2c59b82ebe2264e56da1a3b977b27eba2ed6f494..764f8400122118b6abcbad25ce0555954e38d29d 100644
--- a/tensorflow/python/training/saver.py
+++ b/tensorflow/python/training/saver.py
@@ -53,6 +53,7 @@ from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import training_util
 from tensorflow.python.training.checkpoint_state_pb2 import CheckpointState
 from tensorflow.python.util import compat
+from tensorflow.python.util.tf_export import tf_export
 
 
 # Op names which identify variable reads which should be saved.
@@ -889,6 +890,7 @@ def _GetCheckpointFilename(save_dir, latest_filename):
   return os.path.join(save_dir, latest_filename)
 
 
+@tf_export("train.generate_checkpoint_state_proto")
 def generate_checkpoint_state_proto(save_dir,
                                     model_checkpoint_path,
                                     all_model_checkpoint_paths=None):
@@ -933,6 +935,7 @@ def generate_checkpoint_state_proto(save_dir,
   return coord_checkpoint_proto
 
 
+@tf_export("train.update_checkpoint_state")
 def update_checkpoint_state(save_dir,
                             model_checkpoint_path,
                             all_model_checkpoint_paths=None,
@@ -1025,6 +1028,7 @@ def _update_checkpoint_state(save_dir,
                                       text_format.MessageToString(ckpt))
 
 
+@tf_export("train.get_checkpoint_state")
 def get_checkpoint_state(checkpoint_dir, latest_filename=None):
   """Returns CheckpointState proto from the "checkpoint" file.
 
@@ -1082,6 +1086,7 @@ def get_checkpoint_state(checkpoint_dir, latest_filename=None):
   return ckpt
 
 
+@tf_export("train.Saver")
 class Saver(object):
   """Saves and restores variables.
 
@@ -1229,7 +1234,7 @@ class Saver(object):
         The `saver_def` proto should be the one returned by the
         `as_saver_def()` call of the `Saver` that was created for that `Graph`.
       builder: Optional `SaverBuilder` to use if a `saver_def` was not provided.
-        Defaults to `BaseSaverBuilder()`.
+        Defaults to `BulkSaverBuilder()`.
       defer_build: If `True`, defer adding the save and restore ops to the
         `build()` call. In that case `build()` should be called before
         finalizing the graph or using the saver.
@@ -1309,7 +1314,7 @@ class Saver(object):
 
     if not self.saver_def or context.in_eager_mode():
       if self._builder is None:
-        self._builder = BaseSaverBuilder(self._write_version)
+        self._builder = BulkSaverBuilder(self._write_version)
 
       if self._var_list is None:
         # pylint: disable=protected-access
@@ -1592,9 +1597,9 @@ class Saver(object):
         [Stripping Default-Valued Attributes](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md#stripping-default-valued-attributes).
 
     Returns:
-      A string: path prefix used for the checkpoint files.  If the saver is
-        sharded, this string ends with: '-?????-of-nnnnn' where 'nnnnn'
-        is the number of shards created.
+      A string: path prefix used for the checkpoint files. If checkpoint
+        format is V1 and the saver is sharded, this string ends with:
+         '-?????-of-nnnnn' where 'nnnnn' is the number of shards created.
       If the saver is empty, returns None.
 
     Raises:
@@ -1744,6 +1749,11 @@ class Saver(object):
       return
     if save_path is None:
       raise ValueError("Can't load save_path when it is None.")
+    if (os.path.isfile(save_path) and
+        self._write_version != saver_pb2.SaverDef.V1):
+      raise ValueError("The specified path: %s is a file."
+                       " Please specify only the path prefix"
+                       " to the checkpoint files." % save_path)
     logging.info("Restoring parameters from %s", save_path)
     if context.in_graph_mode():
       sess.run(self.saver_def.restore_op_name,
@@ -1783,6 +1793,7 @@ def _prefix_to_checkpoint_path(prefix, format_version):
   return prefix  # Just the data file.
 
 
+@tf_export("train.latest_checkpoint")
 def latest_checkpoint(checkpoint_dir, latest_filename=None):
   """Finds the filename of latest saved checkpoint file.
 
@@ -1812,6 +1823,7 @@ def latest_checkpoint(checkpoint_dir, latest_filename=None):
   return None
 
 
+@tf_export("train.import_meta_graph")
 def import_meta_graph(meta_graph_or_file, clear_devices=False,
                       import_scope=None, **kwargs):
   """Recreates a Graph saved in a `MetaGraphDef` proto.
@@ -1913,6 +1925,7 @@ def import_meta_graph(meta_graph_or_file, clear_devices=False,
       return None
 
 
+@tf_export("train.export_meta_graph")
 def export_meta_graph(filename=None,
                       meta_info_def=None,
                       graph_def=None,
@@ -1989,6 +2002,7 @@ def export_meta_graph(filename=None,
   return meta_graph_def
 
 
+@tf_export("train.checkpoint_exists")
 def checkpoint_exists(checkpoint_prefix):
   """Checks whether a V1 or V2 checkpoint exists with the specified prefix.
 
@@ -2013,6 +2027,7 @@ def checkpoint_exists(checkpoint_prefix):
     return False
 
 
+@tf_export("train.get_checkpoint_mtimes")
 def get_checkpoint_mtimes(checkpoint_prefixes):
   """Returns the mtimes (modification timestamps) of the checkpoints.
 
diff --git a/tensorflow/python/training/server_lib.py b/tensorflow/python/training/server_lib.py
index 29da67a30a58c1b8b8e172b2ccede340880fef58..2f421d1cc0a0190670082fabf4e25470c6a1723b 100644
--- a/tensorflow/python/training/server_lib.py
+++ b/tensorflow/python/training/server_lib.py
@@ -23,6 +23,7 @@ from tensorflow.core.protobuf import tensorflow_server_pb2
 from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.framework import errors
 from tensorflow.python.util import compat
+from tensorflow.python.util.tf_export import tf_export
 
 
 def _make_server_def(server_or_cluster_def, job_name, task_index, protocol,
@@ -92,6 +93,7 @@ def _make_server_def(server_or_cluster_def, job_name, task_index, protocol,
   return server_def
 
 
+@tf_export("train.Server")
 class Server(object):
   """An in-process TensorFlow server, for use in distributed training.
 
@@ -221,6 +223,7 @@ class Server(object):
                   start=start)
 
 
+@tf_export("train.ClusterSpec")
 class ClusterSpec(object):
   """Represents a cluster as a set of "tasks", organized into "jobs".
 
diff --git a/tensorflow/python/training/session_manager.py b/tensorflow/python/training/session_manager.py
index b396a1e7d0a06ec7b952ba2980e081e01e681d4d..360e02fb44c1062f71bb50449b9ef381510a9c69 100644
--- a/tensorflow/python/training/session_manager.py
+++ b/tensorflow/python/training/session_manager.py
@@ -25,6 +25,7 @@ from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import saver as saver_mod
+from tensorflow.python.util.tf_export import tf_export
 
 
 def _maybe_name(obj):
@@ -44,6 +45,7 @@ def _maybe_name(obj):
     return "<no name for %s>" % type(obj)
 
 
+@tf_export("train.SessionManager")
 class SessionManager(object):
   """Training helper that restores from checkpoint and creates session.
 
diff --git a/tensorflow/python/training/session_run_hook.py b/tensorflow/python/training/session_run_hook.py
index 5b023d8a2672af5d1fab1c2566b19fca738fd1f7..89f40300650f3b6cd1ae15d946640c9df91771e2 100644
--- a/tensorflow/python/training/session_run_hook.py
+++ b/tensorflow/python/training/session_run_hook.py
@@ -96,8 +96,10 @@ from __future__ import division
 from __future__ import print_function
 
 import collections
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("train.SessionRunHook")
 class SessionRunHook(object):
   """Hook to extend calls to MonitoredSession.run()."""
 
@@ -189,6 +191,7 @@ class SessionRunHook(object):
     pass
 
 
+@tf_export("train.SessionRunArgs")
 class SessionRunArgs(
     collections.namedtuple("SessionRunArgs",
                            ["fetches", "feed_dict", "options"])):
@@ -213,6 +216,7 @@ class SessionRunArgs(
     return super(SessionRunArgs, cls).__new__(cls, fetches, feed_dict, options)
 
 
+@tf_export("train.SessionRunContext")
 class SessionRunContext(object):
   """Provides information about the `session.run()` call being made.
 
@@ -264,6 +268,7 @@ class SessionRunContext(object):
     self._stop_requested = True
 
 
+@tf_export("train.SessionRunValues")
 class SessionRunValues(
     collections.namedtuple("SessionRunValues",
                            ["results", "options", "run_metadata"])):
diff --git a/tensorflow/python/training/supervisor.py b/tensorflow/python/training/supervisor.py
index e4514aaea223b6b254a7a72e11e6b70b576fd54b..d2ad34773e0615256c340826dcc312cc8a00dc23 100644
--- a/tensorflow/python/training/supervisor.py
+++ b/tensorflow/python/training/supervisor.py
@@ -37,8 +37,10 @@ from tensorflow.python.training import saver as saver_mod
 from tensorflow.python.training import session_manager as session_manager_mod
 from tensorflow.python.training import training_util
 from tensorflow.python.util import deprecation
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("train.Supervisor")
 class Supervisor(object):
   """A training helper that checkpoints models and computes summaries.
 
diff --git a/tensorflow/python/training/sync_replicas_optimizer.py b/tensorflow/python/training/sync_replicas_optimizer.py
index 47702fdad05d13015e0cbf7768129b0c53b6c14c..0c6cf910d1a01dc20b15fb1cd5dbb249fbb60ef5 100644
--- a/tensorflow/python/training/sync_replicas_optimizer.py
+++ b/tensorflow/python/training/sync_replicas_optimizer.py
@@ -31,6 +31,7 @@ from tensorflow.python.training import optimizer
 from tensorflow.python.training import queue_runner
 from tensorflow.python.training import session_manager
 from tensorflow.python.training import session_run_hook
+from tensorflow.python.util.tf_export import tf_export
 
 
 # Please note that the gradients from replicas are averaged instead of summed
@@ -38,6 +39,7 @@ from tensorflow.python.training import session_run_hook
 # rate according to the number of replicas. This change is introduced to be
 # consistent with how gradients are aggregated (averaged) within a batch in a
 # replica.
+@tf_export("train.SyncReplicasOptimizer")
 class SyncReplicasOptimizer(optimizer.Optimizer):
   """Class to synchronize, aggregate gradients and pass them to the optimizer.
 
diff --git a/tensorflow/python/training/training_util.py b/tensorflow/python/training/training_util.py
index 89a9e129328fe38da2ce497a7f26dc11446ea032..499f1feb2dbf8aee26314a43b0a000fb91a1c686 100644
--- a/tensorflow/python/training/training_util.py
+++ b/tensorflow/python/training/training_util.py
@@ -29,6 +29,7 @@ from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util.tf_export import tf_export
 
 
 # Picked a long key value to minimize the chance of collision with user defined
@@ -40,6 +41,7 @@ GLOBAL_STEP_READ_KEY = 'global_step_read_op_cache'
 write_graph = graph_io.write_graph
 
 
+@tf_export('train.global_step')
 def global_step(sess, global_step_tensor):
   """Small helper to get the global step.
 
@@ -67,6 +69,7 @@ def global_step(sess, global_step_tensor):
   return int(sess.run(global_step_tensor))
 
 
+@tf_export('train.get_global_step')
 def get_global_step(graph=None):
   """Get the global step tensor.
 
@@ -101,6 +104,7 @@ def get_global_step(graph=None):
   return global_step_tensor
 
 
+@tf_export('train.create_global_step')
 def create_global_step(graph=None):
   """Create global step tensor in graph.
 
@@ -139,6 +143,7 @@ def create_global_step(graph=None):
                      ops.GraphKeys.GLOBAL_STEP])
 
 
+@tf_export('train.get_or_create_global_step')
 def get_or_create_global_step(graph=None):
   """Returns and create (if necessary) the global step tensor.
 
@@ -156,6 +161,7 @@ def get_or_create_global_step(graph=None):
   return global_step_tensor
 
 
+@tf_export('train.assert_global_step')
 def assert_global_step(global_step_tensor):
   """Asserts `global_step_tensor` is a scalar int `Variable` or `Tensor`.
 
diff --git a/tensorflow/python/util/compat.py b/tensorflow/python/util/compat.py
index 270d96a3c7c831d8c06dd86199cf2dc5dfc43421..7e5f192b8f1ae5c86e463c7560553f2bcfd15995 100644
--- a/tensorflow/python/util/compat.py
+++ b/tensorflow/python/util/compat.py
@@ -41,8 +41,10 @@ import numpy as _np
 import six as _six
 
 from tensorflow.python.util.all_util import remove_undocumented
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('compat.as_bytes', 'compat.as_str')
 def as_bytes(bytes_or_text, encoding='utf-8'):
   """Converts either bytes or unicode to `bytes`, using utf-8 encoding for text.
 
@@ -65,6 +67,7 @@ def as_bytes(bytes_or_text, encoding='utf-8'):
                     (bytes_or_text,))
 
 
+@tf_export('compat.as_text')
 def as_text(bytes_or_text, encoding='utf-8'):
   """Returns the given argument as a unicode string.
 
@@ -93,6 +96,7 @@ else:
   as_str = as_text
 
 
+@tf_export('compat.as_str_any')
 def as_str_any(value):
   """Converts to `str` as `str(value)`, but use `as_str` for `bytes`.
 
@@ -125,11 +129,16 @@ def path_to_str(path):
 # Numpy 1.8 scalars don't inherit from numbers.Integral in Python 3, so we
 # need to check them specifically.  The same goes from Real and Complex.
 integral_types = (_numbers.Integral, _np.integer)
+tf_export('compat.integral_types').export_constant(__name__, 'integral_types')
 real_types = (_numbers.Real, _np.integer, _np.floating)
+tf_export('compat.real_types').export_constant(__name__, 'real_types')
 complex_types = (_numbers.Complex, _np.number)
+tf_export('compat.complex_types').export_constant(__name__, 'complex_types')
 
 # Either bytes or text.
 bytes_or_text_types = (bytes, _six.text_type)
+tf_export('compat.bytes_or_text_types').export_constant(__name__,
+                                                        'bytes_or_text_types')
 
 _allowed_symbols = [
     'as_str',
diff --git a/tensorflow/python/util/deprecation.py b/tensorflow/python/util/deprecation.py
index 8a66f0435a8cb3d689a6613e2fca5bab1c0a37e3..fbec8fd2d8edfcd503a3677583a0853872ec08bc 100644
--- a/tensorflow/python/util/deprecation.py
+++ b/tensorflow/python/util/deprecation.py
@@ -22,6 +22,7 @@ import collections
 import functools
 import re
 
+from tensorflow.python.eager import context
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import decorator_utils
 from tensorflow.python.util import tf_contextlib
@@ -38,13 +39,14 @@ _PRINTED_WARNING = {}
 
 def _add_deprecated_function_notice_to_docstring(doc, date, instructions):
   """Adds a deprecation notice to a docstring for deprecated functions."""
+  main_text = ['THIS FUNCTION IS DEPRECATED. It will be removed %s.' %
+               ('in a future version' if date is None else ('after %s' % date))]
+  if instructions:
+    main_text.append('Instructions for updating:')
   return decorator_utils.add_notice_to_docstring(
       doc, instructions,
       'DEPRECATED FUNCTION',
-      '(deprecated)', [
-          'THIS FUNCTION IS DEPRECATED. It will be removed %s.' % (
-              'in a future version' if date is None else ('after %s' % date)),
-          'Instructions for updating:'])
+      '(deprecated)', main_text)
 
 
 def _add_deprecated_arg_notice_to_docstring(doc, date, instructions):
@@ -66,23 +68,135 @@ def _validate_deprecation_args(date, instructions):
     raise ValueError('Don\'t deprecate things without conversion instructions!')
 
 
-def _call_location():
+def _call_location(outer=False):
   """Returns call location given level up from current call."""
   frame = tf_inspect.currentframe()
   if frame:
     # CPython internals are available, use them for performance.
     # walk back two frames to get to deprecated function caller.
-    first_frame = frame.f_back
-    second_frame = first_frame.f_back
-    frame = second_frame if second_frame else first_frame
+    frame = frame.f_back
+    if frame.f_back:
+      frame = frame.f_back
+    if outer and frame.f_back:
+      frame = frame.f_back
     return '%s:%d' % (frame.f_code.co_filename, frame.f_lineno)
   else:
     # Slow fallback path
     stack = tf_inspect.stack(0)  # 0 avoids generating unused context
-    entry = stack[2]
+    entry = stack[3 if outer else 2]
     return '%s:%d' % (entry[1], entry[2])
 
 
+def deprecated_alias(deprecated_name, name, func_or_class, warn_once=True):
+  """Deprecate a symbol in favor of a new name with identical semantics.
+
+  This function is meant to be used when defining a backwards-compatibility
+  alias for a symbol which has been moved. For example:
+
+  module1.py:
+  ```python
+  class NewNameForClass: pass
+  ```
+
+  module2.py:
+  ```python
+  import module1
+
+  DeprecatedNameForClass = deprecated_alias(
+    deprecated_name='module2.DeprecatedNameForClass',
+    name='module1.NewNameForClass',
+    module1.NewNameForClass)
+  ```
+
+  This function works for classes and functions.
+
+  For classes, it creates a new class which is functionally identical (it
+  inherits from the original, and overrides its constructor), but which prints
+  a deprecation warning when an instance is created. It also adds a deprecation
+  notice to the class' docstring.
+
+  For functions, it returns a function wrapped by `tf_decorator.make_decorator`.
+  That function prints a warning when used, and has a deprecation notice in its
+  docstring. This is more or less equivalent (the deprecation warning has
+  slightly different text) to writing:
+
+  ```python
+  @deprecated
+  def deprecated_alias(original_args):
+    real_function(original_args)
+  ```
+
+  Args:
+    deprecated_name: The name of the symbol that is being deprecated, to be used
+      in the warning message. This should be its fully qualified name to avoid
+      confusion.
+    name: The name of the symbol that is to be used instead of the deprecated
+      name. This should be a fully qualified name to avoid confusion.
+    func_or_class: The (non-deprecated) class or function for which a deprecated
+      alias should be created.
+    warn_once: If True (the default), only print a deprecation warning the first
+      time this function is used, or the class is instantiated.
+
+  Returns:
+    A wrapped version of `func_or_class` which prints a deprecation warning on
+    use and has a modified docstring.
+  """
+  if tf_inspect.isclass(func_or_class):
+
+    # Make a new class with __init__ wrapped in a warning.
+    class NewClass(func_or_class):  # pylint: disable=missing-docstring
+      __doc__ = decorator_utils.add_notice_to_docstring(
+          func_or_class.__doc__, 'Please use %s instead.' % name,
+          'DEPRECATED CLASS',
+          '(deprecated)', ['THIS CLASS IS DEPRECATED. '
+                           'It will be removed in a future version. '])
+      __name__ = func_or_class.__name__
+      __module__ = _call_location(outer=True)
+
+      def __init__(self, *args, **kwargs):
+        if hasattr(NewClass.__init__, '__func__'):
+          # Python 2
+          NewClass.__init__.__func__.__doc__ = func_or_class.__init__.__doc__
+        else:
+          # Python 3
+          NewClass.__init__.__doc__ = func_or_class.__init__.__doc__
+
+        if _PRINT_DEPRECATION_WARNINGS:
+          # We're making the alias as we speak. The original may have other
+          # aliases, so we cannot use it to check for whether it's already been
+          # warned about.
+          if NewClass.__init__ not in _PRINTED_WARNING:
+            if warn_once:
+              _PRINTED_WARNING[NewClass.__init__] = True
+            logging.warning(
+                'From %s: The name %s is deprecated. Please use %s instead.\n',
+                _call_location(), deprecated_name, name)
+        super(NewClass, self).__init__(*args, **kwargs)
+
+    return NewClass
+  else:
+    decorator_utils.validate_callable(func_or_class, 'deprecated')
+
+    # Make a wrapper for the original
+    @functools.wraps(func_or_class)
+    def new_func(*args, **kwargs):  # pylint: disable=missing-docstring
+      if _PRINT_DEPRECATION_WARNINGS:
+        # We're making the alias as we speak. The original may have other
+        # aliases, so we cannot use it to check for whether it's already been
+        # warned about.
+        if new_func not in _PRINTED_WARNING:
+          if warn_once:
+            _PRINTED_WARNING[new_func] = True
+          logging.warning(
+              'From %s: The name %s is deprecated. Please use %s instead.\n',
+              _call_location(), deprecated_name, name)
+      return func_or_class(*args, **kwargs)
+    return tf_decorator.make_decorator(
+        func_or_class, new_func, 'deprecated',
+        _add_deprecated_function_notice_to_docstring(
+            func_or_class.__doc__, None, 'Please use %s instead.' % name))
+
+
 def deprecated(date, instructions, warn_once=True):
   """Decorator for marking functions or methods deprecated.
 
@@ -284,7 +398,9 @@ def deprecated_args(date, instructions, *deprecated_arg_names_or_tuples,
     @functools.wraps(func)
     def new_func(*args, **kwargs):
       """Deprecation wrapper."""
-      if _PRINT_DEPRECATION_WARNINGS:
+      # TODO(apassos) figure out a way to have reasonable performance with
+      # deprecation warnings and eager mode.
+      if context.in_graph_mode() and _PRINT_DEPRECATION_WARNINGS:
         invalid_args = []
         named_args = tf_inspect.getcallargs(func, *args, **kwargs)
         for arg_name, spec in iter(deprecated_positions.items()):
diff --git a/tensorflow/python/util/deprecation_test.py b/tensorflow/python/util/deprecation_test.py
index e61edb5cfa3f8f7676b8a77d787781abdd80f310..bdd0bc48d29319914e184ea4331a5e9d4a1c3328 100644
--- a/tensorflow/python/util/deprecation_test.py
+++ b/tensorflow/python/util/deprecation_test.py
@@ -24,6 +24,56 @@ from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import deprecation
 
 
+class DeprecatedAliasTest(test.TestCase):
+
+  @test.mock.patch.object(logging, "warning", autospec=True)
+  def test_function_alias(self, mock_warning):
+    deprecated_func = deprecation.deprecated_alias("deprecated.func",
+                                                   "real.func",
+                                                   logging.error)
+
+    logging.error("fake error logged")
+    self.assertEqual(0, mock_warning.call_count)
+    deprecated_func("FAKE ERROR!")
+    self.assertEqual(1, mock_warning.call_count)
+    # Make sure the error points to the right file.
+    self.assertRegexpMatches(mock_warning.call_args[0][1],
+                             r"deprecation_test\.py:")
+    deprecated_func("ANOTHER FAKE ERROR!")
+    self.assertEqual(1, mock_warning.call_count)
+
+  @test.mock.patch.object(logging, "warning", autospec=True)
+  def test_class_alias(self, mock_warning):
+    class MyClass(object):
+      """My docstring."""
+
+      init_args = []
+
+      def __init__(self, arg):
+        MyClass.init_args.append(arg)
+
+    deprecated_cls = deprecation.deprecated_alias("deprecated.cls",
+                                                  "real.cls",
+                                                  MyClass)
+
+    print(deprecated_cls.__name__)
+    print(deprecated_cls.__module__)
+    print(deprecated_cls.__doc__)
+
+    MyClass("test")
+    self.assertEqual(0, mock_warning.call_count)
+    deprecated_cls("deprecated")
+    self.assertEqual(1, mock_warning.call_count)
+    # Make sure the error points to the right file.
+    self.assertRegexpMatches(mock_warning.call_args[0][1],
+                             r"deprecation_test\.py:")
+    deprecated_cls("deprecated again")
+    self.assertEqual(1, mock_warning.call_count)
+
+    self.assertEqual(["test", "deprecated", "deprecated again"],
+                     MyClass.init_args)
+
+
 class DeprecationTest(test.TestCase):
 
   @test.mock.patch.object(logging, "warning", autospec=True)
diff --git a/tensorflow/python/util/nest.py b/tensorflow/python/util/nest.py
index 874df3d1087e157f8bfcec12ba3495e341c14b7b..c8525ed42039e151f2b44c472690daf1b0727be7 100644
--- a/tensorflow/python/util/nest.py
+++ b/tensorflow/python/util/nest.py
@@ -532,8 +532,8 @@ def assert_shallow_structure(shallow_tree, input_tree, check_types=True):
             (list(_six.iterkeys(input_tree)),
              list(_six.iterkeys(shallow_tree))))
 
-      input_tree = list(_six.iteritems(input_tree))
-      shallow_tree = list(_six.iteritems(shallow_tree))
+      input_tree = list(sorted(_six.iteritems(input_tree)))
+      shallow_tree = list(sorted(_six.iteritems(shallow_tree)))
 
     for shallow_branch, input_branch in zip(shallow_tree, input_tree):
       assert_shallow_structure(shallow_branch, input_branch,
diff --git a/tensorflow/python/util/nest_test.py b/tensorflow/python/util/nest_test.py
index 6bec397db577c5be5847a701ccc92367dc008fc9..8aaf799fd05420d898a53a11d65e09f3a545e69d 100644
--- a/tensorflow/python/util/nest_test.py
+++ b/tensorflow/python/util/nest_test.py
@@ -425,6 +425,10 @@ class NestTest(test.TestCase):
     with self.assertRaisesRegexp(ValueError, expected_message):
       nest.assert_shallow_structure(inp_ab2, inp_ab1)
 
+    inp_ab = collections.OrderedDict([("a", 1), ("b", (2, 3))])
+    inp_ba = collections.OrderedDict([("b", (2, 3)), ("a", 1)])
+    nest.assert_shallow_structure(inp_ab, inp_ba)
+
   def testFlattenUpTo(self):
     # Shallow tree ends at scalar.
     input_tree = [[[2, 2], [3, 3]], [[4, 9], [5, 5]]]
diff --git a/tensorflow/stream_executor/cuda/cuda_diagnostics.cc b/tensorflow/stream_executor/cuda/cuda_diagnostics.cc
index f35542e18fdba2b92f12b950e432937d0a1ef577..933c103f524ef37f840c9e13b9e4024289e274c1 100644
--- a/tensorflow/stream_executor/cuda/cuda_diagnostics.cc
+++ b/tensorflow/stream_executor/cuda/cuda_diagnostics.cc
@@ -232,7 +232,7 @@ port::StatusOr<DriverVersion> Diagnostician::FindDsoVersion() {
       result = StringToDriverVersion(version);
     }
 #else
-#if !defined(PLATFORM_WINDOWS) && !defined(NVIDIA_TEGRA)
+#if !defined(PLATFORM_WINDOWS) && !defined(ANDROID_TEGRA)
   // Callback used when iterating through DSOs. Looks for the driver-interfacing
   // DSO and yields its version number into the callback data, when found.
   auto iterate_phdr =
diff --git a/tensorflow/stream_executor/dso_loader.cc b/tensorflow/stream_executor/dso_loader.cc
index 5210a81092b3023563baa7edbb657b630dfc819a..d71938634d6e6fe092d9a1e0861215bb101e824f 100644
--- a/tensorflow/stream_executor/dso_loader.cc
+++ b/tensorflow/stream_executor/dso_loader.cc
@@ -96,10 +96,18 @@ string GetCudnnVersion() { return TF_CUDNN_VERSION; }
 }
 
 /* static */ port::Status DsoLoader::GetLibcuptiDsoHandle(void** dso_handle) {
+#if defined(ANDROID_TEGRA)
+  // On Android devices the CUDA version number is not added to the library name.
+  return GetDsoHandle(FindDsoPath(port::Env::Default()->FormatLibraryFileName(
+                                      "cupti", ""),
+                                  GetCudaCuptiLibraryPath()),
+                      dso_handle);
+#else
   return GetDsoHandle(FindDsoPath(port::Env::Default()->FormatLibraryFileName(
                                       "cupti", GetCudaVersion()),
                                   GetCudaCuptiLibraryPath()),
                       dso_handle);
+#endif
 }
 
 static mutex& GetRpathMutex() {
diff --git a/tensorflow/stream_executor/executor_cache.cc b/tensorflow/stream_executor/executor_cache.cc
index a23d6a70ba237efb2a83f8f56975173015ba9a39..d1a8aae167455a7dc728999fbbaf1a119cf6a101 100644
--- a/tensorflow/stream_executor/executor_cache.cc
+++ b/tensorflow/stream_executor/executor_cache.cc
@@ -23,6 +23,14 @@ namespace gputools {
 port::StatusOr<StreamExecutor*> ExecutorCache::GetOrCreate(
     const StreamExecutorConfig& config,
     const std::function<ExecutorFactory>& factory) {
+  // In the fast path case, the cache already has an entry and we can just
+  // return after Get() which only takes a shared lock and not a unique lock.
+  // If we need to create, we take a unique lock on cache_.
+  auto fast_result = Get(config);
+  if (fast_result.ok()) {
+    return fast_result;
+  }
+
   Entry* entry = nullptr;
   {
     mutex_lock lock{mutex_};
@@ -59,12 +67,17 @@ port::StatusOr<StreamExecutor*> ExecutorCache::Get(
     const StreamExecutorConfig& config) {
   Entry* entry = nullptr;
   {
-    mutex_lock lock{mutex_};
-    entry = &cache_[config.ordinal];
-    // Release the map lock; the address of 'entry' is stable because
-    // std::map guarantees reference stability.
+    tf_shared_lock lock{mutex_};
+    auto it = cache_.find(config.ordinal);
+    if (it != cache_.end()) {
+      entry = &it->second;
+    } else {
+      return port::Status(port::error::NOT_FOUND,
+                          port::Printf("No executors registered for ordinal %d",
+                                       config.ordinal));
+    }
   }
-  mutex_lock lock{entry->configurations_mutex};
+  tf_shared_lock lock{entry->configurations_mutex};
   if (entry->configurations.empty()) {
     return port::Status(
         port::error::NOT_FOUND,
diff --git a/tensorflow/stream_executor/multi_platform_manager.cc b/tensorflow/stream_executor/multi_platform_manager.cc
index cc32a6beaa5f83d6883b02682c14327b735a1caa..f23224ae772b9c5915426feaef1155fc9711f075 100644
--- a/tensorflow/stream_executor/multi_platform_manager.cc
+++ b/tensorflow/stream_executor/multi_platform_manager.cc
@@ -45,7 +45,7 @@ namespace gputools {
 
 /* static */ port::StatusOr<Platform*> MultiPlatformManager::PlatformWithName(
     const string& target) {
-  mutex_lock lock(GetPlatformsMutex());
+  tf_shared_lock lock(GetPlatformsMutex());
   auto it = GetPlatformMap()->find(port::Lowercase(target));
 
   if (it == GetPlatformMap()->end()) {
@@ -59,7 +59,7 @@ namespace gputools {
 
 /* static */ port::StatusOr<Platform*> MultiPlatformManager::PlatformWithId(
     const Platform::Id& id) {
-  mutex_lock lock(GetPlatformsMutex());
+  tf_shared_lock lock(GetPlatformsMutex());
   auto it = GetPlatformByIdMap()->find(id);
   if (it == GetPlatformByIdMap()->end()) {
     return port::Status(
diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl
index f32d4561550c0ff60511047c87821dffe736c935..23d11c88ed687d919bd605af2f83f1ef77209370 100644
--- a/tensorflow/tensorflow.bzl
+++ b/tensorflow/tensorflow.bzl
@@ -10,6 +10,10 @@ load(
     "tf_additional_xla_deps_py",
     "if_static",
 )
+load(
+    "@local_config_tensorrt//:build_defs.bzl",
+    "if_tensorrt",
+)
 load(
     "@local_config_cuda//cuda:build_defs.bzl",
     "if_cuda",
@@ -197,6 +201,7 @@ def tf_copts(android_optimization_level_override="-O2", is_external=False):
           "-fno-exceptions",
           "-ftemplate-depth=900"])
       + if_cuda(["-DGOOGLE_CUDA=1"])
+      + if_tensorrt(["-DGOOGLE_TENSORRT=1"])
       + if_mkl(["-DINTEL_MKL=1", "-DEIGEN_USE_VML", "-fopenmp",])
       + if_android_arm(["-mfpu=neon"])
       + if_linux_x86_64(["-msse3"])
@@ -866,9 +871,11 @@ def tf_cuda_library(deps=None, cuda_deps=None, copts=tf_copts(), **kwargs):
 
   When the library is built with --config=cuda:
 
-  - both deps and cuda_deps are used as dependencies
-  - the cuda runtime is added as a dependency (if necessary)
-  - The library additionally passes -DGOOGLE_CUDA=1 to the list of copts
+  - Both deps and cuda_deps are used as dependencies.
+  - The cuda runtime is added as a dependency (if necessary).
+  - The library additionally passes -DGOOGLE_CUDA=1 to the list of copts.
+  - In addition, when the library is also built with TensorRT enabled, it
+      additionally passes -DGOOGLE_TENSORRT=1 to the list of copts.
 
   Args:
   - cuda_deps: BUILD dependencies which will be linked if and only if:
@@ -887,7 +894,8 @@ def tf_cuda_library(deps=None, cuda_deps=None, copts=tf_copts(), **kwargs):
           clean_dep("//tensorflow/core:cuda"),
           "@local_config_cuda//cuda:cuda_headers"
       ]),
-      copts=copts + if_cuda(["-DGOOGLE_CUDA=1"]) + if_mkl(["-DINTEL_MKL=1"]),
+      copts=(copts + if_cuda(["-DGOOGLE_CUDA=1"]) + if_mkl(["-DINTEL_MKL=1"]) +
+             if_tensorrt(["-DGOOGLE_TENSORRT=1"])),
       **kwargs)
 
 register_extension_info(
diff --git a/tensorflow/tools/api/generator/BUILD b/tensorflow/tools/api/generator/BUILD
index d11031639592aa1d3e6ce1c7f09c2f0679b29854..66bbd572a673e3ef2da9abc75348e4f70e0cea47 100644
--- a/tensorflow/tools/api/generator/BUILD
+++ b/tensorflow/tools/api/generator/BUILD
@@ -77,6 +77,16 @@ genrule(
         "api/nn/rnn_cell/__init__.py",
         "api/sets/__init__.py",
         "api/summary/__init__.py",
+        "api/train/queue_runner/__init__.py",
+        "api/compat/__init__.py",
+        "api/data/__init__.py",
+        "api/estimator/__init__.py",
+        "api/estimator/export/__init__.py",
+        "api/estimator/inputs/__init__.py",
+        "api/feature_column/__init__.py",
+        "api/losses/__init__.py",
+        "api/profiler/__init__.py",
+        "api/python_io/__init__.py",
     ],
     cmd = "$(location create_python_api) $(OUTS)",
     tools = ["create_python_api"],
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-baseline-classifier.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-baseline-classifier.pbtxt
index ab697b1b95b15e3ac7974e7092f1d5934b088bb6..874a73f661d782ff5637b751f104fd2209734599 100644
--- a/tensorflow/tools/api/golden/tensorflow.estimator.-baseline-classifier.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.-baseline-classifier.pbtxt
@@ -21,7 +21,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'model_dir\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'optimizer\', \'config\'], varargs=None, keywords=None, defaults=[\'None\', \'2\', \'None\', \'None\', \'Ftrl\', \'None\'], "
+    argspec: "args=[\'self\', \'model_dir\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'optimizer\', \'config\', \'loss_reduction\'], varargs=None, keywords=None, defaults=[\'None\', \'2\', \'None\', \'None\', \'Ftrl\', \'None\', \'weighted_sum\'], "
   }
   member_method {
     name: "evaluate"
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-baseline-regressor.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-baseline-regressor.pbtxt
index b73f6433e226f6b570b68c6a419c53d5c808d9d6..8da2a2b6867a3f9a3d82fcdb76ac4a62d5cee825 100644
--- a/tensorflow/tools/api/golden/tensorflow.estimator.-baseline-regressor.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.-baseline-regressor.pbtxt
@@ -21,7 +21,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'model_dir\', \'label_dimension\', \'weight_column\', \'optimizer\', \'config\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'Ftrl\', \'None\'], "
+    argspec: "args=[\'self\', \'model_dir\', \'label_dimension\', \'weight_column\', \'optimizer\', \'config\', \'loss_reduction\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'Ftrl\', \'None\', \'weighted_sum\'], "
   }
   member_method {
     name: "evaluate"
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-estimator-spec.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-estimator-spec.pbtxt
index dbcc187f94509e3c9265d59cb76d0cdd01bd2333..aa6ac46613fbead7457b19e1aae5f2532afddef1 100644
--- a/tensorflow/tools/api/golden/tensorflow.estimator.-estimator-spec.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.-estimator-spec.pbtxt
@@ -23,6 +23,10 @@ tf_class {
     name: "mode"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "prediction_hooks"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "predictions"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-estimator.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-estimator.pbtxt
index d0bf043754b60240c507fe34b21b0599b94b69e2..76f527f796e95f342eb144ae3de87ff234338021 100644
--- a/tensorflow/tools/api/golden/tensorflow.estimator.-estimator.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.-estimator.pbtxt
@@ -20,7 +20,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'model_fn\', \'model_dir\', \'config\', \'params\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'model_fn\', \'model_dir\', \'config\', \'params\', \'warm_start_from\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "evaluate"
diff --git a/tensorflow/tools/api/golden/tensorflow.image.pbtxt b/tensorflow/tools/api/golden/tensorflow.image.pbtxt
index 441621a2a0cdc341fa00d8fdf06c6ad5be6c9bd2..baedf596e8fbce921ed7e0570542b8a11655dba4 100644
--- a/tensorflow/tools/api/golden/tensorflow.image.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.image.pbtxt
@@ -168,6 +168,14 @@ tf_module {
     name: "rgb_to_hsv"
     argspec: "args=[\'images\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "rgb_to_yiq"
+    argspec: "args=[\'images\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "rgb_to_yuv"
+    argspec: "args=[\'images\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "rot90"
     argspec: "args=[\'image\', \'k\', \'name\'], varargs=None, keywords=None, defaults=[\'1\', \'None\'], "
@@ -184,4 +192,12 @@ tf_module {
     name: "transpose_image"
     argspec: "args=[\'image\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "yiq_to_rgb"
+    argspec: "args=[\'images\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "yuv_to_rgb"
+    argspec: "args=[\'images\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.-model.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.-model.pbtxt
index 7fe3e2db09c45f26283d0da01d313405a97d0e54..2bf584fa2936990b467b2da9c48620a31814691a 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.-model.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.-model.pbtxt
@@ -160,15 +160,15 @@ tf_class {
   }
   member_method {
     name: "evaluate_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'10\', \'1\', \'False\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\'], "
   }
   member_method {
     name: "fit"
-    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\'], "
   }
   member_method {
     name: "fit_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'1\', \'1\', \'None\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'1\', \'None\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
   }
   member_method {
     name: "from_config"
@@ -228,7 +228,7 @@ tf_class {
   }
   member_method {
     name: "predict_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
   }
   member_method {
     name: "predict_on_batch"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.applications.densenet.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.applications.densenet.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..42cb91445059873d9a4ed32d609129de203a764f
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.keras.applications.densenet.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.keras.applications.densenet"
+tf_module {
+  member_method {
+    name: "DenseNet121"
+    argspec: "args=[\'include_top\', \'weights\', \'input_tensor\', \'input_shape\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'True\', \'imagenet\', \'None\', \'None\', \'None\', \'1000\'], "
+  }
+  member_method {
+    name: "DenseNet169"
+    argspec: "args=[\'include_top\', \'weights\', \'input_tensor\', \'input_shape\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'True\', \'imagenet\', \'None\', \'None\', \'None\', \'1000\'], "
+  }
+  member_method {
+    name: "DenseNet201"
+    argspec: "args=[\'include_top\', \'weights\', \'input_tensor\', \'input_shape\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'True\', \'imagenet\', \'None\', \'None\', \'None\', \'1000\'], "
+  }
+  member_method {
+    name: "decode_predictions"
+    argspec: "args=[\'preds\', \'top\'], varargs=None, keywords=None, defaults=[\'5\'], "
+  }
+  member_method {
+    name: "preprocess_input"
+    argspec: "args=[\'x\', \'data_format\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.applications.nasnet.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.applications.nasnet.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cd75b87540533680d096853ae8645da132dd119a
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.keras.applications.nasnet.pbtxt
@@ -0,0 +1,19 @@
+path: "tensorflow.keras.applications.nasnet"
+tf_module {
+  member_method {
+    name: "NASNetLarge"
+    argspec: "args=[\'input_shape\', \'include_top\', \'weights\', \'input_tensor\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'None\', \'True\', \'imagenet\', \'None\', \'None\', \'1000\'], "
+  }
+  member_method {
+    name: "NASNetMobile"
+    argspec: "args=[\'input_shape\', \'include_top\', \'weights\', \'input_tensor\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'None\', \'True\', \'imagenet\', \'None\', \'None\', \'1000\'], "
+  }
+  member_method {
+    name: "decode_predictions"
+    argspec: "args=[\'preds\', \'top\'], varargs=None, keywords=None, defaults=[\'5\'], "
+  }
+  member_method {
+    name: "preprocess_input"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.applications.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.applications.pbtxt
index daeb5aad419156a19f929fdd455f6c208cd7390f..9fc086eb8e17ef368b38e8d51f0ac8bf0562ca4f 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.applications.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.applications.pbtxt
@@ -1,5 +1,9 @@
 path: "tensorflow.keras.applications"
 tf_module {
+  member {
+    name: "densenet"
+    mtype: "<type \'module\'>"
+  }
   member {
     name: "inception_resnet_v2"
     mtype: "<type \'module\'>"
@@ -12,6 +16,10 @@ tf_module {
     name: "mobilenet"
     mtype: "<type \'module\'>"
   }
+  member {
+    name: "nasnet"
+    mtype: "<type \'module\'>"
+  }
   member {
     name: "resnet50"
     mtype: "<type \'module\'>"
@@ -28,6 +36,18 @@ tf_module {
     name: "xception"
     mtype: "<type \'module\'>"
   }
+  member_method {
+    name: "DenseNet121"
+    argspec: "args=[\'include_top\', \'weights\', \'input_tensor\', \'input_shape\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'True\', \'imagenet\', \'None\', \'None\', \'None\', \'1000\'], "
+  }
+  member_method {
+    name: "DenseNet169"
+    argspec: "args=[\'include_top\', \'weights\', \'input_tensor\', \'input_shape\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'True\', \'imagenet\', \'None\', \'None\', \'None\', \'1000\'], "
+  }
+  member_method {
+    name: "DenseNet201"
+    argspec: "args=[\'include_top\', \'weights\', \'input_tensor\', \'input_shape\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'True\', \'imagenet\', \'None\', \'None\', \'None\', \'1000\'], "
+  }
   member_method {
     name: "InceptionResNetV2"
     argspec: "args=[\'include_top\', \'weights\', \'input_tensor\', \'input_shape\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'True\', \'imagenet\', \'None\', \'None\', \'None\', \'1000\'], "
@@ -40,6 +60,14 @@ tf_module {
     name: "MobileNet"
     argspec: "args=[\'input_shape\', \'alpha\', \'depth_multiplier\', \'dropout\', \'include_top\', \'weights\', \'input_tensor\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'None\', \'1.0\', \'1\', \'0.001\', \'True\', \'imagenet\', \'None\', \'None\', \'1000\'], "
   }
+  member_method {
+    name: "NASNetLarge"
+    argspec: "args=[\'input_shape\', \'include_top\', \'weights\', \'input_tensor\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'None\', \'True\', \'imagenet\', \'None\', \'None\', \'1000\'], "
+  }
+  member_method {
+    name: "NASNetMobile"
+    argspec: "args=[\'input_shape\', \'include_top\', \'weights\', \'input_tensor\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'None\', \'True\', \'imagenet\', \'None\', \'None\', \'1000\'], "
+  }
   member_method {
     name: "ResNet50"
     argspec: "args=[\'include_top\', \'weights\', \'input_tensor\', \'input_shape\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'True\', \'imagenet\', \'None\', \'None\', \'None\', \'1000\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.backend.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.backend.pbtxt
index 44fbe0f7a04e8573a5348d626854e3b5834381dd..ba2d083a755384d4ec2076ac0dea580a1a878f1d 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.backend.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.backend.pbtxt
@@ -398,7 +398,7 @@ tf_module {
   }
   member_method {
     name: "rnn"
-    argspec: "args=[\'step_function\', \'inputs\', \'initial_states\', \'go_backwards\', \'mask\', \'constants\', \'unroll\'], varargs=None, keywords=None, defaults=[\'False\', \'None\', \'None\', \'False\'], "
+    argspec: "args=[\'step_function\', \'inputs\', \'initial_states\', \'go_backwards\', \'mask\', \'constants\', \'unroll\', \'input_length\'], varargs=None, keywords=None, defaults=[\'False\', \'None\', \'None\', \'False\', \'None\'], "
   }
   member_method {
     name: "round"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-learning-rate-scheduler.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-learning-rate-scheduler.pbtxt
index 8719c07ca385d2794e5c7e77f75d6d2bc734b7cb..d4c85a4519eb922629f107ef7b61c3f11cb27163 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-learning-rate-scheduler.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-learning-rate-scheduler.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'schedule\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'schedule\', \'verbose\'], varargs=None, keywords=None, defaults=[\'0\'], "
   }
   member_method {
     name: "on_batch_begin"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.datasets.boston_housing.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.datasets.boston_housing.pbtxt
index ef08f9b20f4c95f3692a03be7f4220f20aae9a58..bda31751d429ca0d0544402e5c496a0597e1849e 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.datasets.boston_housing.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.datasets.boston_housing.pbtxt
@@ -2,6 +2,6 @@ path: "tensorflow.keras.datasets.boston_housing"
 tf_module {
   member_method {
     name: "load_data"
-    argspec: "args=[\'path\', \'seed\', \'test_split\'], varargs=None, keywords=None, defaults=[\'boston_housing.npz\', \'113\', \'0.2\'], "
+    argspec: "args=[\'path\', \'test_split\', \'seed\'], varargs=None, keywords=None, defaults=[\'boston_housing.npz\', \'0.2\', \'113\'], "
   }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.datasets.imdb.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.datasets.imdb.pbtxt
index 8b1c17e9da13a76dcc2c09f3c01a0375bf0cb9fe..ff962876b66cae013de5d711dc7eac5d5c80d8c3 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.datasets.imdb.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.datasets.imdb.pbtxt
@@ -6,6 +6,6 @@ tf_module {
   }
   member_method {
     name: "load_data"
-    argspec: "args=[\'path\', \'num_words\', \'skip_top\', \'maxlen\', \'seed\', \'start_char\', \'oov_char\', \'index_from\'], varargs=None, keywords=None, defaults=[\'imdb.npz\', \'None\', \'0\', \'None\', \'113\', \'1\', \'2\', \'3\'], "
+    argspec: "args=[\'path\', \'num_words\', \'skip_top\', \'maxlen\', \'seed\', \'start_char\', \'oov_char\', \'index_from\'], varargs=None, keywords=kwargs, defaults=[\'imdb.npz\', \'None\', \'0\', \'None\', \'113\', \'1\', \'2\', \'3\'], "
   }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.datasets.reuters.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.datasets.reuters.pbtxt
index 6b3ed1e9af0ea7ab4fa83c07c520adf6727a93ac..2da4a13067f2b39eb06304864ea626002300a862 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.datasets.reuters.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.datasets.reuters.pbtxt
@@ -6,6 +6,6 @@ tf_module {
   }
   member_method {
     name: "load_data"
-    argspec: "args=[\'path\', \'num_words\', \'skip_top\', \'maxlen\', \'test_split\', \'seed\', \'start_char\', \'oov_char\', \'index_from\'], varargs=None, keywords=None, defaults=[\'reuters.npz\', \'None\', \'0\', \'None\', \'0.2\', \'113\', \'1\', \'2\', \'3\'], "
+    argspec: "args=[\'path\', \'num_words\', \'skip_top\', \'maxlen\', \'test_split\', \'seed\', \'start_char\', \'oov_char\', \'index_from\'], varargs=None, keywords=kwargs, defaults=[\'reuters.npz\', \'None\', \'0\', \'None\', \'0.2\', \'113\', \'1\', \'2\', \'3\'], "
   }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-add.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-add.pbtxt
index a32151e22fab59e999c1e916e5c628d2e1b3f5ee..770a107b664d7ab0a8aedf292a34d4258a201859 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-add.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-add.pbtxt
@@ -115,7 +115,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
@@ -127,7 +127,7 @@ tf_class {
   }
   member_method {
     name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "count_params"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-alpha-dropout.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-alpha-dropout.pbtxt
index 46b1713196fdd2470aefa6227dd19cdbf93185b9..0ce42b706ec20a8ea1cc83ec95cb64d9be2e5710 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-alpha-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-alpha-dropout.pbtxt
@@ -126,7 +126,7 @@ tf_class {
   }
   member_method {
     name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "count_params"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-average.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-average.pbtxt
index 9bfaf2756284c7d287895e8d0b22d96ff1fa1627..b371ad148cee16dd243869d929e0c1c002794682 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-average.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-average.pbtxt
@@ -115,7 +115,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
@@ -127,7 +127,7 @@ tf_class {
   }
   member_method {
     name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "count_params"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-bidirectional.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-bidirectional.pbtxt
index 2b8ac4f1f4857eb437bc3d67cd68989d3c6842f7..2f5e65a0c58eb82c43b013f9c2fbffa4e813c4d2 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-bidirectional.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-bidirectional.pbtxt
@@ -123,7 +123,7 @@ tf_class {
   }
   member_method {
     name: "call"
-    argspec: "args=[\'self\', \'inputs\', \'training\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'self\', \'inputs\', \'training\', \'mask\', \'initial_state\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "compute_mask"
@@ -131,7 +131,7 @@ tf_class {
   }
   member_method {
     name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "count_params"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-concatenate.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-concatenate.pbtxt
index c9a0b887258de2d6b5aa88280b1f7b0d3bf7f6e2..ff08def0a08e5201bc01d61be3f2d66d712c384b 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-concatenate.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-concatenate.pbtxt
@@ -115,7 +115,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
@@ -127,7 +127,7 @@ tf_class {
   }
   member_method {
     name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "count_params"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
index b847e224d6baeb11135c51ee270f2daa2d52f8a4..6db22ca0320519fd9c101456c9c9c0e26a9a11e0 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
@@ -116,7 +116,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
@@ -128,7 +128,7 @@ tf_class {
   }
   member_method {
     name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "count_params"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv3-d-transpose.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
index d898c546278188ca84a94660d9dc0c7be03e0b24..11e05f884d781166616a9c9a61dacbc8fdae6ae3 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
@@ -1,6 +1,7 @@
 path: "tensorflow.keras.layers.Conv3DTranspose"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv3DTranspose\'>"
+  is_instance: "<class \'tensorflow.python.layers.convolutional.Conv3DTranspose\'>"
   is_instance: "<class \'tensorflow.python.layers.convolutional.Conv3D\'>"
   is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
index a7001bbe34f899bdba6c49f7d2d1c7d9becc1313..58724a1e1661609ef3c000c7ca1dfe9b3235acff 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
@@ -1,6 +1,7 @@
 path: "tensorflow.keras.layers.Convolution3DTranspose"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv3DTranspose\'>"
+  is_instance: "<class \'tensorflow.python.layers.convolutional.Conv3DTranspose\'>"
   is_instance: "<class \'tensorflow.python.layers.convolutional.Conv3D\'>"
   is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-dot.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-dot.pbtxt
index 86578d958e151b47b892b3ada0dbc745d32dbe59..07d3f023e54105c606b198c05750ffa78ee5d0c8 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-dot.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-dot.pbtxt
@@ -115,7 +115,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
@@ -127,7 +127,7 @@ tf_class {
   }
   member_method {
     name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "count_params"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-e-l-u.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-e-l-u.pbtxt
index 348012dcde3407dad74ea3f56842e3182098b632..92b9760d53e35d3e5066a730bb5cbda45492cc64 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-e-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-e-l-u.pbtxt
@@ -126,7 +126,7 @@ tf_class {
   }
   member_method {
     name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "count_params"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-embedding.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-embedding.pbtxt
index 0419251083f63cbd57244e76f35aee74db434eab..83c528b40117222ac2b3e85ad338459948d0aa8c 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-embedding.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-embedding.pbtxt
@@ -114,7 +114,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
@@ -126,7 +126,7 @@ tf_class {
   }
   member_method {
     name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "count_params"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u-cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u-cell.pbtxt
index 337e85e812d8ef19e873dd49d39108ff3d452bbb..b329f1c46bb07ab7684dec6aaf45a20b98c27ed9 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u-cell.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u-cell.pbtxt
@@ -114,7 +114,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u.pbtxt
index 1357dc0f0d6455b18bef0dabe08639e0dee1ab49..d0f6d2a14f936c47ef78f7bf819c13facea2efcb 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u.pbtxt
@@ -183,7 +183,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
@@ -195,7 +195,7 @@ tf_class {
   }
   member_method {
     name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "count_params"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-gaussian-dropout.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-gaussian-dropout.pbtxt
index b71a08f6c3b5e62970ba90c1d27dde5a4067e3e6..57596badf1881950270fa6d3c074afb65daaa8eb 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-gaussian-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-gaussian-dropout.pbtxt
@@ -126,7 +126,7 @@ tf_class {
   }
   member_method {
     name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "count_params"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-gaussian-noise.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-gaussian-noise.pbtxt
index a01a6067efb3ce217b603da5ea9c2c17c51c8ef7..3829353cc3c195a750ad862707c5c8563e203fba 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-gaussian-noise.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-gaussian-noise.pbtxt
@@ -126,7 +126,7 @@ tf_class {
   }
   member_method {
     name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "count_params"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
index 0dbbdf283836e4121c925200749784abdeb0a5a8..3b171b137af699c9608494a17c5651b439fe4545 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
@@ -114,7 +114,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m.pbtxt
index 964ef89c2e2abdf8b6f7dc3893751f56dd380e90..0036d6805bb67fe20a1373927f15d8f564bdbe1d 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m.pbtxt
@@ -187,7 +187,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
@@ -199,7 +199,7 @@ tf_class {
   }
   member_method {
     name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "count_params"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-leaky-re-l-u.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
index 6a7b23c5409914396f2ce10fcb593a1ca8d65c9e..8134fb738683b79764662d9ea7f721fe04751162 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
@@ -126,7 +126,7 @@ tf_class {
   }
   member_method {
     name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "count_params"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-locally-connected1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-locally-connected1-d.pbtxt
index 324745e5a33de47ed91f1b5c037445ee01780ba3..c5d452300947d7f74e7458e2a04bfdfabb1c1da2 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-locally-connected1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-locally-connected1-d.pbtxt
@@ -114,7 +114,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
@@ -126,7 +126,7 @@ tf_class {
   }
   member_method {
     name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "count_params"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-locally-connected2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-locally-connected2-d.pbtxt
index e12ae0505440c31068f0ac132adfd675b93e0593..bcbed9241b525a953c8b499197facaefebe8cc44 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-locally-connected2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-locally-connected2-d.pbtxt
@@ -114,7 +114,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
@@ -126,7 +126,7 @@ tf_class {
   }
   member_method {
     name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "count_params"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-maximum.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-maximum.pbtxt
index 9e889ca8637f759d495092c9bc6862005e5e8f23..ff0db15f190675d533c50c277eb1cb60e0b95e55 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-maximum.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-maximum.pbtxt
@@ -115,7 +115,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
@@ -127,7 +127,7 @@ tf_class {
   }
   member_method {
     name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "count_params"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-multiply.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-multiply.pbtxt
index 932680941d269660533e93077818c4884c6e28c4..1d3f33f04516345ee32f16befe0d7200d2cdad00 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-multiply.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-multiply.pbtxt
@@ -115,7 +115,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
@@ -127,7 +127,7 @@ tf_class {
   }
   member_method {
     name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "count_params"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-p-re-l-u.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-p-re-l-u.pbtxt
index db644f958f5d781c2dcc5bbbca52e3b656230510..c86bc49b22a8cc3e004a77f4a21594aacb2c665a 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-p-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-p-re-l-u.pbtxt
@@ -114,7 +114,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
@@ -126,7 +126,7 @@ tf_class {
   }
   member_method {
     name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "count_params"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-r-n-n.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-r-n-n.pbtxt
index 74fa1db02076f5a5cdc1feb412ea2ce5095e326d..b29f65d79dc0ffca176a8f2905df9474841f572e 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-r-n-n.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-r-n-n.pbtxt
@@ -94,7 +94,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'cell\', \'return_sequences\', \'return_state\', \'go_backwards\', \'stateful\', \'unroll\', \'activity_regularizer\'], varargs=None, keywords=kwargs, defaults=[\'False\', \'False\', \'False\', \'False\', \'False\', \'None\'], "
+    argspec: "args=[\'self\', \'cell\', \'return_sequences\', \'return_state\', \'go_backwards\', \'stateful\', \'unroll\'], varargs=None, keywords=kwargs, defaults=[\'False\', \'False\', \'False\', \'False\', \'False\'], "
   }
   member_method {
     name: "add_loss"
@@ -118,7 +118,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
@@ -130,7 +130,7 @@ tf_class {
   }
   member_method {
     name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "count_params"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-conv1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-conv1-d.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..dd67b76523cc50409516e29f963f59d039455bfd
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-conv1-d.pbtxt
@@ -0,0 +1,186 @@
+path: "tensorflow.keras.layers.SeparableConv1D"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.SeparableConv1D\'>"
+  is_instance: "<class \'tensorflow.python.layers.convolutional.SeparableConv1D\'>"
+  is_instance: "<class \'tensorflow.python.layers.convolutional._SeparableConv\'>"
+  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
+  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "graph"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "scope_name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'depth_multiplier\', \'activation\', \'use_bias\', \'depthwise_initializer\', \'pointwise_initializer\', \'bias_initializer\', \'depthwise_regularizer\', \'pointwise_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'depthwise_constraint\', \'pointwise_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'1\', \'valid\', \'None\', \'1\', \'1\', \'None\', \'True\', \'glorot_uniform\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-convolution1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bf62c095e7cc3fbeac95919a0f9fdc545efd3d25
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
@@ -0,0 +1,186 @@
+path: "tensorflow.keras.layers.SeparableConvolution1D"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.SeparableConv1D\'>"
+  is_instance: "<class \'tensorflow.python.layers.convolutional.SeparableConv1D\'>"
+  is_instance: "<class \'tensorflow.python.layers.convolutional._SeparableConv\'>"
+  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
+  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "graph"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "scope_name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'depth_multiplier\', \'activation\', \'use_bias\', \'depthwise_initializer\', \'pointwise_initializer\', \'bias_initializer\', \'depthwise_regularizer\', \'pointwise_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'depthwise_constraint\', \'pointwise_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'1\', \'valid\', \'None\', \'1\', \'1\', \'None\', \'True\', \'glorot_uniform\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
index 3414810db44da6ff0e3f77b1a5db24329de7a88a..6e3cde3e3eaba4f9985411d66a220f7cdd4ee7ad 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
@@ -114,7 +114,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n.pbtxt
index cf34034ef0abf36c0e7ff18ee8adcc8aeaeae5eb..b875898a8196e0359297f77cb10c1d8177f05d45 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n.pbtxt
@@ -175,7 +175,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
@@ -187,7 +187,7 @@ tf_class {
   }
   member_method {
     name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "count_params"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-softmax.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-softmax.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ee4b2fa39ed34a544ee800e9370e4f34c4a17041
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-softmax.pbtxt
@@ -0,0 +1,183 @@
+path: "tensorflow.keras.layers.Softmax"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.advanced_activations.Softmax\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
+  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "graph"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "scope_name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'axis\'], varargs=None, keywords=kwargs, defaults=[\'-1\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
index b76499658da58c178728246b3199391ca064fa3e..db9f90caef7d6192ec7b68ddffd784a83ec0ac69 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
@@ -118,7 +118,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
index 2376d815a6400034a51e3d17f98a030209356cf3..ef31c5443efa0c0e5a7a2e0a422d2a9c9c49baaf 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
@@ -126,7 +126,7 @@ tf_class {
   }
   member_method {
     name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "count_params"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.pbtxt
index fe336c4be5a84a3764b550ca5ad2fcd1d3b85b94..088c8e88e26f59f2753733252882f5e0e8287fb6 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.pbtxt
@@ -292,10 +292,18 @@ tf_module {
     name: "Reshape"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "SeparableConv1D"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "SeparableConv2D"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "SeparableConvolution1D"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "SeparableConvolution2D"
     mtype: "<type \'type\'>"
@@ -308,6 +316,10 @@ tf_module {
     name: "SimpleRNNCell"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "Softmax"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "SpatialDropout1D"
     mtype: "<type \'type\'>"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.models.-model.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.models.-model.pbtxt
index d239098b0b2bf37fea924ed52074385acf48de96..0b816b58631d12471c2e9db96fc5395796d96ddf 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.models.-model.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.models.-model.pbtxt
@@ -160,15 +160,15 @@ tf_class {
   }
   member_method {
     name: "evaluate_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'10\', \'1\', \'False\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\'], "
   }
   member_method {
     name: "fit"
-    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\'], "
   }
   member_method {
     name: "fit_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'1\', \'1\', \'None\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'1\', \'None\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
   }
   member_method {
     name: "from_config"
@@ -228,7 +228,7 @@ tf_class {
   }
   member_method {
     name: "predict_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
   }
   member_method {
     name: "predict_on_batch"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-adadelta.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-adadelta.pbtxt
index ed040c15864b4f4c386d2d9e1f664d35d651fa14..32667cf31e4aaacf3374ca4a434f32eec5b3e07e 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-adadelta.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-adadelta.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'lr\', \'rho\', \'epsilon\', \'decay\'], varargs=None, keywords=kwargs, defaults=[\'1.0\', \'0.95\', \'1e-08\', \'0.0\'], "
+    argspec: "args=[\'self\', \'lr\', \'rho\', \'epsilon\', \'decay\'], varargs=None, keywords=kwargs, defaults=[\'1.0\', \'0.95\', \'None\', \'0.0\'], "
   }
   member_method {
     name: "from_config"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-adagrad.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-adagrad.pbtxt
index a24651429a3db49a96b217259c5c6ef09efed2f2..efca59e8e427d28de36446a49ea4e1ca0bb385eb 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-adagrad.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-adagrad.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'lr\', \'epsilon\', \'decay\'], varargs=None, keywords=kwargs, defaults=[\'0.01\', \'1e-08\', \'0.0\'], "
+    argspec: "args=[\'self\', \'lr\', \'epsilon\', \'decay\'], varargs=None, keywords=kwargs, defaults=[\'0.01\', \'None\', \'0.0\'], "
   }
   member_method {
     name: "from_config"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-adam.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-adam.pbtxt
index a0d978fded3825bafcd8d60e34677029495b1245..5546e2067ab65abce928d609b41b65bbc40246f6 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-adam.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-adam.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'lr\', \'beta_1\', \'beta_2\', \'epsilon\', \'decay\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.9\', \'0.999\', \'1e-08\', \'0.0\'], "
+    argspec: "args=[\'self\', \'lr\', \'beta_1\', \'beta_2\', \'epsilon\', \'decay\', \'amsgrad\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.9\', \'0.999\', \'None\', \'0.0\', \'False\'], "
   }
   member_method {
     name: "from_config"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-adamax.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-adamax.pbtxt
index 1b70c93ad5f0a8fd52d65fb4b8132a87878c26dd..aaa54a106066266d0a7c19f4609e4cc7ed766d95 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-adamax.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-adamax.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'lr\', \'beta_1\', \'beta_2\', \'epsilon\', \'decay\'], varargs=None, keywords=kwargs, defaults=[\'0.002\', \'0.9\', \'0.999\', \'1e-08\', \'0.0\'], "
+    argspec: "args=[\'self\', \'lr\', \'beta_1\', \'beta_2\', \'epsilon\', \'decay\'], varargs=None, keywords=kwargs, defaults=[\'0.002\', \'0.9\', \'0.999\', \'None\', \'0.0\'], "
   }
   member_method {
     name: "from_config"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-nadam.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-nadam.pbtxt
index b49dbe5cf82ea838076134a0feecc120bfb88f84..1fada7fd9c6eefbb16f1b5a042e6fea607a461a9 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-nadam.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-nadam.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'lr\', \'beta_1\', \'beta_2\', \'epsilon\', \'schedule_decay\'], varargs=None, keywords=kwargs, defaults=[\'0.002\', \'0.9\', \'0.999\', \'1e-08\', \'0.004\'], "
+    argspec: "args=[\'self\', \'lr\', \'beta_1\', \'beta_2\', \'epsilon\', \'schedule_decay\'], varargs=None, keywords=kwargs, defaults=[\'0.002\', \'0.9\', \'0.999\', \'None\', \'0.004\'], "
   }
   member_method {
     name: "from_config"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-r-m-sprop.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-r-m-sprop.pbtxt
index c8860d80d40353211df65f08fda5deb26af91d66..fd3f97f35dcb18c82188c51345c2e3276a88f23f 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-r-m-sprop.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-r-m-sprop.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'lr\', \'rho\', \'epsilon\', \'decay\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.9\', \'1e-08\', \'0.0\'], "
+    argspec: "args=[\'self\', \'lr\', \'rho\', \'epsilon\', \'decay\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.9\', \'None\', \'0.0\'], "
   }
   member_method {
     name: "from_config"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.text.-tokenizer.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.text.-tokenizer.pbtxt
index 5bc8c4012049b0414936fb56a853fc32430df3d9..ce91caa1afe081ccf05ecdd4884a3e29ea93d496 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.text.-tokenizer.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.text.-tokenizer.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'num_words\', \'filters\', \'lower\', \'split\', \'char_level\'], varargs=None, keywords=None, defaults=[\'None\', \'!\"#$%&()*+,-./:;<=>?@[\\\\]^_`{|}~\\t\\n\', \'True\', \' \', \'False\'], "
+    argspec: "args=[\'self\', \'num_words\', \'filters\', \'lower\', \'split\', \'char_level\', \'oov_token\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'!\"#$%&()*+,-./:;<=>?@[\\\\]^_`{|}~\\t\\n\', \'True\', \' \', \'False\', \'None\'], "
   }
   member_method {
     name: "fit_on_sequences"
diff --git a/tensorflow/tools/api/golden/tensorflow.manip.pbtxt b/tensorflow/tools/api/golden/tensorflow.manip.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0b84165285102daf0a8e3dd6542bfc391e50f77b
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.manip.pbtxt
@@ -0,0 +1,7 @@
+path: "tensorflow.manip"
+tf_module {
+  member_method {
+    name: "roll"
+    argspec: "args=[\'input\', \'shift\', \'axis\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.pbtxt
index 8ce022e4549712bb13dedcd66481a3ad2a2db0e5..455590d866a4c1ebea65ccff51e34f2e0b0479d7 100644
--- a/tensorflow/tools/api/golden/tensorflow.nn.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.nn.pbtxt
@@ -262,7 +262,7 @@ tf_module {
   }
   member_method {
     name: "sampled_softmax_loss"
-    argspec: "args=[\'weights\', \'biases\', \'labels\', \'inputs\', \'num_sampled\', \'num_classes\', \'num_true\', \'sampled_values\', \'remove_accidental_hits\', \'partition_strategy\', \'name\'], varargs=None, keywords=None, defaults=[\'1\', \'None\', \'True\', \'mod\', \'sampled_softmax_loss\'], "
+    argspec: "args=[\'weights\', \'biases\', \'labels\', \'inputs\', \'num_sampled\', \'num_classes\', \'num_true\', \'sampled_values\', \'remove_accidental_hits\', \'partition_strategy\', \'name\', \'seed\'], varargs=None, keywords=None, defaults=[\'1\', \'None\', \'True\', \'mod\', \'sampled_softmax_loss\', \'None\'], "
   }
   member_method {
     name: "selu"
diff --git a/tensorflow/tools/api/golden/tensorflow.pbtxt b/tensorflow/tools/api/golden/tensorflow.pbtxt
index db1ed4218514ad51f28703c27598eada9464511e..e8890e9cc0a3c659b3f5f377136a2ca616d55993 100644
--- a/tensorflow/tools/api/golden/tensorflow.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.pbtxt
@@ -396,6 +396,10 @@ tf_module {
     name: "losses"
     mtype: "<type \'module\'>"
   }
+  member {
+    name: "manip"
+    mtype: "<type \'module\'>"
+  }
   member {
     name: "metrics"
     mtype: "<type \'module\'>"
@@ -2044,6 +2048,10 @@ tf_module {
     name: "unique_with_counts"
     argspec: "args=[\'x\', \'out_idx\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int32\'>\", \'None\'], "
   }
+  member_method {
+    name: "unravel_index"
+    argspec: "args=[\'indices\', \'dims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "unsorted_segment_max"
     argspec: "args=[\'data\', \'segment_ids\', \'num_segments\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/ci_build/ci_sanity.sh b/tensorflow/tools/ci_build/ci_sanity.sh
index aa341b144cb8ef3c9a13635c62a7ae1be90b0994..b3a8ff2ac720b204a58deefc88cf8bb73dc78aa3 100755
--- a/tensorflow/tools/ci_build/ci_sanity.sh
+++ b/tensorflow/tools/ci_build/ci_sanity.sh
@@ -177,7 +177,15 @@ do_pylint() {
   echo "pylint took $((PYLINT_END_TIME - PYLINT_START_TIME)) s"
   echo ""
 
-  grep -E '(\[E|\[W0311|\[W0312)' ${OUTPUT_FILE} > ${ERRORS_FILE}
+  # Report only what we care about
+  # Ref https://pylint.readthedocs.io/en/latest/technical_reference/features.html
+  # E: all errors
+  # W0311 bad-indentation
+  # W0312 mixed-indentation
+  # C0330 bad-continuation
+  # C0301 line-too-long
+  # C0326 bad-whitespace
+  grep -E '(\[E|\[W0311|\[W0312|\[C0330|\[C0301|\[C0326)' ${OUTPUT_FILE} > ${ERRORS_FILE}
 
   N_ERRORS=0
   while read -r LINE; do
@@ -313,7 +321,7 @@ do_external_licenses_check(){
   EXTRA_LICENSES_FILE="$(mktemp)_extra_licenses.log"
 
   echo "Getting external dependencies for ${BUILD_TARGET}"
- bazel query "attr('licenses', 'notice', deps(${BUILD_TARGET}))" --no_implicit_deps --no_host_deps --keep_going \
+ bazel query "attr('licenses', 'notice', deps(${BUILD_TARGET}))" --keep_going \
   | grep -E -v "^//tensorflow" \
   | sed -e 's|:.*||' \
   | sort \
@@ -322,7 +330,7 @@ do_external_licenses_check(){
 
   echo
   echo "Getting list of external licenses mentioned in ${LICENSES_TARGET}."
-  bazel query "deps(${LICENSES_TARGET})" --no_implicit_deps --no_host_deps --keep_going \
+  bazel query "deps(${LICENSES_TARGET})" --keep_going \
   | grep -E -v "^//tensorflow" \
   | sed -e 's|:.*||' \
   | sort \
@@ -336,6 +344,18 @@ do_external_licenses_check(){
 
   EXTERNAL_LICENSES_CHECK_END_TIME=$(date +'%s')
 
+  # Blacklist
+  echo ${MISSING_LICENSES_FILE}
+  grep -e "@bazel_tools//third_party/" -e "@com_google_absl//absl" -e "@org_tensorflow//" -v ${MISSING_LICENSES_FILE} > temp.txt
+  mv temp.txt ${MISSING_LICENSES_FILE}
+
+  # Whitelist
+  echo ${EXTRA_LICENSE_FILE}
+  grep -e "@bazel_tools//src/" -e "@bazel_tools//tools/" -e "@com_google_absl//" -e "//external" -e "@local" -v ${EXTRA_LICENSES_FILE} > temp.txt
+  mv temp.txt ${EXTRA_LICENSES_FILE}
+
+
+
   echo
   echo "do_external_licenses_check took $((EXTERNAL_LICENSES_CHECK_END_TIME - EXTERNAL_LICENSES_CHECK_START_TIME)) s"
   echo
@@ -509,9 +529,14 @@ do_check_futures_test() {
   python check_futures_test.py
 }
 
+do_check_file_name_test() {
+  cd "$ROOT_DIR/tensorflow/tools/test"
+  python file_name_test.py
+}
+
 # Supply all sanity step commands and descriptions
-SANITY_STEPS=("do_pylint PYTHON2" "do_pylint PYTHON3" "do_check_futures_test" "do_buildifier" "do_bazel_nobuild" "do_pip_package_licenses_check" "do_lib_package_licenses_check" "do_java_package_licenses_check" "do_pip_smoke_test" "do_check_load_py_test" "do_code_link_check" "do_cmake_python_sanity")
-SANITY_STEPS_DESC=("Python 2 pylint" "Python 3 pylint" "Check that python files have certain __future__ imports" "buildifier check" "bazel nobuild" "pip: license check for external dependencies" "C library: license check for external dependencies" "Java Native Library: license check for external dependencies" "Pip Smoke Test: Checking py_test dependencies exist in pip package" "Check load py_test: Check that BUILD files with py_test target properly load py_test" "Code Link Check: Check there are no broken links" "Test entries in /tensorflow/contrib/cmake/python_{modules|protos|protos_cc}.txt for validity and consistency")
+SANITY_STEPS=("do_pylint PYTHON2" "do_pylint PYTHON3" "do_check_futures_test" "do_buildifier" "do_bazel_nobuild" "do_pip_package_licenses_check" "do_lib_package_licenses_check" "do_java_package_licenses_check" "do_pip_smoke_test" "do_check_load_py_test" "do_code_link_check" "do_cmake_python_sanity" "do_check_file_name_test")
+SANITY_STEPS_DESC=("Python 2 pylint" "Python 3 pylint" "Check that python files have certain __future__ imports" "buildifier check" "bazel nobuild" "pip: license check for external dependencies" "C library: license check for external dependencies" "Java Native Library: license check for external dependencies" "Pip Smoke Test: Checking py_test dependencies exist in pip package" "Check load py_test: Check that BUILD files with py_test target properly load py_test" "Code Link Check: Check there are no broken links" "Test entries in /tensorflow/contrib/cmake/python_{modules|protos|protos_cc}.txt for validity and consistency" "Check file names for cases")
 
 INCREMENTAL_FLAG=""
 DEFAULT_BAZEL_CONFIGS="--config=hdfs --config=gcp"
diff --git a/tensorflow/tools/ci_build/pylintrc b/tensorflow/tools/ci_build/pylintrc
index e71017e621ccc8b42cdf8d4e4bd27a81791bbe4c..68fdb617166f70d2bddf0c472d23102960777de0 100644
--- a/tensorflow/tools/ci_build/pylintrc
+++ b/tensorflow/tools/ci_build/pylintrc
@@ -180,7 +180,17 @@ docstring-min-length=10
 max-line-length=80
 
 # Regexp for a line that is allowed to be longer than the limit.
-ignore-long-lines=^\s*(# )?<?https?://\S+>?$
+ignore-long-lines=(?x)
+  (^\s*(import|from)\s
+   |\$Id:\s\/\/depot\/.+#\d+\s\$
+   |^[a-zA-Z_][a-zA-Z0-9_]*\s*=\s*("[^"]\S+"|'[^']\S+')
+   |^\s*\#\ LINT\.ThenChange
+   |^[^#]*\#\ type:\ [a-zA-Z_][a-zA-Z0-9_.,[\] ]*$
+   |pylint
+   |"""
+   |\#
+   |lambda
+   |(https?|ftp):)
 
 # Allow the body of an if to be on the same line as the test if there is no
 # else.
diff --git a/tensorflow/tools/ci_build/windows/cpu/cmake/run_build.bat b/tensorflow/tools/ci_build/windows/cpu/cmake/run_build.bat
index 957729bb37db3ae49800c277f4090a52117c699d..c1bc71850754c5b4b42a6eb50be465ba8f98c218 100644
--- a/tensorflow/tools/ci_build/windows/cpu/cmake/run_build.bat
+++ b/tensorflow/tools/ci_build/windows/cpu/cmake/run_build.bat
@@ -36,7 +36,7 @@ SET CMAKE_DIR=%REPO_ROOT%\tensorflow\contrib\cmake
 SET MSBUILD_EXE="C:\Program Files (x86)\MSBuild\14.0\Bin\msbuild.exe"
 
 :: Run cmake to create Visual Studio Project files.
-%CMAKE_EXE% %CMAKE_DIR% -A x64 -DSWIG_EXECUTABLE=%SWIG_EXE% -DPYTHON_EXECUTABLE=%PY_EXE% -DCMAKE_BUILD_TYPE=Release -DPYTHON_LIBRARIES=%PY_LIB% -Dtensorflow_BUILD_PYTHON_TESTS=%BUILD_PYTHON_TESTS% -Dtensorflow_BUILD_CC_TESTS=%BUILD_CC_TESTS% -Dtensorflow_TF_NIGHTLY=%TF_NIGHTLY% -Dtensorflow_DISABLE_EIGEN_FORCEINLINE=%DISABLE_FORCEINLINE%
+%CMAKE_EXE% %CMAKE_DIR% -A x64 -DSWIG_EXECUTABLE=%SWIG_EXE% -DPYTHON_EXECUTABLE=%PY_EXE% -DCMAKE_BUILD_TYPE=Release -DPYTHON_LIBRARIES=%PY_LIB% -Dtensorflow_BUILD_PYTHON_TESTS=%BUILD_PYTHON_TESTS% -Dtensorflow_BUILD_CC_TESTS=%BUILD_CC_TESTS% -Dtensorflow_TF_NIGHTLY=%TF_NIGHTLY% -Dtensorflow_DISABLE_EIGEN_FORCEINLINE=%DISABLE_FORCEINLINE% -Dtensorflow_WIN_CPU_SIMD_OPTIONS=/arch:AVX
 
 :: Run msbuild in the resulting VS project files to build a pip package.
 %MSBUILD_EXE% /p:Configuration=Release /maxcpucount:32 tf_python_build_pip_package.vcxproj
diff --git a/tensorflow/tools/ci_build/windows/gpu/cmake/run_build.bat b/tensorflow/tools/ci_build/windows/gpu/cmake/run_build.bat
index 5a362de3992156fea8a5fc6ab4c70ba67ab47f89..b87e4a9bec41264827d415a11dfa6f23aeda725d 100644
--- a/tensorflow/tools/ci_build/windows/gpu/cmake/run_build.bat
+++ b/tensorflow/tools/ci_build/windows/gpu/cmake/run_build.bat
@@ -37,7 +37,7 @@ SET CMAKE_DIR=%REPO_ROOT%\tensorflow\contrib\cmake
 SET MSBUILD_EXE="C:\Program Files (x86)\MSBuild\14.0\Bin\msbuild.exe"
 
 :: Run cmake to create Visual Studio Project files.
-%CMAKE_EXE% %CMAKE_DIR% -A x64 -DSWIG_EXECUTABLE=%SWIG_EXE% -DPYTHON_EXECUTABLE=%PY_EXE% -DCMAKE_BUILD_TYPE=Release -DPYTHON_LIBRARIES=%PY_LIB% -Dtensorflow_BUILD_PYTHON_TESTS=%BUILD_PYTHON_TESTS% -Dtensorflow_BUILD_CC_TESTS=%BUILD_CC_TESTS% -Dtensorflow_ENABLE_GPU=ON -DCUDNN_HOME=%CUDNN_HOME% -Dtensorflow_TF_NIGHTLY=%TF_NIGHTLY% -Dtensorflow_DISABLE_EIGEN_FORCEINLINE=%DISABLE_FORCEINLINE%
+%CMAKE_EXE% %CMAKE_DIR% -A x64 -DSWIG_EXECUTABLE=%SWIG_EXE% -DPYTHON_EXECUTABLE=%PY_EXE% -DCMAKE_BUILD_TYPE=Release -DPYTHON_LIBRARIES=%PY_LIB% -Dtensorflow_BUILD_PYTHON_TESTS=%BUILD_PYTHON_TESTS% -Dtensorflow_BUILD_CC_TESTS=%BUILD_CC_TESTS% -Dtensorflow_ENABLE_GPU=ON -DCUDNN_HOME=%CUDNN_HOME% -Dtensorflow_TF_NIGHTLY=%TF_NIGHTLY% -Dtensorflow_DISABLE_EIGEN_FORCEINLINE=%DISABLE_FORCEINLINE% -Dtensorflow_WIN_CPU_SIMD_OPTIONS=/arch:AVX
 
 :: Run msbuild in the resulting VS project files to build a pip package.
 %MSBUILD_EXE% /p:Configuration=Release /maxcpucount:32 tf_python_build_pip_package.vcxproj
diff --git a/tensorflow/tools/ci_build/windows/libtensorflow_cpu.sh b/tensorflow/tools/ci_build/windows/libtensorflow_cpu.sh
index fa28e3d79ca4ee5f429a41dd3e871248d5c047ca..583d1d5f09527861015458c636af2259b34d45f8 100755
--- a/tensorflow/tools/ci_build/windows/libtensorflow_cpu.sh
+++ b/tensorflow/tools/ci_build/windows/libtensorflow_cpu.sh
@@ -41,7 +41,7 @@ run_configure_for_cpu_build
 # build_libtensorflow_tarball in ../builds/libtensorflow.sh
 # cannot be used on Windows since it relies on pkg_tar rules.
 # So we do something special here
-bazel build -c opt \
+bazel build -c opt --copt=/arch:AVX \
   tensorflow:libtensorflow.so \
   tensorflow/tools/lib_package:clicenses_generate \
   tensorflow/java:libtensorflow_jni.so \
diff --git a/tensorflow/tools/ci_build/windows/libtensorflow_gpu.sh b/tensorflow/tools/ci_build/windows/libtensorflow_gpu.sh
index 573c926203fc76b787ba08b10bd71c8effda29b6..94276c6c5c9ce897ca24f03efe3d93e1ea1e00c9 100644
--- a/tensorflow/tools/ci_build/windows/libtensorflow_gpu.sh
+++ b/tensorflow/tools/ci_build/windows/libtensorflow_gpu.sh
@@ -41,7 +41,7 @@ run_configure_for_gpu_build
 # build_libtensorflow_tarball in ../builds/libtensorflow.sh
 # cannot be used on Windows since it relies on pkg_tar rules.
 # So we do something special here
-bazel build -c opt \
+bazel build -c opt --copt=/arch:AVX \
   tensorflow:libtensorflow.so \
   tensorflow/tools/lib_package:clicenses_generate \
   tensorflow/java:libtensorflow_jni.so \
diff --git a/tensorflow/tools/compatibility/tf_upgrade.py b/tensorflow/tools/compatibility/tf_upgrade.py
index fa1cc739056e7d50ace73e9ca6645b5dc04621e5..6e90b286c99f894ddd25268afc69043759571c36 100644
--- a/tensorflow/tools/compatibility/tf_upgrade.py
+++ b/tensorflow/tools/compatibility/tf_upgrade.py
@@ -46,8 +46,9 @@ class APIChangeSpec(object):
   """
 
 
-class _FileEditTuple(collections.namedtuple(
-    "_FileEditTuple", ["comment", "line", "start", "old", "new"])):
+class _FileEditTuple(
+    collections.namedtuple("_FileEditTuple",
+                           ["comment", "line", "start", "old", "new"])):
   """Each edit that is recorded by a _FileEditRecorder.
 
   Fields:
@@ -179,8 +180,7 @@ class _ASTCallVisitor(ast.NodeVisitor):
     function_renames = self._api_change_spec.function_renames
     try:
       new_name = function_renames[full_name]
-      self._file_edit.add("Renamed function %r to %r" % (full_name,
-                                                         new_name),
+      self._file_edit.add("Renamed function %r to %r" % (full_name, new_name),
                           node.lineno, node.col_offset, full_name, new_name)
     except KeyError:
       pass
@@ -227,7 +227,7 @@ class _ASTCallVisitor(ast.NodeVisitor):
       # loop over lines
       while 1:
         # Reverse the text to and regular expression search for whitespace
-        text = self._lines[line-1]
+        text = self._lines[line - 1]
         reversed_preceding_text = text[:col][::-1]
         # First find if a [ can be found with only whitespace between it and
         # col.
@@ -236,8 +236,8 @@ class _ASTCallVisitor(ast.NodeVisitor):
           new_col_offset = col - m.start(1) - 1
           return line, new_col_offset
         else:
-          if (reversed_preceding_text=="" or
-             reversed_preceding_text.isspace()):
+          if (reversed_preceding_text == "" or
+              reversed_preceding_text.isspace()):
             line = line - 1
             prev_line = self._lines[line - 1]
             # TODO(aselle):
@@ -248,8 +248,8 @@ class _ASTCallVisitor(ast.NodeVisitor):
             # node ranges to filter out spurious #'s that appear in string
             # literals.
             comment_start = prev_line.find("#")
-            if comment_start ==  -1:
-              col = len(prev_line) -1
+            if comment_start == -1:
+              col = len(prev_line) - 1
             elif find_string_chars.search(prev_line[comment_start:]) is None:
               col = comment_start
             else:
@@ -260,7 +260,6 @@ class _ASTCallVisitor(ast.NodeVisitor):
     # it is not possible to use that in an argument.
     return node.lineno, node.col_offset
 
-
   def visit_Call(self, node):  # pylint: disable=invalid-name
     """Handle visiting a call node in the AST.
 
@@ -268,7 +267,6 @@ class _ASTCallVisitor(ast.NodeVisitor):
       node: Current Node
     """
 
-
     # Find a simple attribute name path e.g. "tf.foo.bar"
     full_name = self._get_attribute_full_path(node.func)
 
@@ -293,18 +291,21 @@ class _ASTCallVisitor(ast.NodeVisitor):
           lineno, col_offset = self._find_true_position(arg)
           if lineno is None or col_offset is None:
             self._file_edit.add(
-                "Failed to add keyword %r to reordered function %r"
-                % (reordered[idx], full_name), arg.lineno, arg.col_offset,
-                "", "",
+                "Failed to add keyword %r to reordered function %r" %
+                (reordered[idx], full_name),
+                arg.lineno,
+                arg.col_offset,
+                "",
+                "",
                 error="A necessary keyword argument failed to be inserted.")
           else:
             keyword_arg = reordered[idx]
             if (full_name in function_keyword_renames and
                 keyword_arg in function_keyword_renames[full_name]):
               keyword_arg = function_keyword_renames[full_name][keyword_arg]
-            self._file_edit.add("Added keyword %r to reordered function %r"
-                                % (reordered[idx], full_name), lineno,
-                                col_offset, "", keyword_arg + "=")
+            self._file_edit.add("Added keyword %r to reordered function %r" %
+                                (reordered[idx], full_name), lineno, col_offset,
+                                "", keyword_arg + "=")
 
       # Examine each keyword argument and convert it to the final renamed form
       renamed_keywords = ({} if full_name not in function_keyword_renames else
@@ -322,11 +323,11 @@ class _ASTCallVisitor(ast.NodeVisitor):
             # value.
             key_start = argval_col_offset - len(argkey) - 1
             key_end = key_start + len(argkey) + 1
-            if (self._lines[argval_lineno - 1][key_start:key_end] ==
-                argkey + "="):
+            if (self._lines[argval_lineno - 1][key_start:key_end] == argkey +
+                "="):
               self._file_edit.add("Renamed keyword argument from %r to %r" %
-                                  (argkey, renamed_keywords[argkey]),
-                                  argval_lineno,
+                                  (argkey,
+                                   renamed_keywords[argkey]), argval_lineno,
                                   argval_col_offset - len(argkey) - 1,
                                   argkey + "=", renamed_keywords[argkey] + "=")
               continue
@@ -335,7 +336,8 @@ class _ASTCallVisitor(ast.NodeVisitor):
               (argkey, renamed_keywords[argkey]),
               argval.lineno,
               argval.col_offset - len(argkey) - 1,
-              "", "",
+              "",
+              "",
               error="Failed to find keyword lexographically. Fix manually.")
 
     ast.NodeVisitor.generic_visit(self, node)
@@ -352,7 +354,7 @@ class _ASTCallVisitor(ast.NodeVisitor):
     if full_name in self._api_change_spec.change_to_function:
       if not hasattr(node, "is_function_for_call"):
         new_text = full_name + "()"
-        self._file_edit.add("Changed %r to %r"%(full_name, new_text),
+        self._file_edit.add("Changed %r to %r" % (full_name, new_text),
                             node.lineno, node.col_offset, full_name, new_text)
 
     ast.NodeVisitor.generic_visit(self, node)
@@ -380,8 +382,8 @@ class ASTCodeUpgrader(object):
     # Write to a temporary file, just in case we are doing an implace modify.
     with open(in_filename, "r") as in_file, \
         tempfile.NamedTemporaryFile("w", delete=False) as temp_file:
-      ret = self.process_opened_file(
-          in_filename, in_file, out_filename, temp_file)
+      ret = self.process_opened_file(in_filename, in_file, out_filename,
+                                     temp_file)
 
     shutil.move(temp_file.name, out_filename)
     return ret
@@ -424,6 +426,7 @@ class ASTCodeUpgrader(object):
         out_file.write(out_text)
     text += "\n"
     return 1, text, process_errors
+
   # pylint: enable=broad-except
 
   def process_tree(self, root_directory, output_root_directory,
@@ -444,16 +447,16 @@ class ASTCodeUpgrader(object):
 
     # make sure output directory doesn't exist
     if output_root_directory and os.path.exists(output_root_directory):
-      print("Output directory %r must not already exist." % (
-          output_root_directory))
+      print("Output directory %r must not already exist." %
+            (output_root_directory))
       sys.exit(1)
 
     # make sure output directory does not overlap with root_directory
     norm_root = os.path.split(os.path.normpath(root_directory))
     norm_output = os.path.split(os.path.normpath(output_root_directory))
     if norm_root == norm_output:
-      print("Output directory %r same as input directory %r" % (
-          root_directory, output_root_directory))
+      print("Output directory %r same as input directory %r" %
+            (root_directory, output_root_directory))
       sys.exit(1)
 
     # Collect list of files to process (we do this to correctly handle if the
@@ -465,14 +468,16 @@ class ASTCodeUpgrader(object):
       copy_files = [f for f in file_list if not f.endswith(".py")]
       for filename in py_files:
         fullpath = os.path.join(dir_name, filename)
-        fullpath_output = os.path.join(
-            output_root_directory, os.path.relpath(fullpath, root_directory))
+        fullpath_output = os.path.join(output_root_directory,
+                                       os.path.relpath(fullpath,
+                                                       root_directory))
         files_to_process.append((fullpath, fullpath_output))
       if copy_other_files:
         for filename in copy_files:
           fullpath = os.path.join(dir_name, filename)
-          fullpath_output = os.path.join(
-              output_root_directory, os.path.relpath(fullpath, root_directory))
+          fullpath_output = os.path.join(output_root_directory,
+                                         os.path.relpath(
+                                             fullpath, root_directory))
           files_to_copy.append((fullpath, fullpath_output))
 
     file_count = 0
@@ -641,18 +646,17 @@ class TFAPIChangeSpec(APIChangeSpec):
         "tf.concat": ["concat_dim", "values", "name"],
         "tf.svd": ["tensor", "compute_uv", "full_matrices", "name"],
         "tf.nn.softmax_cross_entropy_with_logits": [
-            "logits", "labels", "dim", "name"],
+            "logits", "labels", "dim", "name"
+        ],
         "tf.nn.sparse_softmax_cross_entropy_with_logits": [
-            "logits", "labels", "name"],
-        "tf.nn.sigmoid_cross_entropy_with_logits": [
-            "logits", "labels", "name"],
+            "logits", "labels", "name"
+        ],
+        "tf.nn.sigmoid_cross_entropy_with_logits": ["logits", "labels", "name"],
         "tf.op_scope": ["values", "name", "default_name"],
     }
 
     # Specially handled functions.
-    self.function_handle = {
-        "tf.reverse": self._reverse_handler
-    }
+    self.function_handle = {"tf.reverse": self._reverse_handler}
 
   @staticmethod
   def _reverse_handler(file_edit_recorder, node):
@@ -661,12 +665,13 @@ class TFAPIChangeSpec(APIChangeSpec):
     comment = ("ERROR: tf.reverse has had its argument semantics changed\n"
                "significantly the converter cannot detect this reliably, so you"
                "need to inspect this usage manually.\n")
-    file_edit_recorder.add(comment,
-                           node.lineno,
-                           node.col_offset,
-                           "tf.reverse",
-                           "tf.reverse",
-                           error="tf.reverse requires manual check.")
+    file_edit_recorder.add(
+        comment,
+        node.lineno,
+        node.col_offset,
+        "tf.reverse",
+        "tf.reverse",
+        error="tf.reverse requires manual check.")
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/tools/dist_test/build_server.sh b/tensorflow/tools/dist_test/build_server.sh
index 878fabd248f3c1dd5cb79983df5220ebf5893026..225c0347416ec8c8fef855946d18e838bd767690 100755
--- a/tensorflow/tools/dist_test/build_server.sh
+++ b/tensorflow/tools/dist_test/build_server.sh
@@ -16,14 +16,15 @@
 #
 # Builds the test server for distributed (GRPC) TensorFlow
 #
-# Usage: build_server.sh <docker_image_name> <whl_url> [--test]
+# Usage: build_server.sh <docker_image_name> <whl_file_location> [--test]
 #
 # Arguments:
 #   docker_image_name: Name of the docker image to build.
 #     E.g.: tensorflow/tf_grpc_test_server:0.11.0rc1
 #
-#   whl_url: URL from which the TensorFlow whl file will be downloaded.
+#   whl_file_location: URL from which the TensorFlow whl file will be downloaded.
 #     E.g.: https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.11.0rc1-cp27-none-linux_x86_64.whl
+#     E.g.: /path/to/folder/tensorflow-0.11.0rc1-cp27-none-linux_x86_64.whl
 #
 # The optional flag --test lets the script to use the Dockerfile for the
 # testing GRPC server. Without the flag, the script will build the non-test
@@ -41,11 +42,11 @@ die() {
 
 # Check arguments
 if [[ $# -lt 2 ]]; then
-  die "Usage: $0 <docker_image_name> <whl_url> [--test]"
+  die "Usage: $0 <docker_image_name> <whl_location> [--test]"
 fi
 
 DOCKER_IMG_NAME=$1
-WHL_URL=$2
+WHL_FILE_LOCATION=$2
 shift 2
 
 # Current script directory
@@ -53,7 +54,7 @@ DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 
 BUILD_DIR=$(mktemp -d)
 echo ""
-echo "Using whl file URL: ${WHL_URL}"
+echo "Using whl file URL: ${WHL_FILE_LOCATION}"
 echo "Building in temporary directory: ${BUILD_DIR}"
 
 cp -r ${DIR}/* "${BUILD_DIR}"/ || \
@@ -65,9 +66,15 @@ if [[ $1 == "--test" ]]; then
 fi
 echo "Using Docker file: ${DOCKER_FILE}"
 
+if [[ $WHL_FILE_LOCATION =~ 'http://' || $WHL_FILE_LOCATION =~ 'https://' ]]; then
+    # Download whl file into the build context directory.
+    wget -P "${BUILD_DIR}" "${WHL_FILE_LOCATION}" || \
+        die "Failed to download tensorflow whl file from URL: ${WHL_FILE_LOCATION}"
+else
+    cp "${WHL_FILE_LOCATION}" "${BUILD_DIR}"
+fi
+
 # Download whl file into the build context directory.
-wget -P "${BUILD_DIR}" ${WHL_URL} || \
-    die "Failed to download tensorflow whl file from URL: ${WHL_URL}"
 
 if [[ ! -f "${DOCKER_FILE}" ]]; then
   die "ERROR: Unable to find dockerfile: ${DOCKER_FILE}"
diff --git a/tensorflow/tools/dist_test/python/mnist_replica.py b/tensorflow/tools/dist_test/python/mnist_replica.py
index e40ecb43f9a00bee7309895969ff65e48b95b4e9..a2d12442c44553a287637029843021b7541fa3fa 100644
--- a/tensorflow/tools/dist_test/python/mnist_replica.py
+++ b/tensorflow/tools/dist_test/python/mnist_replica.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Distributed MNIST training and validation, with model replicas.
 
 A simple softmax model with one hidden layer is defined. The parameters
@@ -32,7 +31,6 @@ perform forward computation and gradient calculation in parallel, which
 should lead to increased training speed for the simple model.
 """
 
-
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -45,7 +43,6 @@ import time
 import tensorflow as tf
 from tensorflow.examples.tutorials.mnist import input_data
 
-
 flags = tf.app.flags
 flags.DEFINE_string("data_dir", "/tmp/mnist-data",
                     "Directory for storing mnist data")
@@ -56,8 +53,7 @@ flags.DEFINE_integer("task_index", None,
                      "Worker task index, should be >= 0. task_index=0 is "
                      "the master worker task the performs the variable "
                      "initialization ")
-flags.DEFINE_integer("num_gpus", 1,
-                     "Total number of gpus for each machine."
+flags.DEFINE_integer("num_gpus", 1, "Total number of gpus for each machine."
                      "If you don't use GPU, please set it to '0'")
 flags.DEFINE_integer("replicas_to_aggregate", None,
                      "Number of replicas to aggregate before parameter update"
@@ -69,24 +65,24 @@ flags.DEFINE_integer("train_steps", 200,
                      "Number of (global) training steps to perform")
 flags.DEFINE_integer("batch_size", 100, "Training batch size")
 flags.DEFINE_float("learning_rate", 0.01, "Learning rate")
-flags.DEFINE_boolean("sync_replicas", False,
-                     "Use the sync_replicas (synchronized replicas) mode, "
-                     "wherein the parameter updates from workers are aggregated "
-                     "before applied to avoid stale gradients")
+flags.DEFINE_boolean(
+    "sync_replicas", False,
+    "Use the sync_replicas (synchronized replicas) mode, "
+    "wherein the parameter updates from workers are aggregated "
+    "before applied to avoid stale gradients")
 flags.DEFINE_boolean(
     "existing_servers", False, "Whether servers already exists. If True, "
     "will use the worker hosts via their GRPC URLs (one client process "
     "per worker host). Otherwise, will create an in-process TensorFlow "
     "server.")
-flags.DEFINE_string("ps_hosts","localhost:2222",
+flags.DEFINE_string("ps_hosts", "localhost:2222",
                     "Comma-separated list of hostname:port pairs")
 flags.DEFINE_string("worker_hosts", "localhost:2223,localhost:2224",
                     "Comma-separated list of hostname:port pairs")
-flags.DEFINE_string("job_name", None,"job name: worker or ps")
+flags.DEFINE_string("job_name", None, "job name: worker or ps")
 
 FLAGS = flags.FLAGS
 
-
 IMAGE_PIXELS = 28
 
 
@@ -97,7 +93,7 @@ def main(unused_argv):
 
   if FLAGS.job_name is None or FLAGS.job_name == "":
     raise ValueError("Must specify an explicit `job_name`")
-  if FLAGS.task_index is None or FLAGS.task_index =="":
+  if FLAGS.task_index is None or FLAGS.task_index == "":
     raise ValueError("Must specify an explicit `task_index`")
 
   print("job name = %s" % FLAGS.job_name)
@@ -110,9 +106,7 @@ def main(unused_argv):
   # Get the number of workers.
   num_workers = len(worker_spec)
 
-  cluster = tf.train.ClusterSpec({
-      "ps": ps_spec,
-      "worker": worker_spec})
+  cluster = tf.train.ClusterSpec({"ps": ps_spec, "worker": worker_spec})
 
   if not FLAGS.existing_servers:
     # Not using existing servers. Create an in-process server.
@@ -217,7 +211,8 @@ def main(unused_argv):
     sess_config = tf.ConfigProto(
         allow_soft_placement=True,
         log_device_placement=False,
-        device_filters=["/job:ps", "/job:worker/task:%d" % FLAGS.task_index])
+        device_filters=["/job:ps",
+                        "/job:worker/task:%d" % FLAGS.task_index])
 
     # The chief worker (task_index==0) session will prepare the session,
     # while the remaining workers will wait for the preparation to complete.
@@ -231,8 +226,7 @@ def main(unused_argv):
       server_grpc_url = "grpc://" + worker_spec[FLAGS.task_index]
       print("Using existing server at: %s" % server_grpc_url)
 
-      sess = sv.prepare_or_wait_for_session(server_grpc_url,
-                                            config=sess_config)
+      sess = sv.prepare_or_wait_for_session(server_grpc_url, config=sess_config)
     else:
       sess = sv.prepare_or_wait_for_session(server.target, config=sess_config)
 
diff --git a/tensorflow/tools/docker/Dockerfile.devel b/tensorflow/tools/docker/Dockerfile.devel
index 5dc4a053fd2cae7d83739507fea31e7afc92d77c..d16761c3675942838fd2be0ea6e0b7463a3bf249 100644
--- a/tensorflow/tools/docker/Dockerfile.devel
+++ b/tensorflow/tools/docker/Dockerfile.devel
@@ -70,7 +70,7 @@ RUN mkdir /bazel && \
 
 # Download and build TensorFlow.
 WORKDIR /tensorflow
-RUN git clone --branch=r1.5 --depth=1 https://github.com/tensorflow/tensorflow.git .
+RUN git clone --branch=r1.6 --depth=1 https://github.com/tensorflow/tensorflow.git .
 
 # TODO(craigcitro): Don't install the pip package, since it makes it
 # more difficult to experiment with local changes. Instead, just add
diff --git a/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl b/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl
index 96b260ad3aeb78622dd1ad276f7d524dd598e3bf..3690e7dfe57a4682276a90b10cb84c9a329b3f5e 100644
--- a/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl
+++ b/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl
@@ -3,7 +3,7 @@ FROM tensorflow/tensorflow:latest-devel
 LABEL maintainer="Clayne Robison<clayne.b.robison@intel.com>"
 
 # These arguments are parameterized. Use --build-args to override.
-ARG TF_BRANCH=r1.5
+ARG TF_BRANCH=r1.6
 ARG WHL_DIR=/whl
 
 RUN apt-get update && apt-get install -y --no-install-recommends \
diff --git a/tensorflow/tools/docker/Dockerfile.devel-gpu b/tensorflow/tools/docker/Dockerfile.devel-gpu
index 07ffd3839a32ef194100322e54b9133412e4b664..4ef37881bc91aaa58bab031c69b4a96c2a9d8ec1 100644
--- a/tensorflow/tools/docker/Dockerfile.devel-gpu
+++ b/tensorflow/tools/docker/Dockerfile.devel-gpu
@@ -79,7 +79,7 @@ RUN mkdir /bazel && \
 
 # Download and build TensorFlow.
 WORKDIR /tensorflow
-RUN git clone --branch=r1.5 --depth=1 https://github.com/tensorflow/tensorflow.git .
+RUN git clone --branch=r1.6 --depth=1 https://github.com/tensorflow/tensorflow.git .
 
 # Configure the build for our CUDA configuration.
 ENV CI_BUILD_PYTHON python
diff --git a/tensorflow/tools/docker/jupyter_notebook_config.py b/tensorflow/tools/docker/jupyter_notebook_config.py
index 0acbf6fcee58b3eb14794c0f3bb8d2f6ae6e5910..05dcefb099a92683e2cd4700fff54c89c018baa6 100644
--- a/tensorflow/tools/docker/jupyter_notebook_config.py
+++ b/tensorflow/tools/docker/jupyter_notebook_config.py
@@ -15,6 +15,7 @@
 import os
 from IPython.lib import passwd
 
+c = c  # pylint:disable=undefined-variable
 c.NotebookApp.ip = '*'
 c.NotebookApp.port = int(os.getenv('PORT', 8888))
 c.NotebookApp.open_browser = False
diff --git a/tensorflow/tools/docs/pretty_docs.py b/tensorflow/tools/docs/pretty_docs.py
index c033c16ae98c4bcaa4c0338e539324b3a2ae5552..543b5fa6fefcd8e8dca99ad7eac7cca76781ccd3 100644
--- a/tensorflow/tools/docs/pretty_docs.py
+++ b/tensorflow/tools/docs/pretty_docs.py
@@ -162,7 +162,7 @@ def _build_class_page(page_info):
       parts.append(h3.format(**method_info.__dict__))
 
       if method_info.signature is not None:
-        parts.append(_build_signature(method_info))
+        parts.append(_build_signature(method_info, use_full_name=False))
 
       parts.append(method_info.doc.docstring)
       parts.append(_build_function_details(method_info.doc.function_details))
@@ -259,14 +259,14 @@ def _build_module_page(page_info):
   return ''.join(parts)
 
 
-def _build_signature(obj_info):
+def _build_signature(obj_info, use_full_name=True):
   """Returns a md code block showing the function signature."""
   # Special case tf.range, since it has an optional first argument
   if obj_info.full_name == 'tf.range':
     return (
         '``` python\n'
-        "range(limit, delta=1, dtype=None, name='range')\n"
-        "range(start, limit, delta=1, dtype=None, name='range')\n"
+        "tf.range(limit, delta=1, dtype=None, name='range')\n"
+        "tf.range(start, limit, delta=1, dtype=None, name='range')\n"
         '```\n\n')
 
   parts = ['``` python']
@@ -281,7 +281,11 @@ def _build_signature(obj_info):
     sig = ',\n'.join('    %s' % sig_item for sig_item in obj_info.signature)
     sig = '\n'+sig+'\n'
 
-  parts.append(signature_template.format(name=obj_info.short_name, sig=sig))
+  if use_full_name:
+    obj_name = obj_info.full_name
+  else:
+    obj_name = obj_info.short_name
+  parts.append(signature_template.format(name=obj_name, sig=sig))
   parts.append('```\n\n')
 
   return '\n'.join(parts)
@@ -323,7 +327,7 @@ class _Metadata(object):
   """
 
   def __init__(self, name):
-    """Creata a Metadata builder.
+    """Create a Metadata builder.
 
     Args:
       name: The name of the page being described by the Metadata block.
diff --git a/tensorflow/tools/graph_transforms/quantize_nodes.cc b/tensorflow/tools/graph_transforms/quantize_nodes.cc
index 5ccd88cfa1acfd55e90504d66417349e42fe3b50..a022f5792676c62c52fd1197b0d8c436f7161a47 100644
--- a/tensorflow/tools/graph_transforms/quantize_nodes.cc
+++ b/tensorflow/tools/graph_transforms/quantize_nodes.cc
@@ -183,22 +183,6 @@ Status ExtractRangeFromParams(const TransformFuncContext& context,
   return Status::OK();
 }
 
-bool AreAttrsEqual(const NodeDef* current_node, const NodeDef* other_node) {
-  if (current_node->attr_size() != other_node->attr_size()) {
-    return false;
-  }
-  string current_serialized;
-  string other_serialized;
-  for (const auto& attr : other_node->attr()) {
-    auto iter = current_node->attr().find(attr.first);
-    if (iter == current_node->attr().end()) return false;
-    iter->second.SerializeToString(&current_serialized);
-    attr.second.SerializeToString(&other_serialized);
-    if (current_serialized != other_serialized) return false;
-  }
-  return true;
-}
-
 }  // namespace
 
 // Analyzes all the nodes in the graph to figure out which ones are duplicates
diff --git a/tensorflow/tools/graph_transforms/sparsify_gather.cc b/tensorflow/tools/graph_transforms/sparsify_gather.cc
index 96324d0deab400078fdf388bff69001f8e2df9aa..9c583d83cacc07d252699f71679e490648bc4dd3 100644
--- a/tensorflow/tools/graph_transforms/sparsify_gather.cc
+++ b/tensorflow/tools/graph_transforms/sparsify_gather.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include <cmath>
 #include <memory>
+#include <unordered_map>
 
 #include "tensorflow/c/checkpoint_reader.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -28,9 +29,10 @@ limitations under the License.
 #include "tensorflow/tools/graph_transforms/transform_utils.h"
 
 namespace tensorflow {
-using strings::StrCat;
 using str_util::Join;
 using str_util::Split;
+using str_util::StringReplace;
+using strings::StrCat;
 
 namespace graph_transforms {
 
@@ -89,7 +91,7 @@ Status ObtainTensorSlice(const GraphDef& input_graph_def,
                          string* shape_slice_string) {
   string restore_node_name;
   for (const auto& node : input_graph_def.node()) {
-    std::vector<string> node_name_parts = str_util::Split(node.name(), "/");
+    std::vector<string> node_name_parts = Split(node.name(), "/");
     if (node_name_parts.size() == 2 &&
         StringPiece(node_name_parts[0]).starts_with("save") &&
         StringPiece(node_name_parts[1]).starts_with("Assign") &&
@@ -119,13 +121,13 @@ Status ObtainTensorSlice(const GraphDef& input_graph_def,
 }
 
 string GetMonolithicTensorKey(const string& tensor_slice_name) {
-  std::vector<string> names = str_util::Split(tensor_slice_name, "/");
+  std::vector<string> names = Split(tensor_slice_name, "/");
   CHECK_GE(names.size(), 2);
   CHECK(StringPiece(names[names.size() - 1]).starts_with("part_"));
 
   // Remove the "part_x" suffix
   names.pop_back();
-  return str_util::Join(names, "/");
+  return Join(names, "/");
 }
 
 Status ReadTensorFromCheckpoint(
@@ -179,6 +181,14 @@ Status ObtainVariableInfo(
   return Status::OK();
 }
 
+Status RemoveInputAtIndex(NodeDef* n, int index) {
+  for (int i = index; i < n->input_size() - 1; i++) {
+    n->mutable_input()->SwapElements(i, i + 1);
+  }
+  n->mutable_input()->RemoveLast();
+  return Status::OK();
+}
+
 Status SparsifyGatherInternal(
     const GraphDef& input_graph_def,
     const std::unique_ptr<std::unordered_map<string, string> >&
@@ -193,6 +203,15 @@ Status SparsifyGatherInternal(
   GraphDef current_graph_def = input_graph_def;
   bool any_match_found = false;
 
+  // Populate references.
+  std::unordered_map<string, int> refs;
+  for (const auto& node : current_graph_def.node()) {
+    for (const auto& input : node.input()) {
+      auto parsed_input = StringReplace(input, "^", "", true);
+      refs[parsed_input] += 1;
+    }
+  }
+
   // The subgraphs may have overlapping components, therefore GraphMatcher
   // doesn't return all subgraphs in one round -- this has to be multi-round
   // update.
@@ -200,15 +219,15 @@ Status SparsifyGatherInternal(
     any_match_found = false;
     GraphDef replaced_graph_def = current_graph_def;
     std::vector<string> init_table_node_names;
-    std::vector<string> removed_variable_names;
+    std::vector<string> removed_node_names;
 
     TF_RETURN_IF_ERROR(ReplaceMatchingOpTypes(
         current_graph_def, pattern,
         [&ckpt_reader, &any_match_found, &init_table_node_names,
-         &shapes_and_slices, &removed_variable_names](
-            const NodeMatch& match, const std::set<string>& input_nodes,
-            const std::set<string>& output_nodes,
-            std::vector<NodeDef>* new_nodes) {
+         &shapes_and_slices, &removed_node_names,
+         &refs](const NodeMatch& match, const std::set<string>& input_nodes,
+                const std::set<string>& output_nodes,
+                std::vector<NodeDef>* new_nodes) {
           any_match_found = true;
 
           // The captured subgraph should be of the following pattern:
@@ -290,9 +309,13 @@ Status SparsifyGatherInternal(
             TF_RETURN_IF_ERROR(ReadTensorFromCheckpoint(
                 weights_node.name(), ckpt_reader,
                 (*shapes_and_slices)[weights_node.name()], &weight));
-            // Add both both weight and identity node names.
-            removed_variable_names.push_back(weights_node.name());
-            removed_variable_names.push_back(match.inputs[0].node.name());
+          }
+          // Add both both weight and identity node names.
+          removed_node_names.push_back(weights_node.name());
+          removed_node_names.push_back(match.inputs[0].node.name());
+          for (auto input_node : match.inputs[0].node.input()) {
+            auto parsed_input = StringReplace(input_node, "^", "", true);
+            refs[parsed_input]--;
           }
           Tensor indices_tensor;
           Tensor values_tensor;
@@ -362,15 +385,23 @@ Status SparsifyGatherInternal(
 
           // Connect nodes
           AddNodeInput(hashtable_node.name(), &init_table_node);
+          refs[hashtable_node.name()]++;
           AddNodeInput(indices_node.name(), &init_table_node);
+          refs[indices_node.name()]++;
           AddNodeInput(values_node.name(), &init_table_node);
+          refs[values_node.name()]++;
 
           AddNodeInput(hashtable_node.name(), &lookup_node);
+          refs[hashtable_node.name()]++;
           AddNodeInput(gather_node.input(1), &lookup_node);
+          refs[gather_node.input(1)]++;
           AddNodeInput(default_value_node.name(), &lookup_node);
+          refs[default_value_node.name()]++;
 
           AddNodeInput(lookup_node.name(), &expand_dims_node);
+          refs[lookup_node.name()]++;
           AddNodeInput(dim_idx_node.name(), &expand_dims_node);
+          refs[dim_idx_node.name()]++;
 
           // Copy 'ids' input of original 'Gather'
           new_nodes->push_back(match.inputs[1].node);
@@ -404,47 +435,90 @@ Status SparsifyGatherInternal(
     for (const string& name : init_table_node_names) {
       // Add control dependence from init_table_node to group_deps_node
       AddNodeInput(StrCat("^", name), init_op);
+      refs[name]++;
+    }
+
+    // Erase inputs and outputs as they are not considered for deletion.
+    for (const auto& output : context.output_names) {
+      refs.erase(output);
+    }
+
+    for (const auto& input : context.input_names) {
+      refs.erase(input);
+    }
+
+    // Add nodes with a reference count of 0 for deletion.
+    for (auto entry : refs) {
+      if (entry.second == 0) {
+        removed_node_names.push_back(entry.first);
+      }
     }
 
-    // Remove all dependencies associated with removed variables.
-    while (!removed_variable_names.empty()) {
-      auto name = removed_variable_names.back();
-      removed_variable_names.pop_back();
+    while (!removed_node_names.empty()) {
+      auto name = removed_node_names.back();
+      removed_node_names.pop_back();
+
       int i = 0;
       while (i < replaced_graph_def.node_size()) {
-        if (!replaced_graph_def.node(i).input_size()) {
-          if (replaced_graph_def.node(i).name() == name) {
-            replaced_graph_def.mutable_node()->SwapElements(
-                i, replaced_graph_def.node_size() - 1);
-            replaced_graph_def.mutable_node()->RemoveLast();
-            continue;
+        // Revisit this to see if we can safely remove RestoreV2 nodes.
+        if ((replaced_graph_def.node(i).name() == name) &&
+            (replaced_graph_def.node(i).op() != "RestoreV2")) {
+          for (const auto& input : replaced_graph_def.node(i).input()) {
+            auto parsed_input = StringReplace(input, "^", "", true);
+            refs[parsed_input] -= 1;
+            if (refs[parsed_input] == 0) {
+              removed_node_names.push_back(parsed_input);
+            }
           }
-          i++;
+          replaced_graph_def.mutable_node()->SwapElements(
+              i, replaced_graph_def.node_size() - 1);
+          replaced_graph_def.mutable_node()->RemoveLast();
           continue;
         }
         int j = 0;
+        bool deleted_inputs = false;
         while (j < replaced_graph_def.node(i).input_size()) {
           if (replaced_graph_def.node(i).input(j) == name ||
               replaced_graph_def.node(i).input(j) == ("^" + name)) {
-            replaced_graph_def.mutable_node(i)->mutable_input()->SwapElements(
-                j, replaced_graph_def.node(i).input_size() - 1);
-            replaced_graph_def.mutable_node(i)->mutable_input()->RemoveLast();
+            TF_RETURN_IF_ERROR(
+                RemoveInputAtIndex(replaced_graph_def.mutable_node(i), j));
+            deleted_inputs = true;
             continue;
           }
           j++;
         }
-        if ((replaced_graph_def.node(i).input_size() == 0) ||
-            (replaced_graph_def.node(i).op() == "Assign" &&
-             replaced_graph_def.node(i).input_size() == 1)) {
-          removed_variable_names.push_back(replaced_graph_def.node(i).name());
-          if (replaced_graph_def.node(i).input_size() == 1) {
-            removed_variable_names.push_back(
-                replaced_graph_def.node(i).input(0));
+        if (deleted_inputs) {
+          if (replaced_graph_def.node(i).op() == "ConcatV2") {
+            if (replaced_graph_def.node(i).input_size() > 2) {
+              SetNodeAttr("N", replaced_graph_def.node(i).input_size() - 1,
+                          replaced_graph_def.mutable_node(i));
+            } else if (replaced_graph_def.node(i).input_size() == 2) {
+              if (refs[replaced_graph_def.node(i).input(1)] != 1) {
+                return errors::Internal(
+                    "Expect axis tensor of ConcatV2 node to only be referenced "
+                    "once.");
+              }
+              refs[replaced_graph_def.node(i).input(1)] -= 1;
+              removed_node_names.push_back(replaced_graph_def.node(i).input(1));
+              replaced_graph_def.mutable_node(i)->mutable_input()->RemoveLast();
+              replaced_graph_def.mutable_node(i)->mutable_attr()->erase("N");
+              replaced_graph_def.mutable_node(i)->set_op("Identity");
+            } else {
+              return errors::Internal(
+                  "ConcatV2 should have at least two elements");
+            }
+          }
+          if ((replaced_graph_def.node(i).op() == "Assign" ||
+               replaced_graph_def.node(i).op() == "Reshape" ||
+               replaced_graph_def.node(i).op() == "Equal" ||
+               replaced_graph_def.node(i).op() == "Mean" ||
+               replaced_graph_def.node(i).op() == "ScalarSummary") &&
+              replaced_graph_def.node(i).input_size() == 1) {
+            removed_node_names.push_back(replaced_graph_def.node(i).name());
+          }
+          if (!replaced_graph_def.node(i).input_size()) {
+            removed_node_names.push_back(replaced_graph_def.node(i).name());
           }
-          replaced_graph_def.mutable_node()->SwapElements(
-              i, replaced_graph_def.node_size() - 1);
-          replaced_graph_def.mutable_node()->RemoveLast();
-          continue;
         }
         i++;
       }
@@ -485,17 +559,22 @@ Status SparsifyGather(const GraphDef& input_graph_def,
     };
   // clang-format on
 
+  GraphDef cleaned_input_graph_def;
+  RemoveAttributes(input_graph_def, {"_output_shapes"},
+                   &cleaned_input_graph_def);
+
   GraphDef temp_output;
 
   std::unique_ptr<BundleReader> ckpt_reader;
   TF_RETURN_IF_ERROR(InitializeCheckpointReader(context, &ckpt_reader));
 
   std::unique_ptr<std::unordered_map<string, string> > shapes_and_slices;
-  TF_RETURN_IF_ERROR(ObtainVariableInfo(input_graph_def, &shapes_and_slices));
+  TF_RETURN_IF_ERROR(
+      ObtainVariableInfo(cleaned_input_graph_def, &shapes_and_slices));
 
-  TF_RETURN_IF_ERROR(SparsifyGatherInternal(input_graph_def, shapes_and_slices,
-                                            context, gather_pattern,
-                                            ckpt_reader, &temp_output));
+  TF_RETURN_IF_ERROR(SparsifyGatherInternal(
+      cleaned_input_graph_def, shapes_and_slices, context, gather_pattern,
+      ckpt_reader, &temp_output));
 
   TF_RETURN_IF_ERROR(SparsifyGatherInternal(temp_output, shapes_and_slices,
                                             context, gather_v2_pattern,
diff --git a/tensorflow/tools/graph_transforms/sparsify_gather_test.cc b/tensorflow/tools/graph_transforms/sparsify_gather_test.cc
index 000568a0cc9aceffa927abb1dc56e6586030fea0..203ed3e0f9c3d1a02128bba770879e69cddb6501 100644
--- a/tensorflow/tools/graph_transforms/sparsify_gather_test.cc
+++ b/tensorflow/tools/graph_transforms/sparsify_gather_test.cc
@@ -71,7 +71,7 @@ class SparsifyGatherTest : public ::testing::Test {
   }
 
   void TestSinglePartition(bool gather_v2, bool include_shared_init,
-                           bool test_variable,
+                           bool test_variable, bool test_kept_concat,
                            const string& shared_init_name = "group_deps") {
     GraphDef graph_def;
 
@@ -80,6 +80,8 @@ class SparsifyGatherTest : public ::testing::Test {
     // Build the graph.
     NodeDef* input_node = CreateNode("ids", "Const", {}, &graph_def);
     NodeDef* w_node;
+    NodeDef* zeros_const;
+    NodeDef* zeros_shape;
     NodeDef* zeros_node;
     NodeDef* assign_node;
 
@@ -92,8 +94,12 @@ class SparsifyGatherTest : public ::testing::Test {
     } else {
       w_node = CreateNode("w/part_1", "VariableV2", {}, &graph_def);
 
-      zeros_node =
-          CreateNode("w/part_1/Initializer/zeros", "Const", {}, &graph_def);
+      zeros_shape = CreateNode("w/part_1/Initializer/zeros/shape_as_tensor",
+                               "Const", {}, &graph_def);
+      zeros_const = CreateNode("w/part_1/Initializer/zeros/Const", "Const", {},
+                               &graph_def);
+      zeros_node = CreateNode("w/part_1/Initializer/zeros", "Fill",
+                              {zeros_shape, zeros_const}, &graph_def);
       assign_node = CreateNode("w/part_1/Assign", "Assign",
                                {w_node, zeros_node}, &graph_def);
 
@@ -133,6 +139,26 @@ class SparsifyGatherTest : public ::testing::Test {
       }
     }
 
+    NodeDef* concat_axis_node =
+        CreateNode("linear/concat/axis", "Const", {}, &graph_def);
+    NodeDef* concat_input_node =
+        CreateNode("concat/input/node", "Const", {}, &graph_def);
+    NodeDef* concat_node = nullptr;
+    if (!test_kept_concat) {
+      concat_node = CreateNode(
+          "concat/node", "ConcatV2",
+          {identity_node, concat_input_node, concat_axis_node}, &graph_def);
+      SetNodeAttr("N", 2, concat_node);
+    } else {
+      NodeDef* concat_input_node_2 =
+          CreateNode("concat/input/node_2", "Const", {}, &graph_def);
+      concat_node = CreateNode("concat/node", "ConcatV2",
+                               {identity_node, concat_input_node,
+                                concat_input_node_2, concat_axis_node},
+                               &graph_def);
+      SetNodeAttr("N", 3, concat_node);
+    }
+
     // Run the op.
     GraphDef result;
     TransformFuncContext context;
@@ -151,12 +177,32 @@ class SparsifyGatherTest : public ::testing::Test {
     MapNamesToNodes(result, &node_lookup);
 
     // Check nodes.
+    EXPECT_EQ(0,
+              node_lookup.count("w/part_1/Initializer/zeros/shape_as_tensor"));
+    EXPECT_EQ(0, node_lookup.count("w/part_1/Initializer/zeros/Const"));
     EXPECT_EQ(0, node_lookup.count("w/part_1/Initializer/zeros"));
     EXPECT_EQ(0, node_lookup.count("w/part_1/Assign"));
 
     EXPECT_EQ(1, node_lookup.count("ids"));
     EXPECT_EQ("Const", node_lookup.at("ids")->op());
 
+    EXPECT_EQ(1, node_lookup.count("concat/node"));
+
+    if (!test_kept_concat) {
+      EXPECT_EQ(0, node_lookup.count("linear/concat/axis"));
+      EXPECT_EQ("Identity", node_lookup.at("concat/node")->op());
+      EXPECT_EQ(1, node_lookup.at("concat/node")->input_size());
+      EXPECT_EQ("concat/input/node", node_lookup.at("concat/node")->input(0));
+    } else {
+      EXPECT_EQ(1, node_lookup.count("linear/concat/axis"));
+      EXPECT_EQ("ConcatV2", node_lookup.at("concat/node")->op());
+      EXPECT_EQ(3, node_lookup.at("concat/node")->input_size());
+      EXPECT_EQ("concat/input/node", node_lookup.at("concat/node")->input(0));
+      EXPECT_EQ("concat/input/node_2", node_lookup.at("concat/node")->input(1));
+      EXPECT_EQ("linear/concat/axis", node_lookup.at("concat/node")->input(2));
+      EXPECT_EQ(2, node_lookup.at("concat/node")->attr().at("N").i());
+    }
+
     EXPECT_EQ(1, node_lookup.count("w/part_1/indices"));
     EXPECT_EQ("Const", node_lookup.at("w/part_1/indices")->op());
     Tensor expected_indices_tensor(DT_INT64, TensorShape({3}));
@@ -247,7 +293,11 @@ class SparsifyGatherTest : public ::testing::Test {
     // Two partitions
     NodeDef* w_node1;
     NodeDef* w_node2;
+    NodeDef* zeros_const1;
+    NodeDef* zeros_shape1;
     NodeDef* zeros_node1;
+    NodeDef* zeros_const2;
+    NodeDef* zeros_shape2;
     NodeDef* zeros_node2;
     NodeDef* assign_node1;
     NodeDef* assign_node2;
@@ -261,8 +311,13 @@ class SparsifyGatherTest : public ::testing::Test {
       SetNodeTensorAttr<float>("value", weights, w_node2);
     } else {
       w_node1 = CreateNode("w1/part_1", "VariableV2", {}, &graph_def);
-      zeros_node1 =
-          CreateNode("w1/part_1/Initializer/zeros", "Const", {}, &graph_def);
+
+      zeros_shape1 = CreateNode("w1/part_1/Initializer/zeros/shape_as_tensor",
+                                "Const", {}, &graph_def);
+      zeros_const1 = CreateNode("w1/part_1/Initializer/zeros/Const", "Const",
+                                {}, &graph_def);
+      zeros_node1 = CreateNode("w1/part_1/Initializer/zeros", "Fill",
+                               {zeros_shape1, zeros_const1}, &graph_def);
       assign_node1 = CreateNode("w1/part_1/Assign", "Assign",
                                 {w_node1, zeros_node1}, &graph_def);
 
@@ -285,8 +340,12 @@ class SparsifyGatherTest : public ::testing::Test {
       CreateNode("save/Assign", "Assign", {w_node1, restore_node1}, &graph_def);
 
       w_node2 = CreateNode("w2/part_1", "VariableV2", {}, &graph_def);
-      zeros_node2 =
-          CreateNode("w2/part_1/Initializer/zeros", "Const", {}, &graph_def);
+      zeros_shape2 = CreateNode("w2/part_1/Initializer/zeros/shape_as_tensor",
+                                "Const", {}, &graph_def);
+      zeros_const2 = CreateNode("w2/part_1/Initializer/zeros/Const", "Const",
+                                {}, &graph_def);
+      zeros_node2 = CreateNode("w2/part_1/Initializer/zeros", "Fill",
+                               {zeros_shape2, zeros_const2}, &graph_def);
       assign_node2 = CreateNode("w2/part_1/Assign", "Assign",
                                 {w_node2, zeros_node2}, &graph_def);
 
@@ -322,6 +381,13 @@ class SparsifyGatherTest : public ::testing::Test {
     MakeGather("gather1", gather_v2, identity_node1, input_node, &graph_def);
     MakeGather("gather2", gather_v2, identity_node2, input_node, &graph_def);
 
+    NodeDef* concat_axis_node =
+        CreateNode("linear/concat/axis", "Const", {}, &graph_def);
+    NodeDef* concat_node = CreateNode(
+        "concat/node", "ConcatV2",
+        {identity_node1, identity_node2, concat_axis_node}, &graph_def);
+    SetNodeAttr("N", 2, concat_node);
+
     // Shared init node
     if (include_shared_init) {
       if (!test_variable) {
@@ -350,8 +416,14 @@ class SparsifyGatherTest : public ::testing::Test {
     MapNamesToNodes(result, &node_lookup);
 
     // Check nodes.
+    EXPECT_EQ(0,
+              node_lookup.count("w1/part_1/Initializer/zeros/shape_as_tensor"));
+    EXPECT_EQ(0, node_lookup.count("w1/part_1/Initializer/zeros/Const"));
     EXPECT_EQ(0, node_lookup.count("w1/part_1/Initializer/zeros"));
     EXPECT_EQ(0, node_lookup.count("w1/part_1/Assign"));
+    EXPECT_EQ(0,
+              node_lookup.count("w2/part_1/Initializer/zeros/shape_as_tensor"));
+    EXPECT_EQ(0, node_lookup.count("w2/part_1/Initializer/zeros/Const"));
     EXPECT_EQ(0, node_lookup.count("w2/part_1/Initializer/zeros"));
     EXPECT_EQ(0, node_lookup.count("w2/part_1/Assign"));
     EXPECT_EQ(1, node_lookup.count("ids"));
@@ -487,6 +559,9 @@ class SparsifyGatherTest : public ::testing::Test {
               node_lookup.at("gather2/LookupTableFind")->input(2));
     EXPECT_EQ("gather2/LookupTableFind", node_lookup.at("gather2")->input(0));
 
+    EXPECT_EQ(0, node_lookup.count("linear/concat/axis"));
+    EXPECT_EQ(0, node_lookup.count("concat/node"));
+
     // Check control deps.
     EXPECT_EQ(2, node_lookup.at(shared_init_name)->input_size());
     EXPECT_NE(std::find(node_lookup.at(shared_init_name)->input().begin(),
@@ -522,18 +597,31 @@ class SparsifyGatherTest : public ::testing::Test {
 };
 
 TEST_F(SparsifyGatherTest, TestSinglePartition) {
-  TestSinglePartition(false, false, false);
-  TestSinglePartition(false, true, false);
-  TestSinglePartition(true, false, false);
-  TestSinglePartition(true, true, false);
-  TestSinglePartition(false, false, true);
-  TestSinglePartition(false, true, true);
-  TestSinglePartition(true, false, true);
-  TestSinglePartition(true, true, true);
-  TestSinglePartition(false, true, false, "shared_inits");
-  TestSinglePartition(true, true, false, "shared_inits");
-  TestSinglePartition(false, true, true, "shared_inits");
-  TestSinglePartition(true, true, true, "shared_inits");
+  TestSinglePartition(false, false, false, false);
+  TestSinglePartition(false, true, false, false);
+  TestSinglePartition(true, false, false, false);
+  TestSinglePartition(true, true, false, false);
+  TestSinglePartition(false, false, true, false);
+  TestSinglePartition(false, true, true, false);
+  TestSinglePartition(true, false, true, false);
+  TestSinglePartition(true, true, true, false);
+  TestSinglePartition(false, true, false, false, "shared_inits");
+  TestSinglePartition(true, true, false, false, "shared_inits");
+  TestSinglePartition(false, true, true, false, "shared_inits");
+  TestSinglePartition(true, true, true, false, "shared_inits");
+
+  TestSinglePartition(false, false, false, true);
+  TestSinglePartition(false, true, false, true);
+  TestSinglePartition(true, false, false, true);
+  TestSinglePartition(true, true, false, true);
+  TestSinglePartition(false, false, true, true);
+  TestSinglePartition(false, true, true, true);
+  TestSinglePartition(true, false, true, true);
+  TestSinglePartition(true, true, true, true);
+  TestSinglePartition(false, true, false, true, "shared_inits");
+  TestSinglePartition(true, true, false, true, "shared_inits");
+  TestSinglePartition(false, true, true, true, "shared_inits");
+  TestSinglePartition(true, true, true, true, "shared_inits");
 }
 
 TEST_F(SparsifyGatherTest, TestMultiPartition) {
diff --git a/tensorflow/tools/lib_package/BUILD b/tensorflow/tools/lib_package/BUILD
index dbc81599de8539ce58933f9d40bf99fcae8f8e67..7717d8d7de27e827aab5208404f2e2275d60c8d3 100644
--- a/tensorflow/tools/lib_package/BUILD
+++ b/tensorflow/tools/lib_package/BUILD
@@ -99,6 +99,7 @@ genrule(
         "//third_party/hadoop:LICENSE.txt",
         "//third_party/eigen3:LICENSE",
         "//third_party/fft2d:LICENSE",
+        "@aws//:LICENSE",
         "@boringssl//:LICENSE",
         "@com_googlesource_code_re2//:LICENSE",
         "@cub_archive//:LICENSE.TXT",
@@ -114,6 +115,7 @@ genrule(
         "@libxsmm_archive//:LICENSE",
         "@lmdb//:LICENSE",
         "@local_config_sycl//sycl:LICENSE.text",
+        "@nasm//:LICENSE",
         "@nsync//:LICENSE",
         "@png_archive//:LICENSE",
         "@protobuf_archive//:LICENSE",
@@ -134,6 +136,7 @@ genrule(
         "//third_party/hadoop:LICENSE.txt",
         "//third_party/eigen3:LICENSE",
         "//third_party/fft2d:LICENSE",
+        "@aws//:LICENSE",
         "@boringssl//:LICENSE",
         "@com_googlesource_code_re2//:LICENSE",
         "@cub_archive//:LICENSE.TXT",
@@ -149,6 +152,7 @@ genrule(
         "@libxsmm_archive//:LICENSE",
         "@lmdb//:LICENSE",
         "@local_config_sycl//sycl:LICENSE.text",
+        "@nasm//:LICENSE",
         "@nsync//:LICENSE",
         "@png_archive//:LICENSE",
         "@protobuf_archive//:LICENSE",
diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index c789e2ba0cbc87ac0991fdf705d31aa05af794b6..a9c4a8de42a7633b09985cdd4470495c2c4749e2 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -88,13 +88,20 @@ filegroup(
         "//third_party/eigen3:LICENSE",
         "//third_party/fft2d:LICENSE",
         "//third_party/hadoop:LICENSE.txt",
+        "@absl_py//absl/flags:LICENSE",
+        "@arm_neon_2_x86_sse//:LICENSE",
+        "@astor_archive//:LICENSE",
+        "@aws//:LICENSE",
         "@boringssl//:LICENSE",
+        "@com_google_absl//:LICENSE",
         "@com_googlesource_code_re2//:LICENSE",
         "@cub_archive//:LICENSE.TXT",
         "@curl//:COPYING",
         "@eigen_archive//:COPYING.MPL2",
         "@farmhash_archive//:COPYING",
         "@fft2d//:fft/readme.txt",
+        "@flatbuffers//:LICENSE.txt",
+        "@gast_archive//:PKG-INFO",
         "@gemmlowp//:LICENSE",
         "@gif_archive//:COPYING",
         "@grpc//:LICENSE",
@@ -105,11 +112,15 @@ filegroup(
         "@lmdb//:LICENSE",
         "@local_config_sycl//sycl:LICENSE.text",
         "@grpc//third_party/nanopb:LICENSE.txt",
+        "@nasm//:LICENSE",
         "@nsync//:LICENSE",
+        "@pcre//:LICENCE",
         "@png_archive//:LICENSE",
         "@protobuf_archive//:LICENSE",
         "@six_archive//:LICENSE",
         "@snappy//:COPYING",
+        "@swig//:LICENSE",
+        "@termcolor_archive//:COPYING.txt",
         "@zlib_archive//:zlib.h",
         "@org_python_pypi_backports_weakref//:LICENSE",
     ] + if_mkl([
@@ -151,8 +162,10 @@ sh_binary(
             "//tensorflow/contrib/ndlstm:ndlstm",
             "//tensorflow/contrib/nn:nn_py",
             "//tensorflow/contrib/predictor:predictor_pip",
-            "//tensorflow/contrib/py2tf:py2tf_internal",
-            "//tensorflow/contrib/py2tf/convert:convert",
+            "//tensorflow/contrib/py2tf:py2tf",
+            "//tensorflow/contrib/py2tf/converters:converters",
+            "//tensorflow/contrib/py2tf/converters:test_lib",
+            "//tensorflow/contrib/py2tf/impl:impl",
             "//tensorflow/contrib/py2tf/pyct:pyct",
             "//tensorflow/contrib/py2tf/pyct/static_analysis:static_analysis",
             "//tensorflow/contrib/receptive_field:receptive_field_pip",
diff --git a/tensorflow/tools/pip_package/pip_smoke_test.py b/tensorflow/tools/pip_package/pip_smoke_test.py
index 38a900738786e2413f5b1dd914caaebeafc92e21..73d759eb130633094b402c821cc32eb76c076a44 100644
--- a/tensorflow/tools/pip_package/pip_smoke_test.py
+++ b/tensorflow/tools/pip_package/pip_smoke_test.py
@@ -65,7 +65,6 @@ BLACKLIST = [
     "//tensorflow/contrib/framework:checkpoint_ops_testdata",
     "//tensorflow/contrib/bayesflow:reinforce_simple_example",
     "//tensorflow/contrib/bayesflow:examples/reinforce_simple/reinforce_simple_example.py",  # pylint:disable=line-too-long
-    "//tensorflow/contrib/py2tf:py2tf_internal",
     "//tensorflow/contrib/timeseries/examples:predict",
     "//tensorflow/contrib/timeseries/examples:multivariate",
     "//tensorflow/contrib/timeseries/examples:known_anomaly",
diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py
index 62df6453fb5d39728c2985a28a70a263d79804b1..20027869990013098c405b4707318a3ce63000fc 100644
--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@@ -29,16 +29,17 @@ from setuptools.dist import Distribution
 # This version string is semver compatible, but incompatible with pip.
 # For pip, we will remove all '-' characters from this string, and use the
 # result for pip.
-_VERSION = '1.5.0-rc1'
+_VERSION = '1.6.0-rc0'
 
 REQUIRED_PACKAGES = [
     'absl-py >= 0.1.6',
     'astor >= 0.6.0',
     'gast >= 0.2.0',
+    'grpcio >= 1.8.6',
     'numpy >= 1.12.1',
     'six >= 1.10.0',
     'protobuf >= 3.4.0',
-    'tensorflow-tensorboard >= 0.4.0',
+    'tensorflow-tensorboard >= 1.5.0, < 1.6.0',
     'termcolor >= 1.1.0',
 ]
 
@@ -79,13 +80,13 @@ CONSOLE_SCRIPTS = [
     # is now declared by the tensorboard pip package. If we remove the
     # TensorBoard command, pip will inappropriately remove it during install,
     # even though the command is not removed, just moved to a different wheel.
-    'tensorboard = tensorboard.main:main',
+    'tensorboard = tensorboard.main:run_main',
 ]
 # pylint: enable=line-too-long
 
 # remove the tensorboard console script if building tf_nightly
 if 'tf_nightly' in project_name:
-  CONSOLE_SCRIPTS.remove('tensorboard = tensorboard.main:main')
+  CONSOLE_SCRIPTS.remove('tensorboard = tensorboard.main:run_main')
 
 TEST_PACKAGES = [
     'scipy >= 0.15.1',
diff --git a/tensorflow/tools/test/file_name_test.py b/tensorflow/tools/test/file_name_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..16fb8a822d09ed136cf79dd2473fc202ca632d83
--- /dev/null
+++ b/tensorflow/tools/test/file_name_test.py
@@ -0,0 +1,48 @@
+#!/usr/bin/python
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+#
+# Test that checks if we have any issues with case insensitive filesystems.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+BASE_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))
+ERROR_MESSAGE = """
+Files with same name but different case detected in directory: {}
+"""
+
+
+def main():
+  # Make sure BASE_DIR ends with tensorflow.  If it doesn't, we probably
+  # computed the wrong directory.
+  if os.path.split(BASE_DIR)[-1] != 'tensorflow':
+    raise AssertionError(
+        "BASE_DIR = '%s' doesn't end with tensorflow" % BASE_DIR)
+
+  for dirpath, dirnames, filenames in os.walk(BASE_DIR, followlinks=True):
+    lowercase_directories = [x.lower() for x in dirnames]
+    lowercase_files = [x.lower() for x in filenames]
+
+    lowercase_dir_contents = lowercase_directories + lowercase_files
+    if len(lowercase_dir_contents) != len(set(lowercase_dir_contents)):
+      raise AssertionError(ERROR_MESSAGE.format(dirpath))
+
+
+if __name__ == '__main__':
+  main()
diff --git a/tensorflow/tools/test/run_and_gather_logs_lib.py b/tensorflow/tools/test/run_and_gather_logs_lib.py
index a953ed1b53d13504f92d2ffeb4c1ac6bcb0b8477..3b4921bb983a72223b092d99eb3fb59332fc6345 100644
--- a/tensorflow/tools/test/run_and_gather_logs_lib.py
+++ b/tensorflow/tools/test/run_and_gather_logs_lib.py
@@ -136,7 +136,7 @@ def run_and_gather_logs(name, test_name, test_args,
   gpu_config = gpu_info_lib.gather_gpu_devices()
   if gpu_config:
     gpu_name = gpu_config[0].model
-    gpu_short_name_match = re.search(r"Tesla (K40|K80|P100)", gpu_name)
+    gpu_short_name_match = re.search(r"Tesla (K40|K80|P100|V100)", gpu_name)
     if gpu_short_name_match:
       gpu_short_name = gpu_short_name_match.group(0)
       test_adjusted_name = name + "|" + gpu_short_name.replace(" ", "_")
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 9145d9e58a3df6c074d5ac44a665a33339c45cc6..b6bba78401d043d0a71e9c8089963ef20a13bbb2 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -1,6 +1,7 @@
 # TensorFlow external dependencies that can be loaded in WORKSPACE files.
 
 load("//third_party/gpus:cuda_configure.bzl", "cuda_configure")
+load("//third_party/tensorrt:tensorrt_configure.bzl", "tensorrt_configure")
 load("//third_party/mkl:build_defs.bzl", "mkl_repository")
 load("//third_party/git:git_configure.bzl", "git_configure")
 load("//third_party/py:python_configure.bzl", "python_configure")
@@ -68,6 +69,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   check_bazel_version_at_least("0.5.4")
   clang6_configure(name="local_config_clang6")
   cuda_configure(name="local_config_cuda")
+  tensorrt_configure(name="local_config_tensorrt")
   git_configure(name="local_config_git")
   sycl_configure(name="local_config_sycl")
   python_configure(name="local_config_python")
@@ -112,6 +114,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
      sha256 = "5996380e3e8b981f55d1c8d58e709c00dbb4806ba367be75d0925a68cc2f6478",
      strip_prefix = "abseil-cpp-720c017e30339fd1786ce4aac68bc8559736e53f",
+     build_file = str(Label("//third_party:com_google_absl.BUILD")),
   )
 
   tf_http_archive(
@@ -350,16 +353,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   tf_http_archive(
       name = "protobuf_archive",
       urls = [
-          "https://mirror.bazel.build/github.com/google/protobuf/archive/b04e5cba356212e4e8c66c61bbe0c3a20537c5b9.tar.gz",
-          "https://github.com/google/protobuf/archive/b04e5cba356212e4e8c66c61bbe0c3a20537c5b9.tar.gz",
+          "https://mirror.bazel.build/github.com/google/protobuf/archive/396336eb961b75f03b25824fe86cf6490fb75e3a.tar.gz",
+          "https://github.com/google/protobuf/archive/396336eb961b75f03b25824fe86cf6490fb75e3a.tar.gz",
       ],
-      sha256 = "e178a25c52efcb6b05988bdbeace4c0d3f2d2fe5b46696d1d9898875c3803d6a",
-      strip_prefix = "protobuf-b04e5cba356212e4e8c66c61bbe0c3a20537c5b9",
-      # TODO: remove patching when tensorflow stops linking same protos into
-      #       multiple shared libraries loaded in runtime by python.
-      #       This patch fixes a runtime crash when tensorflow is compiled
-      #       with clang -O2 on Linux (see https://github.com/tensorflow/tensorflow/issues/8394)
-      patch_file = str(Label("//third_party/protobuf:add_noinlines.patch")),
+      sha256 = "846d907acf472ae233ec0882ef3a2d24edbbe834b80c305e867ac65a1f2c59e3",
+      strip_prefix = "protobuf-396336eb961b75f03b25824fe86cf6490fb75e3a",
   )
 
   # We need to import the protobuf library under the names com_google_protobuf
@@ -368,21 +366,21 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   tf_http_archive(
       name = "com_google_protobuf",
       urls = [
-          "https://mirror.bazel.build/github.com/google/protobuf/archive/b04e5cba356212e4e8c66c61bbe0c3a20537c5b9.tar.gz",
-          "https://github.com/google/protobuf/archive/b04e5cba356212e4e8c66c61bbe0c3a20537c5b9.tar.gz",
+          "https://mirror.bazel.build/github.com/google/protobuf/archive/396336eb961b75f03b25824fe86cf6490fb75e3a.tar.gz",
+          "https://github.com/google/protobuf/archive/396336eb961b75f03b25824fe86cf6490fb75e3a.tar.gz",
       ],
-      sha256 = "e178a25c52efcb6b05988bdbeace4c0d3f2d2fe5b46696d1d9898875c3803d6a",
-      strip_prefix = "protobuf-b04e5cba356212e4e8c66c61bbe0c3a20537c5b9",
+      sha256 = "846d907acf472ae233ec0882ef3a2d24edbbe834b80c305e867ac65a1f2c59e3",
+      strip_prefix = "protobuf-396336eb961b75f03b25824fe86cf6490fb75e3a",
   )
 
   tf_http_archive(
       name = "com_google_protobuf_cc",
       urls = [
-          "https://mirror.bazel.build/github.com/google/protobuf/archive/b04e5cba356212e4e8c66c61bbe0c3a20537c5b9.tar.gz",
-          "https://github.com/google/protobuf/archive/b04e5cba356212e4e8c66c61bbe0c3a20537c5b9.tar.gz",
+          "https://mirror.bazel.build/github.com/google/protobuf/archive/396336eb961b75f03b25824fe86cf6490fb75e3a.tar.gz",
+          "https://github.com/google/protobuf/archive/396336eb961b75f03b25824fe86cf6490fb75e3a.tar.gz",
       ],
-      sha256 = "e178a25c52efcb6b05988bdbeace4c0d3f2d2fe5b46696d1d9898875c3803d6a",
-      strip_prefix = "protobuf-b04e5cba356212e4e8c66c61bbe0c3a20537c5b9",
+      sha256 = "846d907acf472ae233ec0882ef3a2d24edbbe834b80c305e867ac65a1f2c59e3",
+      strip_prefix = "protobuf-396336eb961b75f03b25824fe86cf6490fb75e3a",
   )
 
   tf_http_archive(
@@ -475,11 +473,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   tf_http_archive(
       name = "llvm",
       urls = [
-          "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/11a2ca6eea8a7fe240a14c0c35fd2017341279be.tar.gz",
-          "https://github.com/llvm-mirror/llvm/archive/11a2ca6eea8a7fe240a14c0c35fd2017341279be.tar.gz",
+          "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/f135378ec6365e852f7d5a3cfcdce342f08cb5f3.tar.gz",
+          "https://github.com/llvm-mirror/llvm/archive/f135378ec6365e852f7d5a3cfcdce342f08cb5f3.tar.gz",
       ],
-      sha256 = "b5429ccf8d57273cb8489714f728c997cd720ec66fc2c0292422ab8f0e729ce0",
-      strip_prefix = "llvm-11a2ca6eea8a7fe240a14c0c35fd2017341279be",
+      sha256 = "296ab832167e6c46eb65ef1f9a2b5fc31c77fcd2248799b306aa2d5d2e4edbfe",
+      strip_prefix = "llvm-f135378ec6365e852f7d5a3cfcdce342f08cb5f3",
       build_file = str(Label("//third_party/llvm:llvm.BUILD")),
   )
 
@@ -558,6 +556,18 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       build_file = str(Label("//third_party:nccl.BUILD")),
   )
 
+  tf_http_archive(
+      name = "kafka",
+      urls = [
+          "https://mirror.bazel.build/github.com/edenhill/librdkafka/archive/v0.11.1.tar.gz",
+          "https://github.com/edenhill/librdkafka/archive/v0.11.1.tar.gz",
+      ],
+      sha256 = "dd035d57c8f19b0b612dd6eefe6e5eebad76f506e302cccb7c2066f25a83585e",
+      strip_prefix = "librdkafka-0.11.1",
+      build_file = str(Label("//third_party:kafka/BUILD")),
+      patch_file = str(Label("//third_party/kafka:config.patch")),
+  )
+
   tf_http_archive(
       name = "aws",
       urls = [
diff --git a/third_party/com_google_absl.BUILD b/third_party/com_google_absl.BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..0c8d327c1f3294dfdcb85195b06724b6c811548b
--- /dev/null
+++ b/third_party/com_google_absl.BUILD
@@ -0,0 +1,17 @@
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache
+
+exports_files(["LICENSE"])
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/third_party/flatbuffers/flatbuffers.BUILD b/third_party/flatbuffers/flatbuffers.BUILD
index f6b8e6ddb05e67a4bb4833a3bba6db3cbd4c79e0..824c97be60e7ef148a363b964ed330ba3c5fcb0c 100644
--- a/third_party/flatbuffers/flatbuffers.BUILD
+++ b/third_party/flatbuffers/flatbuffers.BUILD
@@ -4,6 +4,8 @@ package(
 
 licenses(["notice"])  # Apache 2.0
 
+exports_files(["LICENSE.txt"])
+
 config_setting(
     name = "freebsd",
     values = {"cpu": "freebsd"},
diff --git a/third_party/gast.BUILD b/third_party/gast.BUILD
index 06db528ada27e2f26f6de48c1ce6e9348ce09173..4866982e1fda6d6f19e575c8b0c0273cb9de154b 100644
--- a/third_party/gast.BUILD
+++ b/third_party/gast.BUILD
@@ -3,7 +3,7 @@
 
 licenses(["notice"])  # BSD 3-clause
 
-exports_files(["LICENSE"])
+exports_files(["PKG-INFO"])
 
 py_library(
     name = "gast",
diff --git a/third_party/gpus/cuda_configure.bzl b/third_party/gpus/cuda_configure.bzl
index 2727fa5efe5329b5e5b437b48d2b340b12ea5b10..255ae0119095ee17babc00f43e93fa4c4931c1fb 100644
--- a/third_party/gpus/cuda_configure.bzl
+++ b/third_party/gpus/cuda_configure.bzl
@@ -236,7 +236,7 @@ def _cudnn_install_basedir(repository_ctx):
   return cudnn_install_path
 
 
-def _matches_version(environ_version, detected_version):
+def matches_version(environ_version, detected_version):
   """Checks whether the user-specified version matches the detected version.
 
   This function performs a weak matching so that if the user specifies only the
@@ -317,7 +317,7 @@ def _cuda_version(repository_ctx, cuda_toolkit_path, cpu_value):
   environ_version = ""
   if _TF_CUDA_VERSION in repository_ctx.os.environ:
     environ_version = repository_ctx.os.environ[_TF_CUDA_VERSION].strip()
-  if environ_version and not _matches_version(environ_version, full_version):
+  if environ_version and not matches_version(environ_version, full_version):
     auto_configure_fail(
         ("CUDA version detected from nvcc (%s) does not match " +
          "TF_CUDA_VERSION (%s)") % (full_version, environ_version))
@@ -338,35 +338,49 @@ _DEFINE_CUDNN_MINOR = "#define CUDNN_MINOR"
 _DEFINE_CUDNN_PATCHLEVEL = "#define CUDNN_PATCHLEVEL"
 
 
-def _find_cuda_define(repository_ctx, cudnn_header_dir, define):
-  """Returns the value of a #define in cudnn.h
+def find_cuda_define(repository_ctx, header_dir, header_file, define):
+  """Returns the value of a #define in a header file.
 
-  Greps through cudnn.h and returns the value of the specified #define. If the
-  #define is not found, then raise an error.
+  Greps through a header file and returns the value of the specified #define.
+  If the #define is not found, then raise an error.
 
   Args:
     repository_ctx: The repository context.
-    cudnn_header_dir: The directory containing the cuDNN header.
+    header_dir: The directory containing the header file.
+    header_file: The header file name.
     define: The #define to search for.
 
   Returns:
-    The value of the #define found in cudnn.h.
+    The value of the #define found in the header.
   """
-  # Confirm location of cudnn.h and grep for the line defining CUDNN_MAJOR.
-  cudnn_h_path = repository_ctx.path("%s/cudnn.h" % cudnn_header_dir)
-  if not cudnn_h_path.exists:
-    auto_configure_fail("Cannot find cudnn.h at %s" % str(cudnn_h_path))
-  result = repository_ctx.execute(["grep", "--color=never", "-E", define, str(cudnn_h_path)])
+  # Confirm location of the header and grep for the line defining the macro.
+  h_path = repository_ctx.path("%s/%s" % (header_dir, header_file))
+  if not h_path.exists:
+    auto_configure_fail("Cannot find %s at %s" % (header_file, str(h_path)))
+  result = repository_ctx.execute(
+      # Grep one more lines as some #defines are splitted into two lines.
+      ["grep", "--color=never", "-A1", "-E", define, str(h_path)])
   if result.stderr:
-    auto_configure_fail("Error reading %s: %s" %
-                        (result.stderr, str(cudnn_h_path)))
+    auto_configure_fail("Error reading %s: %s" % (str(h_path), result.stderr))
 
-  # Parse the cuDNN major version from the line defining CUDNN_MAJOR
-  lines = result.stdout.splitlines()
-  if len(lines) == 0 or lines[0].find(define) == -1:
+  # Parse the version from the line defining the macro.
+  if result.stdout.find(define) == -1:
     auto_configure_fail("Cannot find line containing '%s' in %s" %
-                        (define, str(cudnn_h_path)))
-  return lines[0].replace(define, "").strip()
+                        (define, h_path))
+  version = result.stdout
+  # Remove the new line and '\' character if any.
+  version = version.replace("\\", " ")
+  version = version.replace("\n", " ")
+  version = version.replace(define, "").lstrip()
+  # Remove the code after the version number.
+  version_end = version.find(" ")
+  if version_end != -1:
+    if version_end == 0:
+      auto_configure_fail(
+          "Cannot extract the version from line containing '%s' in %s" %
+          (define, str(h_path)))
+    version = version[:version_end].strip()
+  return version
 
 
 def _cudnn_version(repository_ctx, cudnn_install_basedir, cpu_value):
@@ -382,12 +396,12 @@ def _cudnn_version(repository_ctx, cudnn_install_basedir, cpu_value):
   """
   cudnn_header_dir = _find_cudnn_header_dir(repository_ctx,
                                             cudnn_install_basedir)
-  major_version = _find_cuda_define(repository_ctx, cudnn_header_dir,
-                                    _DEFINE_CUDNN_MAJOR)
-  minor_version = _find_cuda_define(repository_ctx, cudnn_header_dir,
-                                    _DEFINE_CUDNN_MINOR)
-  patch_version = _find_cuda_define(repository_ctx, cudnn_header_dir,
-                                    _DEFINE_CUDNN_PATCHLEVEL)
+  major_version = find_cuda_define(
+      repository_ctx, cudnn_header_dir, "cudnn.h", _DEFINE_CUDNN_MAJOR)
+  minor_version = find_cuda_define(
+      repository_ctx, cudnn_header_dir, "cudnn.h", _DEFINE_CUDNN_MINOR)
+  patch_version = find_cuda_define(
+      repository_ctx, cudnn_header_dir, "cudnn.h", _DEFINE_CUDNN_PATCHLEVEL)
   full_version = "%s.%s.%s" % (major_version, minor_version, patch_version)
 
   # Check whether TF_CUDNN_VERSION was set by the user and fail if it does not
@@ -395,7 +409,7 @@ def _cudnn_version(repository_ctx, cudnn_install_basedir, cpu_value):
   environ_version = ""
   if _TF_CUDNN_VERSION in repository_ctx.os.environ:
     environ_version = repository_ctx.os.environ[_TF_CUDNN_VERSION].strip()
-  if environ_version and not _matches_version(environ_version, full_version):
+  if environ_version and not matches_version(environ_version, full_version):
     cudnn_h_path = repository_ctx.path("%s/include/cudnn.h" %
                                        cudnn_install_basedir)
     auto_configure_fail(
@@ -427,7 +441,7 @@ def _compute_capabilities(repository_ctx):
   return capabilities
 
 
-def _cpu_value(repository_ctx):
+def get_cpu_value(repository_ctx):
   """Returns the name of the host operating system.
 
   Args:
@@ -447,7 +461,7 @@ def _cpu_value(repository_ctx):
 
 def _is_windows(repository_ctx):
   """Returns true if the host operating system is windows."""
-  return _cpu_value(repository_ctx) == "Windows"
+  return get_cpu_value(repository_ctx) == "Windows"
 
 def _lib_name(lib, cpu_value, version="", static=False):
   """Constructs the platform-specific name of a library.
@@ -582,11 +596,8 @@ def _find_libs(repository_ctx, cuda_config):
     cuda_config: The CUDA config as returned by _get_cuda_config
 
   Returns:
-    Map of library names to structs of filename and path as returned by
-    _find_cuda_lib and _find_cupti_lib.
+    Map of library names to structs of filename and path.
   """
-  cudnn_version = cuda_config.cudnn_version
-  cudnn_ext = ".%s" % cudnn_version if cudnn_version else ""
   cpu_value = cuda_config.cpu_value
   return {
       "cuda": _find_cuda_lib("cuda", repository_ctx, cpu_value, cuda_config.cuda_toolkit_path),
@@ -611,7 +622,7 @@ def _find_libs(repository_ctx, cuda_config):
       "cudnn": _find_cuda_lib(
           "cudnn", repository_ctx, cpu_value, cuda_config.cudnn_install_basedir,
           cuda_config.cudnn_version),
-      "cupti": _find_cupti_lib(repository_ctx, cuda_config),
+      "cupti": _find_cupti_lib(repository_ctx, cuda_config)
   }
 
 
@@ -654,7 +665,7 @@ def _get_cuda_config(repository_ctx):
       compute_capabilities: A list of the system's CUDA compute capabilities.
       cpu_value: The name of the host operating system.
   """
-  cpu_value = _cpu_value(repository_ctx)
+  cpu_value = get_cpu_value(repository_ctx)
   cuda_toolkit_path = _cuda_toolkit_path(repository_ctx)
   cuda_version = _cuda_version(repository_ctx, cuda_toolkit_path, cpu_value)
   cudnn_install_basedir = _cudnn_install_basedir(repository_ctx)
@@ -712,13 +723,13 @@ error_gpu_disabled()
 
 
 def _create_dummy_repository(repository_ctx):
-  cpu_value = _cpu_value(repository_ctx)
+  cpu_value = get_cpu_value(repository_ctx)
 
   # Set up BUILD file for cuda/.
   _tpl(repository_ctx, "cuda:build_defs.bzl",
        {
            "%{cuda_is_configured}": "False",
-           "%{cuda_extra_copts}": "[]"
+           "%{cuda_extra_copts}": "[]",
        })
   _tpl(repository_ctx, "cuda:BUILD",
        {
@@ -805,8 +816,8 @@ def _norm_path(path):
   return path
 
 
-def _symlink_genrule_for_dir(repository_ctx, src_dir, dest_dir, genrule_name,
-    src_files = [], dest_files = []):
+def symlink_genrule_for_dir(repository_ctx, src_dir, dest_dir, genrule_name,
+                            src_files = [], dest_files = []):
   """Returns a genrule to symlink(or copy if on Windows) a set of files.
 
   If src_dir is passed, files will be read from the given directory; otherwise
@@ -815,7 +826,7 @@ def _symlink_genrule_for_dir(repository_ctx, src_dir, dest_dir, genrule_name,
   if src_dir != None:
     src_dir = _norm_path(src_dir)
     dest_dir = _norm_path(dest_dir)
-    files = _read_dir(repository_ctx, src_dir)
+    files = '\n'.join(sorted(_read_dir(repository_ctx, src_dir).splitlines()))
     # Create a list with the src_dir stripped to use for outputs.
     dest_files = files.replace(src_dir, '').splitlines()
     src_files = files.splitlines()
@@ -913,11 +924,11 @@ def _create_local_cuda_repository(repository_ctx):
   # cuda_toolkit_path
   cuda_toolkit_path = cuda_config.cuda_toolkit_path
   cuda_include_path = cuda_toolkit_path + "/include"
-  genrules = [_symlink_genrule_for_dir(repository_ctx,
+  genrules = [symlink_genrule_for_dir(repository_ctx,
       cuda_include_path, "cuda/include", "cuda-include")]
-  genrules.append(_symlink_genrule_for_dir(repository_ctx,
+  genrules.append(symlink_genrule_for_dir(repository_ctx,
       cuda_toolkit_path + "/nvvm", "cuda/nvvm", "cuda-nvvm"))
-  genrules.append(_symlink_genrule_for_dir(repository_ctx,
+  genrules.append(symlink_genrule_for_dir(repository_ctx,
       cuda_toolkit_path + "/extras/CUPTI/include",
       "cuda/extras/CUPTI/include", "cuda-extras"))
 
@@ -927,15 +938,15 @@ def _create_local_cuda_repository(repository_ctx):
   for lib in cuda_libs.values():
     cuda_lib_src.append(lib.path)
     cuda_lib_dest.append("cuda/lib/" + lib.file_name)
-  genrules.append(_symlink_genrule_for_dir(repository_ctx, None, "", "cuda-lib",
-                                       cuda_lib_src, cuda_lib_dest))
+  genrules.append(symlink_genrule_for_dir(repository_ctx, None, "", "cuda-lib",
+                                          cuda_lib_src, cuda_lib_dest))
 
-  # Set up the symbolic links for cudnn if cudnn was was not installed to
+  # Set up the symbolic links for cudnn if cndnn was not installed to
   # CUDA_TOOLKIT_PATH.
   included_files = _read_dir(repository_ctx, cuda_include_path).replace(
       cuda_include_path, '').splitlines()
   if '/cudnn.h' not in included_files:
-    genrules.append(_symlink_genrule_for_dir(repository_ctx, None,
+    genrules.append(symlink_genrule_for_dir(repository_ctx, None,
         "cuda/include/", "cudnn-include", [cudnn_header_dir + "/cudnn.h"],
         ["cudnn.h"]))
   else:
@@ -952,7 +963,6 @@ def _create_local_cuda_repository(repository_ctx):
            "%{cuda_is_configured}": "True",
            "%{cuda_extra_copts}": _compute_cuda_extra_copts(
                repository_ctx, cuda_config.compute_capabilities),
-
        })
   _tpl(repository_ctx, "cuda:BUILD",
        {
diff --git a/third_party/jpeg/jpeg.BUILD b/third_party/jpeg/jpeg.BUILD
index 37924125cf8f9d2fde6a66478432a35826966974..ca2d38d6878cee81b29e949175c5133f492cf06b 100644
--- a/third_party/jpeg/jpeg.BUILD
+++ b/third_party/jpeg/jpeg.BUILD
@@ -34,6 +34,10 @@ libjpegturbo_copts = select({
         "-mfloat-abi=softfp",
         "-fprefetch-loop-arrays",
     ],
+    ":linux_ppc64le": [
+        "-mcpu=power8",
+        "-mtune=power8",
+    ],
     "//conditions:default": [],
 })
 
@@ -123,10 +127,50 @@ cc_library(
         ":k8": [":simd_x86_64"],
         ":armeabi-v7a": [":simd_armv7a"],
         ":arm64-v8a": [":simd_armv8a"],
+        ":linux_ppc64le": [":simd_altivec"],
         "//conditions:default": [":simd_none"],
     }),
 )
 
+cc_library(
+    name = "simd_altivec",
+    srcs = [
+        "jchuff.h",
+        "jconfig.h",
+        "jdct.h",
+        "jerror.h",
+        "jinclude.h",
+        "jmorecfg.h",
+        "jpegint.h",
+        "jpeglib.h",
+        "jsimd.h",
+        "jsimddct.h",
+        "simd/jsimd.h",
+        "simd/jccolor-altivec.c",
+        "simd/jcgray-altivec.c",
+        "simd/jcsample-altivec.c",
+        "simd/jdcolor-altivec.c",
+        "simd/jdmerge-altivec.c",
+        "simd/jdsample-altivec.c",
+        "simd/jfdctfst-altivec.c",
+        "simd/jfdctint-altivec.c",
+        "simd/jidctfst-altivec.c",
+        "simd/jidctint-altivec.c",
+        "simd/jquanti-altivec.c",
+        "simd/jsimd_powerpc.c",
+        "simd/jsimd_altivec.h",
+        "simd/jcsample.h",
+    ],
+    hdrs = [
+      "simd/jdmrgext-altivec.c",  # should have been named .inc
+      "simd/jccolext-altivec.c",  # should have been named .inc
+      "simd/jcgryext-altivec.c",  # should have been named .inc
+      "simd/jdcolext-altivec.c",  # should have been named .inc
+    ],
+    copts = libjpegturbo_copts,
+    nocopts = libjpegturbo_nocopts,
+)
+
 cc_library(
     name = "simd_x86_64",
     srcs = [
@@ -381,6 +425,7 @@ genrule(
         ":k8": "cp $(location jconfig_nowin_simd.h) $@",
         ":armeabi-v7a": "cp $(location jconfig_nowin_simd.h) $@",
         ":arm64-v8a": "cp $(location jconfig_nowin_simd.h) $@",
+        ":linux_ppc64le": "cp $(location jconfig_nowin_simd.h) $@",
         "//conditions:default": "cp $(location jconfig_nowin_nosimd.h) $@",
     }),
 )
@@ -498,3 +543,9 @@ config_setting(
     name = "windows_msvc",
     values = {"cpu": "x64_windows_msvc"},
 )
+
+config_setting(
+   name = "linux_ppc64le",
+   values = {"cpu": "ppc"},
+
+)
diff --git a/third_party/kafka/BUILD b/third_party/kafka/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..a61a9e1f6c2b29ad3b992e810c0cab463dfd7feb
--- /dev/null
+++ b/third_party/kafka/BUILD
@@ -0,0 +1,147 @@
+# Description:
+#   Kafka C/C++ (librdkafka) client library
+
+licenses(["notice"])  # 2-clause BSD license
+
+exports_files(["LICENSE"])
+
+cc_library(
+    name = "kafka",
+    srcs = [
+        "config.h",
+        "src-cpp/ConfImpl.cpp",
+        "src-cpp/ConsumerImpl.cpp",
+        "src-cpp/HandleImpl.cpp",
+        "src-cpp/KafkaConsumerImpl.cpp",
+        "src-cpp/MessageImpl.cpp",
+        "src-cpp/MetadataImpl.cpp",
+        "src-cpp/QueueImpl.cpp",
+        "src-cpp/RdKafka.cpp",
+        "src-cpp/TopicImpl.cpp",
+        "src-cpp/TopicPartitionImpl.cpp",
+        "src/crc32c.c",
+        "src/crc32c.h",
+        "src/lz4.c",
+        "src/lz4.h",
+        "src/lz4frame.c",
+        "src/lz4frame.h",
+        "src/lz4frame_static.h",
+        "src/lz4hc.c",
+        "src/lz4hc.h",
+        "src/lz4opt.h",
+        "src/queue.h",
+        "src/rd.h",
+        "src/rdaddr.c",
+        "src/rdaddr.h",
+        "src/rdatomic.h",
+        "src/rdavg.h",
+        "src/rdavl.c",
+        "src/rdavl.h",
+        "src/rdbuf.c",
+        "src/rdbuf.h",
+        "src/rdcrc32.h",
+        "src/rddl.h",
+        "src/rdendian.h",
+        "src/rdgz.c",
+        "src/rdgz.h",
+        "src/rdinterval.h",
+        "src/rdkafka.c",
+        "src/rdkafka.h",
+        "src/rdkafka_assignor.c",
+        "src/rdkafka_assignor.h",
+        "src/rdkafka_broker.c",
+        "src/rdkafka_broker.h",
+        "src/rdkafka_buf.c",
+        "src/rdkafka_buf.h",
+        "src/rdkafka_cgrp.c",
+        "src/rdkafka_cgrp.h",
+        "src/rdkafka_conf.c",
+        "src/rdkafka_conf.h",
+        "src/rdkafka_event.h",
+        "src/rdkafka_feature.c",
+        "src/rdkafka_feature.h",
+        "src/rdkafka_int.h",
+        "src/rdkafka_interceptor.c",
+        "src/rdkafka_interceptor.h",
+        "src/rdkafka_lz4.c",
+        "src/rdkafka_lz4.h",
+        "src/rdkafka_metadata.c",
+        "src/rdkafka_metadata.h",
+        "src/rdkafka_metadata_cache.c",
+        "src/rdkafka_msg.c",
+        "src/rdkafka_msg.h",
+        "src/rdkafka_msgset.h",
+        "src/rdkafka_msgset_reader.c",
+        "src/rdkafka_msgset_writer.c",
+        "src/rdkafka_offset.c",
+        "src/rdkafka_offset.h",
+        "src/rdkafka_op.c",
+        "src/rdkafka_op.h",
+        "src/rdkafka_partition.c",
+        "src/rdkafka_partition.h",
+        "src/rdkafka_pattern.c",
+        "src/rdkafka_pattern.h",
+        "src/rdkafka_proto.h",
+        "src/rdkafka_queue.c",
+        "src/rdkafka_queue.h",
+        "src/rdkafka_range_assignor.c",
+        "src/rdkafka_request.c",
+        "src/rdkafka_request.h",
+        "src/rdkafka_roundrobin_assignor.c",
+        "src/rdkafka_sasl.c",
+        "src/rdkafka_sasl.h",
+        "src/rdkafka_sasl_int.h",
+        "src/rdkafka_sasl_plain.c",
+        "src/rdkafka_subscription.c",
+        "src/rdkafka_subscription.h",
+        "src/rdkafka_timer.c",
+        "src/rdkafka_timer.h",
+        "src/rdkafka_topic.c",
+        "src/rdkafka_topic.h",
+        "src/rdkafka_transport.c",
+        "src/rdkafka_transport.h",
+        "src/rdkafka_transport_int.h",
+        "src/rdlist.c",
+        "src/rdlist.h",
+        "src/rdlog.c",
+        "src/rdlog.h",
+        "src/rdports.c",
+        "src/rdports.h",
+        "src/rdposix.h",
+        "src/rdrand.c",
+        "src/rdrand.h",
+        "src/rdregex.c",
+        "src/rdregex.h",
+        "src/rdstring.c",
+        "src/rdstring.h",
+        "src/rdsysqueue.h",
+        "src/rdtime.h",
+        "src/rdtypes.h",
+        "src/rdunittest.c",
+        "src/rdunittest.h",
+        "src/rdvarint.c",
+        "src/rdvarint.h",
+        "src/snappy.c",
+        "src/snappy.h",
+        "src/tinycthread.c",
+        "src/tinycthread.h",
+        "src/xxhash.c",
+        "src/xxhash.h",
+    ],
+    hdrs = [
+        "config.h",
+    ],
+    defines = [
+    ],
+    includes = [
+        "src",
+        "src-cpp",
+    ],
+    linkopts = [
+        "-lpthread",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        "@boringssl//:ssl",
+    ],
+)
diff --git a/third_party/kafka/config.patch b/third_party/kafka/config.patch
new file mode 100644
index 0000000000000000000000000000000000000000..fa5c2d35b408d4ef70835a41ade3ae6ec4823c0e
--- /dev/null
+++ b/third_party/kafka/config.patch
@@ -0,0 +1,44 @@
+diff -Naur a/config.h b/config.h
+--- a/config.h	1970-01-01 00:00:00.000000000 +0000
++++ b/config.h	2017-10-28 00:57:03.316957390 +0000
+@@ -0,0 +1,40 @@
++#pragma once
++#define WITHOUT_OPTIMIZATION 0
++#define ENABLE_DEVEL 0
++#define ENABLE_REFCNT_DEBUG 0
++#define ENABLE_SHAREDPTR_DEBUG 0
++
++#define HAVE_ATOMICS_32 1
++#define HAVE_ATOMICS_32_SYNC 1
++
++#if (HAVE_ATOMICS_32)
++# if (HAVE_ATOMICS_32_SYNC)
++#  define ATOMIC_OP32(OP1,OP2,PTR,VAL) __sync_ ## OP1 ## _and_ ## OP2(PTR, VAL)
++# else
++#  define ATOMIC_OP32(OP1,OP2,PTR,VAL) __atomic_ ## OP1 ## _ ## OP2(PTR, VAL, __ATOMIC_SEQ_CST)
++# endif
++#endif
++
++#define HAVE_ATOMICS_64 1
++#define HAVE_ATOMICS_64_SYNC 1
++
++#if (HAVE_ATOMICS_64)
++# if (HAVE_ATOMICS_64_SYNC)
++#  define ATOMIC_OP64(OP1,OP2,PTR,VAL) __sync_ ## OP1 ## _and_ ## OP2(PTR, VAL)
++# else
++#  define ATOMIC_OP64(OP1,OP2,PTR,VAL) __atomic_ ## OP1 ## _ ## OP2(PTR, VAL, __ATOMIC_SEQ_CST)
++# endif
++#endif
++
++
++#define WITH_ZLIB 1
++#define WITH_LIBDL 1
++#define WITH_PLUGINS 0
++#define WITH_SNAPPY 1
++#define WITH_SOCKEM 1
++#define WITH_SSL 1
++#define WITH_SASL 0
++#define WITH_SASL_SCRAM 0
++#define WITH_SASL_CYRUS 0
++#define HAVE_REGEX 1
++#define HAVE_STRNDUP 1
diff --git a/third_party/llvm/llvm.BUILD b/third_party/llvm/llvm.BUILD
index 5344525ba8b42e8a3dbcf42397458d190a77f9d3..a9e1341a03c2e725e96bd7c8cbd7b09853bb8af4 100644
--- a/third_party/llvm/llvm.BUILD
+++ b/third_party/llvm/llvm.BUILD
@@ -670,6 +670,28 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "aggressive_inst_combine",
+    srcs = glob([
+        "lib/Transforms/AggressiveInstCombine/*.c",
+        "lib/Transforms/AggressiveInstCombine/*.cpp",
+        "lib/Transforms/AggressiveInstCombine/*.inc",
+        "lib/Transforms/AggressiveInstCombine/*.h",
+    ]),
+    hdrs = glob([
+        "include/llvm/Transforms/AggressiveInstCombine/*.h",
+        "include/llvm/Transforms/AggressiveInstCombine/*.def",
+        "include/llvm/Transforms/AggressiveInstCombine/*.inc",
+    ]),
+    deps = [
+        ":analysis",
+        ":config",
+        ":core",
+        ":support",
+        ":transform_utils",
+    ],
+)
+
 cc_library(
     name = "analysis",
     srcs = glob([
@@ -1405,6 +1427,7 @@ cc_library(
         "include/llvm/Transforms/IPO/*.inc",
     ]),
     deps = [
+        ":aggressive_inst_combine",
         ":analysis",
         ":bit_reader",
         ":bit_writer",
@@ -1931,6 +1954,7 @@ cc_library(
         "include/llvm/Transforms/IPO/SCCP.h",
     ]),
     deps = [
+        ":aggressive_inst_combine",
         ":analysis",
         ":config",
         ":core",
diff --git a/third_party/pcre.BUILD b/third_party/pcre.BUILD
index e2cdec40295d369548ff26e3493b5d2300041916..3a8e7a10b43debb5eeca690a64d5795de998a3ac 100644
--- a/third_party/pcre.BUILD
+++ b/third_party/pcre.BUILD
@@ -1,6 +1,6 @@
 licenses(["notice"])  # BSD
 
-exports_files(["COPYING"])
+exports_files(["LICENCE"])
 
 cc_library(
     name = "pcre",
diff --git a/third_party/protobuf/add_noinlines.patch b/third_party/protobuf/add_noinlines.patch
deleted file mode 100644
index af74798f0678d84d26681e947cef416a79090aa0..0000000000000000000000000000000000000000
--- a/third_party/protobuf/add_noinlines.patch
+++ /dev/null
@@ -1,30 +0,0 @@
-diff -u -r a/src/google/protobuf/compiler/cpp/cpp_file.cc b/src/google/protobuf/compiler/cpp/cpp_file.cc
---- a/src/google/protobuf/compiler/cpp/cpp_file.cc	2017-02-10 23:55:34.000000000 +0100
-+++ b/src/google/protobuf/compiler/cpp/cpp_file.cc	2017-03-21 13:41:46.931979154 +0100
-@@ -557,7 +557,7 @@
-         "      $metadata$, $enum_descriptors$, $service_descriptors$);\n"
-         "}\n"
-         "\n"
--        "void protobuf_AssignDescriptorsOnce() {\n"
-+        "GOOGLE_ATTRIBUTE_NOINLINE void protobuf_AssignDescriptorsOnce() {\n"
-         "  static GOOGLE_PROTOBUF_DECLARE_ONCE(once);\n"
-         "  ::google::protobuf::GoogleOnceInit(&once, &protobuf_AssignDescriptors);\n"
-         "}\n"
-@@ -656,7 +656,7 @@
-   printer->Print(
-       "}\n"
-       "\n"
--      "void InitDefaults() {\n"
-+      "GOOGLE_ATTRIBUTE_NOINLINE void InitDefaults() {\n"
-       "  static GOOGLE_PROTOBUF_DECLARE_ONCE(once);\n"
-       "  ::google::protobuf::GoogleOnceInit(&once, &TableStruct::InitDefaultsImpl);\n"
-       "}\n");
-@@ -737,7 +737,7 @@
-   printer->Print(
-       "}\n"
-       "\n"
--      "void AddDescriptors() {\n"
-+      "GOOGLE_ATTRIBUTE_NOINLINE void AddDescriptors() {\n"
-       "  static GOOGLE_PROTOBUF_DECLARE_ONCE(once);\n"
-       "  ::google::protobuf::GoogleOnceInit(&once, &AddDescriptorsImpl);\n"
-       "}\n");
diff --git a/third_party/py/python_configure.bzl b/third_party/py/python_configure.bzl
index c16eb3a12a86f3c2eb3813f5c8c7631fec8e97c6..954f21f5f8fe8029c869f8870464a750cfc8a3db 100644
--- a/third_party/py/python_configure.bzl
+++ b/third_party/py/python_configure.bzl
@@ -118,7 +118,7 @@ def _symlink_genrule_for_dir(repository_ctx, src_dir, dest_dir, genrule_name,
   if src_dir != None:
     src_dir = _norm_path(src_dir)
     dest_dir = _norm_path(dest_dir)
-    files = _read_dir(repository_ctx, src_dir)
+    files = '\n'.join(sorted(_read_dir(repository_ctx, src_dir).splitlines()))
     # Create a list with the src_dir stripped to use for outputs.
     dest_files = files.replace(src_dir, '').splitlines()
     src_files = files.splitlines()
diff --git a/third_party/tensorrt/BUILD b/third_party/tensorrt/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/third_party/tensorrt/BUILD.tpl b/third_party/tensorrt/BUILD.tpl
new file mode 100644
index 0000000000000000000000000000000000000000..feaeb0bea63a982139d34f7c5f37a3f4c66d2af5
--- /dev/null
+++ b/third_party/tensorrt/BUILD.tpl
@@ -0,0 +1,67 @@
+# NVIDIA TensorRT
+# A high-performance deep learning inference optimizer and runtime.
+
+licenses(["notice"])
+
+load("@local_config_cuda//cuda:build_defs.bzl", "cuda_default_copts")
+
+package(default_visibility = ["//visibility:public"])
+
+cc_library(
+    name = "tensorrt_headers",
+    hdrs = [%{tensorrt_headers}],
+    includes = [
+        "include",
+    ],
+    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "nv_infer",
+    srcs = [%{nv_infer}],
+    data = [%{nv_infer}],
+    includes = [
+        "include",
+    ],
+    copts= cuda_default_copts(),
+    deps = [
+        "@local_config_cuda//cuda:cuda",
+        ":tensorrt_headers",
+    ],
+    linkstatic = 1,
+    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "nv_infer_plugin",
+    srcs = [%{nv_infer_plugin}],
+    data = [%{nv_infer_plugin}],
+    includes = [
+        "include",
+    ],
+    copts= cuda_default_copts(),
+    deps = [
+        "@local_config_cuda//cuda:cuda",
+        ":nv_infer",
+        ":tensorrt_headers",
+    ],
+    linkstatic = 1,
+    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "nv_parsers",
+    srcs = [%{nv_parsers}],
+    data = [%{nv_parsers}],
+    includes = [
+        "include",
+    ],
+    copts= cuda_default_copts(),
+    deps = [
+        ":tensorrt_headers",
+    ],
+    linkstatic = 1,
+    visibility = ["//visibility:public"],
+)
+
+%{tensorrt_genrules}
diff --git a/third_party/tensorrt/build_defs.bzl.tpl b/third_party/tensorrt/build_defs.bzl.tpl
new file mode 100644
index 0000000000000000000000000000000000000000..0dc3a7ba2d239cbeca5181ba20d0c98edb26bb94
--- /dev/null
+++ b/third_party/tensorrt/build_defs.bzl.tpl
@@ -0,0 +1,7 @@
+# Build configurations for TensorRT.
+
+def if_tensorrt(if_true, if_false=[]):
+  """Tests whether TensorRT was enabled during the configure process."""
+  if %{tensorrt_is_configured}:
+    return if_true
+  return if_false
diff --git a/third_party/tensorrt/tensorrt_configure.bzl b/third_party/tensorrt/tensorrt_configure.bzl
new file mode 100644
index 0000000000000000000000000000000000000000..8aa0f28f39d4dd8e24d5f162bf6535edbb030ee6
--- /dev/null
+++ b/third_party/tensorrt/tensorrt_configure.bzl
@@ -0,0 +1,224 @@
+# -*- Python -*-
+"""Repository rule for TensorRT configuration.
+
+`tensorrt_configure` depends on the following environment variables:
+
+  * `TF_TENSORRT_VERSION`: The TensorRT libnvinfer version.
+  * `TENSORRT_INSTALL_PATH`: The installation path of the TensorRT library.
+"""
+
+load(
+    "//third_party/gpus:cuda_configure.bzl",
+    "auto_configure_fail",
+    "get_cpu_value",
+    "find_cuda_define",
+    "matches_version",
+    "symlink_genrule_for_dir",
+)
+
+_TENSORRT_INSTALL_PATH = "TENSORRT_INSTALL_PATH"
+_TF_TENSORRT_VERSION = "TF_TENSORRT_VERSION"
+
+_TF_TENSORRT_LIBS = ["nvinfer", "nvinfer_plugin", "nvparsers"]
+_TF_TENSORRT_HEADERS = [
+    "NvInfer.h", "NvInferPlugin.h", "NvCaffeParser.h", "NvUffParser.h",
+    "NvUtils.h"
+]
+
+_DEFINE_TENSORRT_SONAME_MAJOR = "#define NV_TENSORRT_SONAME_MAJOR"
+_DEFINE_TENSORRT_SONAME_MINOR = "#define NV_TENSORRT_SONAME_MINOR"
+_DEFINE_TENSORRT_SONAME_PATCH = "#define NV_TENSORRT_SONAME_PATCH"
+
+
+def _headers_exist(repository_ctx, path):
+  """Returns whether all TensorRT header files could be found in 'path'.
+
+  Args:
+    repository_ctx: The repository context.
+    path: The TensorRT include path to check.
+
+  Returns:
+    True if all TensorRT header files can be found in the path.
+  """
+  for h in _TF_TENSORRT_HEADERS:
+    if not repository_ctx.path("%s/%s" % (path, h)).exists:
+      return False
+  return True
+
+
+def _find_trt_header_dir(repository_ctx, trt_install_path):
+  """Returns the path to the directory containing headers of TensorRT.
+
+  Args:
+    repository_ctx: The repository context.
+    trt_install_path: The TensorRT library install directory.
+
+  Returns:
+    The path of the directory containing the TensorRT header.
+  """
+  if trt_install_path == "/usr/lib/x86_64-linux-gnu":
+    path = "/usr/include/x86_64-linux-gnu"
+    if _headers_exist(repository_ctx, path):
+      return path
+  path = str(repository_ctx.path("%s/../include" % trt_install_path).realpath)
+  if _headers_exist(repository_ctx, path):
+    return path
+  auto_configure_fail(
+      "Cannot find NvInfer.h with TensorRT install path %s" % trt_install_path)
+
+
+def _trt_lib_version(repository_ctx, trt_install_path):
+  """Detects the library (e.g. libnvinfer) version of TensorRT.
+
+  Args:
+    repository_ctx: The repository context.
+    trt_install_path: The TensorRT library install directory.
+
+  Returns:
+    A string containing the library version of TensorRT.
+  """
+  trt_header_dir = _find_trt_header_dir(repository_ctx, trt_install_path)
+  major_version = find_cuda_define(repository_ctx, trt_header_dir, "NvInfer.h",
+                                   _DEFINE_TENSORRT_SONAME_MAJOR)
+  minor_version = find_cuda_define(repository_ctx, trt_header_dir, "NvInfer.h",
+                                   _DEFINE_TENSORRT_SONAME_MINOR)
+  patch_version = find_cuda_define(repository_ctx, trt_header_dir, "NvInfer.h",
+                                   _DEFINE_TENSORRT_SONAME_PATCH)
+  full_version = "%s.%s.%s" % (major_version, minor_version, patch_version)
+  environ_version = repository_ctx.os.environ[_TF_TENSORRT_VERSION].strip()
+  if not matches_version(environ_version, full_version):
+    auto_configure_fail(
+        ("TensorRT library version detected from %s/%s (%s) does not match " +
+         "TF_TENSORRT_VERSION (%s). To fix this rerun configure again.") %
+        (trt_header_dir, "NvInfer.h", full_version, environ_version))
+  return environ_version
+
+
+def _find_trt_libs(repository_ctx, trt_install_path, trt_lib_version):
+  """Finds the given TensorRT library on the system.
+
+  Adapted from code contributed by Sami Kama (https://github.com/samikama).
+
+  Args:
+    repository_ctx: The repository context.
+    trt_install_path: The TensorRT library installation directory.
+    trt_lib_version: The version of TensorRT library files as returned
+      by _trt_lib_version.
+
+  Returns:
+    Map of library names to structs with the following fields:
+      src_file_path: The full path to the library found on the system.
+      dst_file_name: The basename of the target library.
+  """
+  objdump = repository_ctx.which("objdump")
+  result = {}
+  for lib in _TF_TENSORRT_LIBS:
+    dst_file_name = "lib%s.so.%s" % (lib, trt_lib_version)
+    src_file_path = repository_ctx.path("%s/%s" % (trt_install_path,
+                                                   dst_file_name))
+    if not src_file_path.exists:
+      auto_configure_fail(
+          "Cannot find TensorRT library %s" % str(src_file_path))
+    if objdump != None:
+      objdump_out = repository_ctx.execute([objdump, "-p", str(src_file_path)])
+      for line in objdump_out.stdout.splitlines():
+        if "SONAME" in line:
+          dst_file_name = line.strip().split(" ")[-1]
+    result.update({
+        lib:
+            struct(
+                dst_file_name=dst_file_name,
+                src_file_path=str(src_file_path.realpath))
+    })
+  return result
+
+
+def _tpl(repository_ctx, tpl, substitutions):
+  repository_ctx.template(tpl, Label("//third_party/tensorrt:%s.tpl" % tpl),
+                          substitutions)
+
+
+def _create_dummy_repository(repository_ctx):
+  """Create a dummy TensorRT repository."""
+  _tpl(repository_ctx, "build_defs.bzl", {"%{tensorrt_is_configured}": "False"})
+  substitutions = {
+      "%{tensorrt_genrules}": "",
+      "%{tensorrt_headers}": "",
+  }
+  for lib in _TF_TENSORRT_LIBS:
+    k = "%%{%s}" % lib.replace("nv", "nv_")
+    substitutions.update({k: ""})
+  _tpl(repository_ctx, "BUILD", substitutions)
+
+
+def _tensorrt_configure_impl(repository_ctx):
+  """Implementation of the tensorrt_configure repository rule."""
+  if _TENSORRT_INSTALL_PATH not in repository_ctx.os.environ:
+    _create_dummy_repository(repository_ctx)
+    return
+
+  if (get_cpu_value(repository_ctx) != "Linux"):
+    auto_configure_fail("TensorRT is supported only on Linux.")
+  if _TF_TENSORRT_VERSION not in repository_ctx.os.environ:
+    auto_configure_fail("TensorRT library (libnvinfer) version is not set.")
+  trt_install_path = repository_ctx.os.environ[_TENSORRT_INSTALL_PATH].strip()
+  if not repository_ctx.path(trt_install_path).exists:
+    auto_configure_fail(
+        "Cannot find TensorRT install path %s." % trt_install_path)
+
+  # Set up the symbolic links for the library files.
+  trt_lib_version = _trt_lib_version(repository_ctx, trt_install_path)
+  trt_libs = _find_trt_libs(repository_ctx, trt_install_path, trt_lib_version)
+  trt_lib_src = []
+  trt_lib_dest = []
+  for lib in trt_libs.values():
+    trt_lib_src.append(lib.src_file_path)
+    trt_lib_dest.append(lib.dst_file_name)
+  genrules = [
+      symlink_genrule_for_dir(repository_ctx, None, "tensorrt/lib/",
+                              "tensorrt_lib", trt_lib_src, trt_lib_dest)
+  ]
+
+  # Set up the symbolic links for the header files.
+  trt_header_dir = _find_trt_header_dir(repository_ctx, trt_install_path)
+  src_files = [
+      "%s/%s" % (trt_header_dir, header) for header in _TF_TENSORRT_HEADERS
+  ]
+  dest_files = _TF_TENSORRT_HEADERS
+  genrules.append(
+      symlink_genrule_for_dir(repository_ctx, None, "tensorrt/include/",
+                              "tensorrt_include", src_files, dest_files))
+
+  # Set up config file.
+  _tpl(repository_ctx, "build_defs.bzl", {"%{tensorrt_is_configured}": "True"})
+
+  # Set up BUILD file.
+  substitutions = {
+      "%{tensorrt_genrules}": "\n".join(genrules),
+      "%{tensorrt_headers}": '":tensorrt_include"',
+  }
+  for lib in _TF_TENSORRT_LIBS:
+    k = "%%{%s}" % lib.replace("nv", "nv_")
+    v = '"tensorrt/lib/%s"' % trt_libs[lib].dst_file_name
+    substitutions.update({k: v})
+  _tpl(repository_ctx, "BUILD", substitutions)
+
+
+tensorrt_configure = repository_rule(
+    implementation=_tensorrt_configure_impl,
+    environ=[
+        _TENSORRT_INSTALL_PATH,
+        _TF_TENSORRT_VERSION,
+    ],
+)
+"""Detects and configures the local CUDA toolchain.
+
+Add the following to your WORKSPACE FILE:
+
+```python
+tensorrt_configure(name = "local_config_tensorrt")
+```
+
+Args:
+  name: A unique name for this workspace rule.
+"""
diff --git a/third_party/termcolor.BUILD b/third_party/termcolor.BUILD
index 6000e3289deff8183193883a9b796da9384365b8..655d7cb85e584027d12014c53718a15e2522b4ae 100644
--- a/third_party/termcolor.BUILD
+++ b/third_party/termcolor.BUILD
@@ -3,7 +3,7 @@
 
 licenses(["notice"])  # MIT
 
-exports_files(["LICENSE"])
+exports_files(["COPYING.txt"])
 
 py_library(
     name = "termcolor",